From e046eb0c9bf26d94be9e4592c00c7a78b0fa9bfd Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 10 Aug 2017 12:56:18 +0100
Subject: [PATCH 001/104] arm64: mm: Use non-global mappings for kernel space

In preparation for unmapping the kernel whilst running in userspace,
make the kernel mappings non-global so we can avoid expensive TLB
invalidation on kernel exit to userspace.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/kernel-pgtable.h | 12 ++++++++++--
 arch/arm64/include/asm/pgtable-prot.h   | 21 +++++++++++++++------
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index 7803343e5881..77a27af01371 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -78,8 +78,16 @@
 /*
  * Initial memory map attributes.
  */
-#define SWAPPER_PTE_FLAGS	(PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
-#define SWAPPER_PMD_FLAGS	(PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
+#define _SWAPPER_PTE_FLAGS	(PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
+#define _SWAPPER_PMD_FLAGS	(PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
+
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+#define SWAPPER_PTE_FLAGS	(_SWAPPER_PTE_FLAGS | PTE_NG)
+#define SWAPPER_PMD_FLAGS	(_SWAPPER_PMD_FLAGS | PMD_SECT_NG)
+#else
+#define SWAPPER_PTE_FLAGS	_SWAPPER_PTE_FLAGS
+#define SWAPPER_PMD_FLAGS	_SWAPPER_PMD_FLAGS
+#endif
 
 #if ARM64_SWAPPER_USES_SECTION_MAPS
 #define SWAPPER_MM_MMUFLAGS	(PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS)
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 0a5635fb0ef9..22a926825e3f 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -34,8 +34,16 @@
 
 #include <asm/pgtable-types.h>
 
-#define PROT_DEFAULT		(PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
-#define PROT_SECT_DEFAULT	(PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
+#define _PROT_DEFAULT		(PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
+#define _PROT_SECT_DEFAULT	(PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
+
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+#define PROT_DEFAULT		(_PROT_DEFAULT | PTE_NG)
+#define PROT_SECT_DEFAULT	(_PROT_SECT_DEFAULT | PMD_SECT_NG)
+#else
+#define PROT_DEFAULT		_PROT_DEFAULT
+#define PROT_SECT_DEFAULT	_PROT_SECT_DEFAULT
+#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
 
 #define PROT_DEVICE_nGnRnE	(PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE))
 #define PROT_DEVICE_nGnRE	(PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE))
@@ -48,6 +56,7 @@
 #define PROT_SECT_NORMAL_EXEC	(PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL))
 
 #define _PAGE_DEFAULT		(PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))
+#define _HYP_PAGE_DEFAULT	(_PAGE_DEFAULT & ~PTE_NG)
 
 #define PAGE_KERNEL		__pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE)
 #define PAGE_KERNEL_RO		__pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_RDONLY)
@@ -55,15 +64,15 @@
 #define PAGE_KERNEL_EXEC	__pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE)
 #define PAGE_KERNEL_EXEC_CONT	__pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT)
 
-#define PAGE_HYP		__pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN)
-#define PAGE_HYP_EXEC		__pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY)
-#define PAGE_HYP_RO		__pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
+#define PAGE_HYP		__pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN)
+#define PAGE_HYP_EXEC		__pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY)
+#define PAGE_HYP_RO		__pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
 #define PAGE_HYP_DEVICE		__pgprot(PROT_DEVICE_nGnRE | PTE_HYP)
 
 #define PAGE_S2			__pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY)
 #define PAGE_S2_DEVICE		__pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_UXN)
 
-#define PAGE_NONE		__pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_PXN | PTE_UXN)
+#define PAGE_NONE		__pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
 #define PAGE_SHARED		__pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
 #define PAGE_SHARED_EXEC	__pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE)
 #define PAGE_READONLY		__pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)

From 376133b7edc20f237a42e4c72415cc9e8c0a9704 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 10 Aug 2017 13:04:48 +0100
Subject: [PATCH 002/104] arm64: mm: Temporarily disable ARM64_SW_TTBR0_PAN

We're about to rework the way ASIDs are allocated, switch_mm is
implemented and low-level kernel entry/exit is handled, so keep the
ARM64_SW_TTBR0_PAN code out of the way whilst we do the heavy lifting.

It will be re-enabled in a subsequent patch.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a93339f5178f..7e7d7fd152c4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -910,6 +910,7 @@ endif
 
 config ARM64_SW_TTBR0_PAN
 	bool "Emulate Privileged Access Never using TTBR0_EL1 switching"
+	depends on BROKEN       # Temporary while switch_mm is reworked
 	help
 	  Enabling this option prevents the kernel from accessing
 	  user-space memory directly by pointing TTBR0_EL1 to a reserved

From 7655abb953860485940d4de74fb45a8192149bb6 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 10 Aug 2017 13:19:09 +0100
Subject: [PATCH 003/104] arm64: mm: Move ASID from TTBR0 to TTBR1

In preparation for mapping kernelspace and userspace with different
ASIDs, move the ASID to TTBR1 and update switch_mm to context-switch
TTBR0 via an invalid mapping (the zero page).

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/mmu_context.h   | 7 +++++++
 arch/arm64/include/asm/pgtable-hwdef.h | 1 +
 arch/arm64/include/asm/proc-fns.h      | 6 ------
 arch/arm64/mm/proc.S                   | 9 ++++++---
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 9d155fa9a507..aa39c126c0d0 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -57,6 +57,13 @@ static inline void cpu_set_reserved_ttbr0(void)
 	isb();
 }
 
+static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+	BUG_ON(pgd == swapper_pg_dir);
+	cpu_set_reserved_ttbr0();
+	cpu_do_switch_mm(virt_to_phys(pgd),mm);
+}
+
 /*
  * TCR.T0SZ value to use when the ID map is active. Usually equals
  * TCR_T0SZ(VA_BITS), unless system RAM is positioned very high in
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index eb0c2bd90de9..8df4cb6ac6f7 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -272,6 +272,7 @@
 #define TCR_TG1_4K		(UL(2) << TCR_TG1_SHIFT)
 #define TCR_TG1_64K		(UL(3) << TCR_TG1_SHIFT)
 
+#define TCR_A1			(UL(1) << 22)
 #define TCR_ASID16		(UL(1) << 36)
 #define TCR_TBI0		(UL(1) << 37)
 #define TCR_HA			(UL(1) << 39)
diff --git a/arch/arm64/include/asm/proc-fns.h b/arch/arm64/include/asm/proc-fns.h
index 14ad6e4e87d1..16cef2e8449e 100644
--- a/arch/arm64/include/asm/proc-fns.h
+++ b/arch/arm64/include/asm/proc-fns.h
@@ -35,12 +35,6 @@ extern u64 cpu_do_resume(phys_addr_t ptr, u64 idmap_ttbr);
 
 #include <asm/memory.h>
 
-#define cpu_switch_mm(pgd,mm)				\
-do {							\
-	BUG_ON(pgd == swapper_pg_dir);			\
-	cpu_do_switch_mm(virt_to_phys(pgd),mm);		\
-} while (0)
-
 #endif /* __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* __ASM_PROCFNS_H */
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 95233dfc4c39..a8a64898a2aa 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -139,9 +139,12 @@ ENDPROC(cpu_do_resume)
  */
 ENTRY(cpu_do_switch_mm)
 	pre_ttbr0_update_workaround x0, x2, x3
+	mrs	x2, ttbr1_el1
 	mmid	x1, x1				// get mm->context.id
-	bfi	x0, x1, #48, #16		// set the ASID
-	msr	ttbr0_el1, x0			// set TTBR0
+	bfi	x2, x1, #48, #16		// set the ASID
+	msr	ttbr1_el1, x2			// in TTBR1 (since TCR.A1 is set)
+	isb
+	msr	ttbr0_el1, x0			// now update TTBR0
 	isb
 	post_ttbr0_update_workaround
 	ret
@@ -224,7 +227,7 @@ ENTRY(__cpu_setup)
 	 * both user and kernel.
 	 */
 	ldr	x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
-			TCR_TG_FLAGS | TCR_ASID16 | TCR_TBI0
+			TCR_TG_FLAGS | TCR_ASID16 | TCR_TBI0 | TCR_A1
 	tcr_set_idmap_t0sz	x10, x9
 
 	/*

From 85d13c001497b481b4a8cc5d7db243cc44ac2bd8 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 10 Aug 2017 13:29:06 +0100
Subject: [PATCH 004/104] arm64: mm: Remove pre_ttbr0_update_workaround for
 Falkor erratum #E1003

The pre_ttbr0_update_workaround hook is called prior to context-switching
TTBR0 because Falkor erratum E1003 can cause TLB allocation with the wrong
ASID if both the ASID and the base address of the TTBR are updated at
the same time.

With the ASID sitting safely in TTBR1, we no longer update things
atomically, so we can remove the pre_ttbr0_update_workaround macro as
it's no longer required. The erratum infrastructure and documentation
is left around for #E1003, as it will be required by the entry
trampoline code in a future patch.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/assembler.h   | 22 ----------------------
 arch/arm64/include/asm/mmu_context.h |  2 --
 arch/arm64/mm/context.c              | 11 -----------
 arch/arm64/mm/proc.S                 |  1 -
 4 files changed, 36 deletions(-)

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index aef72d886677..e1fa5db858b7 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -26,7 +26,6 @@
 #include <asm/asm-offsets.h>
 #include <asm/cpufeature.h>
 #include <asm/debug-monitors.h>
-#include <asm/mmu_context.h>
 #include <asm/page.h>
 #include <asm/pgtable-hwdef.h>
 #include <asm/ptrace.h>
@@ -478,27 +477,6 @@ alternative_endif
 	.endm
 
 /*
- * Errata workaround prior to TTBR0_EL1 update
- *
- * 	val:	TTBR value with new BADDR, preserved
- * 	tmp0:	temporary register, clobbered
- * 	tmp1:	other temporary register, clobbered
- */
-	.macro	pre_ttbr0_update_workaround, val, tmp0, tmp1
-#ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003
-alternative_if ARM64_WORKAROUND_QCOM_FALKOR_E1003
-	mrs	\tmp0, ttbr0_el1
-	mov	\tmp1, #FALKOR_RESERVED_ASID
-	bfi	\tmp0, \tmp1, #48, #16		// reserved ASID + old BADDR
-	msr	ttbr0_el1, \tmp0
-	isb
-	bfi	\tmp0, \val, #0, #48		// reserved ASID + new BADDR
-	msr	ttbr0_el1, \tmp0
-	isb
-alternative_else_nop_endif
-#endif
-	.endm
-
 /*
  * Errata workaround post TTBR0_EL1 update.
  */
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index aa39c126c0d0..da29766a181c 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -19,8 +19,6 @@
 #ifndef __ASM_MMU_CONTEXT_H
 #define __ASM_MMU_CONTEXT_H
 
-#define FALKOR_RESERVED_ASID	1
-
 #ifndef __ASSEMBLY__
 
 #include <linux/compiler.h>
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 6f4017046323..78a2dc596fee 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -79,13 +79,6 @@ void verify_cpu_asid_bits(void)
 	}
 }
 
-static void set_reserved_asid_bits(void)
-{
-	if (IS_ENABLED(CONFIG_QCOM_FALKOR_ERRATUM_1003) &&
-	    cpus_have_const_cap(ARM64_WORKAROUND_QCOM_FALKOR_E1003))
-		__set_bit(FALKOR_RESERVED_ASID, asid_map);
-}
-
 static void flush_context(unsigned int cpu)
 {
 	int i;
@@ -94,8 +87,6 @@ static void flush_context(unsigned int cpu)
 	/* Update the list of reserved ASIDs and the ASID bitmap. */
 	bitmap_clear(asid_map, 0, NUM_USER_ASIDS);
 
-	set_reserved_asid_bits();
-
 	for_each_possible_cpu(i) {
 		asid = atomic64_xchg_relaxed(&per_cpu(active_asids, i), 0);
 		/*
@@ -254,8 +245,6 @@ static int asids_init(void)
 		panic("Failed to allocate bitmap for %lu ASIDs\n",
 		      NUM_USER_ASIDS);
 
-	set_reserved_asid_bits();
-
 	pr_info("ASID allocator initialised with %lu entries\n", NUM_USER_ASIDS);
 	return 0;
 }
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index a8a64898a2aa..f2ff0837577c 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -138,7 +138,6 @@ ENDPROC(cpu_do_resume)
  *	- pgd_phys - physical address of new TTB
  */
 ENTRY(cpu_do_switch_mm)
-	pre_ttbr0_update_workaround x0, x2, x3
 	mrs	x2, ttbr1_el1
 	mmid	x1, x1				// get mm->context.id
 	bfi	x2, x1, #48, #16		// set the ASID

From 158d495899ce55db453f682a8ac8390d5a426578 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 10 Aug 2017 13:34:30 +0100
Subject: [PATCH 005/104] arm64: mm: Rename post_ttbr0_update_workaround

The post_ttbr0_update_workaround hook applies to any change to TTBRx_EL1.
Since we're using TTBR1 for the ASID, rename the hook to make it clearer
as to what it's doing.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/assembler.h | 5 ++---
 arch/arm64/kernel/entry.S          | 2 +-
 arch/arm64/mm/proc.S               | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index e1fa5db858b7..c45bc94f15d0 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -477,10 +477,9 @@ alternative_endif
 	.endm
 
 /*
-/*
- * Errata workaround post TTBR0_EL1 update.
+ * Errata workaround post TTBRx_EL1 update.
  */
-	.macro	post_ttbr0_update_workaround
+	.macro	post_ttbr_update_workaround
 #ifdef CONFIG_CAVIUM_ERRATUM_27456
 alternative_if ARM64_WORKAROUND_CAVIUM_27456
 	ic	iallu
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 6d14b8f29b5f..804e43c9cb0b 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -257,7 +257,7 @@ alternative_else_nop_endif
 	 * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache
 	 * corruption).
 	 */
-	post_ttbr0_update_workaround
+	post_ttbr_update_workaround
 	.endif
 1:
 	.if	\el != 0
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index f2ff0837577c..3146dc96f05b 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -145,7 +145,7 @@ ENTRY(cpu_do_switch_mm)
 	isb
 	msr	ttbr0_el1, x0			// now update TTBR0
 	isb
-	post_ttbr0_update_workaround
+	post_ttbr_update_workaround
 	ret
 ENDPROC(cpu_do_switch_mm)
 

From 27a921e75711d924617269e0ba4adb8bae9fd0d1 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 10 Aug 2017 13:58:16 +0100
Subject: [PATCH 006/104] arm64: mm: Fix and re-enable ARM64_SW_TTBR0_PAN

With the ASID now installed in TTBR1, we can re-enable ARM64_SW_TTBR0_PAN
by ensuring that we switch to a reserved ASID of zero when disabling
user access and restore the active user ASID on the uaccess enable path.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/Kconfig                   |  1 -
 arch/arm64/include/asm/asm-uaccess.h | 25 +++++++++++++++++--------
 arch/arm64/include/asm/uaccess.h     | 21 +++++++++++++++++----
 arch/arm64/kernel/entry.S            |  4 ++--
 arch/arm64/lib/clear_user.S          |  2 +-
 arch/arm64/lib/copy_from_user.S      |  2 +-
 arch/arm64/lib/copy_in_user.S        |  2 +-
 arch/arm64/lib/copy_to_user.S        |  2 +-
 arch/arm64/mm/cache.S                |  2 +-
 arch/arm64/xen/hypercall.S           |  2 +-
 10 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7e7d7fd152c4..a93339f5178f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -910,7 +910,6 @@ endif
 
 config ARM64_SW_TTBR0_PAN
 	bool "Emulate Privileged Access Never using TTBR0_EL1 switching"
-	depends on BROKEN       # Temporary while switch_mm is reworked
 	help
 	  Enabling this option prevents the kernel from accessing
 	  user-space memory directly by pointing TTBR0_EL1 to a reserved
diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index b3da6c886835..21b8cf304028 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -16,11 +16,20 @@
 	add	\tmp1, \tmp1, #SWAPPER_DIR_SIZE	// reserved_ttbr0 at the end of swapper_pg_dir
 	msr	ttbr0_el1, \tmp1		// set reserved TTBR0_EL1
 	isb
+	sub	\tmp1, \tmp1, #SWAPPER_DIR_SIZE
+	bic	\tmp1, \tmp1, #(0xffff << 48)
+	msr	ttbr1_el1, \tmp1		// set reserved ASID
+	isb
 	.endm
 
-	.macro	__uaccess_ttbr0_enable, tmp1
+	.macro	__uaccess_ttbr0_enable, tmp1, tmp2
 	get_thread_info \tmp1
 	ldr	\tmp1, [\tmp1, #TSK_TI_TTBR0]	// load saved TTBR0_EL1
+	mrs	\tmp2, ttbr1_el1
+	extr    \tmp2, \tmp2, \tmp1, #48
+	ror     \tmp2, \tmp2, #16
+	msr	ttbr1_el1, \tmp2		// set the active ASID
+	isb
 	msr	ttbr0_el1, \tmp1		// set the non-PAN TTBR0_EL1
 	isb
 	.endm
@@ -31,18 +40,18 @@ alternative_if_not ARM64_HAS_PAN
 alternative_else_nop_endif
 	.endm
 
-	.macro	uaccess_ttbr0_enable, tmp1, tmp2
+	.macro	uaccess_ttbr0_enable, tmp1, tmp2, tmp3
 alternative_if_not ARM64_HAS_PAN
-	save_and_disable_irq \tmp2		// avoid preemption
-	__uaccess_ttbr0_enable \tmp1
-	restore_irq \tmp2
+	save_and_disable_irq \tmp3		// avoid preemption
+	__uaccess_ttbr0_enable \tmp1, \tmp2
+	restore_irq \tmp3
 alternative_else_nop_endif
 	.endm
 #else
 	.macro	uaccess_ttbr0_disable, tmp1
 	.endm
 
-	.macro	uaccess_ttbr0_enable, tmp1, tmp2
+	.macro	uaccess_ttbr0_enable, tmp1, tmp2, tmp3
 	.endm
 #endif
 
@@ -56,8 +65,8 @@ alternative_if ARM64_ALT_PAN_NOT_UAO
 alternative_else_nop_endif
 	.endm
 
-	.macro	uaccess_enable_not_uao, tmp1, tmp2
-	uaccess_ttbr0_enable \tmp1, \tmp2
+	.macro	uaccess_enable_not_uao, tmp1, tmp2, tmp3
+	uaccess_ttbr0_enable \tmp1, \tmp2, \tmp3
 alternative_if ARM64_ALT_PAN_NOT_UAO
 	SET_PSTATE_PAN(0)
 alternative_else_nop_endif
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index fc0f9eb66039..750a3b76a01c 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -107,15 +107,19 @@ static inline void __uaccess_ttbr0_disable(void)
 {
 	unsigned long ttbr;
 
+	ttbr = read_sysreg(ttbr1_el1);
 	/* reserved_ttbr0 placed at the end of swapper_pg_dir */
-	ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE;
-	write_sysreg(ttbr, ttbr0_el1);
+	write_sysreg(ttbr + SWAPPER_DIR_SIZE, ttbr0_el1);
+	isb();
+	/* Set reserved ASID */
+	ttbr &= ~(0xffffUL << 48);
+	write_sysreg(ttbr, ttbr1_el1);
 	isb();
 }
 
 static inline void __uaccess_ttbr0_enable(void)
 {
-	unsigned long flags;
+	unsigned long flags, ttbr0, ttbr1;
 
 	/*
 	 * Disable interrupts to avoid preemption between reading the 'ttbr0'
@@ -123,7 +127,16 @@ static inline void __uaccess_ttbr0_enable(void)
 	 * roll-over and an update of 'ttbr0'.
 	 */
 	local_irq_save(flags);
-	write_sysreg(current_thread_info()->ttbr0, ttbr0_el1);
+	ttbr0 = current_thread_info()->ttbr0;
+
+	/* Restore active ASID */
+	ttbr1 = read_sysreg(ttbr1_el1);
+	ttbr1 |= ttbr0 & (0xffffUL << 48);
+	write_sysreg(ttbr1, ttbr1_el1);
+	isb();
+
+	/* Restore user page table */
+	write_sysreg(ttbr0, ttbr0_el1);
 	isb();
 	local_irq_restore(flags);
 }
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 804e43c9cb0b..d454d8ed45e4 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -184,7 +184,7 @@ alternative_if ARM64_HAS_PAN
 alternative_else_nop_endif
 
 	.if	\el != 0
-	mrs	x21, ttbr0_el1
+	mrs	x21, ttbr1_el1
 	tst	x21, #0xffff << 48		// Check for the reserved ASID
 	orr	x23, x23, #PSR_PAN_BIT		// Set the emulated PAN in the saved SPSR
 	b.eq	1f				// TTBR0 access already disabled
@@ -248,7 +248,7 @@ alternative_else_nop_endif
 	tbnz	x22, #22, 1f			// Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set
 	.endif
 
-	__uaccess_ttbr0_enable x0
+	__uaccess_ttbr0_enable x0, x1
 
 	.if	\el == 0
 	/*
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index e88fb99c1561..8f9c4641e706 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -30,7 +30,7 @@
  * Alignment fixed up by hardware.
  */
 ENTRY(__clear_user)
-	uaccess_enable_not_uao x2, x3
+	uaccess_enable_not_uao x2, x3, x4
 	mov	x2, x1			// save the size for fixup return
 	subs	x1, x1, #8
 	b.mi	2f
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 4b5d826895ff..69d86a80f3e2 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -64,7 +64,7 @@
 
 end	.req	x5
 ENTRY(__arch_copy_from_user)
-	uaccess_enable_not_uao x3, x4
+	uaccess_enable_not_uao x3, x4, x5
 	add	end, x0, x2
 #include "copy_template.S"
 	uaccess_disable_not_uao x3
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
index b24a830419ad..e442b531252a 100644
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -65,7 +65,7 @@
 
 end	.req	x5
 ENTRY(raw_copy_in_user)
-	uaccess_enable_not_uao x3, x4
+	uaccess_enable_not_uao x3, x4, x5
 	add	end, x0, x2
 #include "copy_template.S"
 	uaccess_disable_not_uao x3
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 351f0766f7a6..318f15d5c336 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -63,7 +63,7 @@
 
 end	.req	x5
 ENTRY(__arch_copy_to_user)
-	uaccess_enable_not_uao x3, x4
+	uaccess_enable_not_uao x3, x4, x5
 	add	end, x0, x2
 #include "copy_template.S"
 	uaccess_disable_not_uao x3
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 7f1dbe962cf5..6cd20a8c0952 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -49,7 +49,7 @@ ENTRY(flush_icache_range)
  *	- end     - virtual end address of region
  */
 ENTRY(__flush_cache_user_range)
-	uaccess_ttbr0_enable x2, x3
+	uaccess_ttbr0_enable x2, x3, x4
 	dcache_line_size x2, x3
 	sub	x3, x2, #1
 	bic	x4, x0, x3
diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S
index 401ceb71540c..acdbd2c9e899 100644
--- a/arch/arm64/xen/hypercall.S
+++ b/arch/arm64/xen/hypercall.S
@@ -101,7 +101,7 @@ ENTRY(privcmd_call)
 	 * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation
 	 * is enabled (it implies that hardware UAO and PAN disabled).
 	 */
-	uaccess_ttbr0_enable x6, x7
+	uaccess_ttbr0_enable x6, x7, x8
 	hvc XEN_IMM
 
 	/*

From 0c8ea531b7740754cf374ca8b7510655f569c5e3 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 10 Aug 2017 14:10:28 +0100
Subject: [PATCH 007/104] arm64: mm: Allocate ASIDs in pairs

In preparation for separate kernel/user ASIDs, allocate them in pairs
for each mm_struct. The bottom bit distinguishes the two: if it is set,
then the ASID will map only userspace.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/mmu.h |  1 +
 arch/arm64/mm/context.c      | 25 +++++++++++++++++--------
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 0d34bf0a89c7..01bfb184f2a8 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -17,6 +17,7 @@
 #define __ASM_MMU_H
 
 #define MMCF_AARCH32	0x1	/* mm context flag for AArch32 executables */
+#define USER_ASID_FLAG	(UL(1) << 48)
 
 typedef struct {
 	atomic64_t	id;
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 78a2dc596fee..1cb3bc92ae5c 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -39,7 +39,16 @@ static cpumask_t tlb_flush_pending;
 
 #define ASID_MASK		(~GENMASK(asid_bits - 1, 0))
 #define ASID_FIRST_VERSION	(1UL << asid_bits)
-#define NUM_USER_ASIDS		ASID_FIRST_VERSION
+
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+#define NUM_USER_ASIDS		(ASID_FIRST_VERSION >> 1)
+#define asid2idx(asid)		(((asid) & ~ASID_MASK) >> 1)
+#define idx2asid(idx)		(((idx) << 1) & ~ASID_MASK)
+#else
+#define NUM_USER_ASIDS		(ASID_FIRST_VERSION)
+#define asid2idx(asid)		((asid) & ~ASID_MASK)
+#define idx2asid(idx)		asid2idx(idx)
+#endif
 
 /* Get the ASIDBits supported by the current CPU */
 static u32 get_cpu_asid_bits(void)
@@ -98,7 +107,7 @@ static void flush_context(unsigned int cpu)
 		 */
 		if (asid == 0)
 			asid = per_cpu(reserved_asids, i);
-		__set_bit(asid & ~ASID_MASK, asid_map);
+		__set_bit(asid2idx(asid), asid_map);
 		per_cpu(reserved_asids, i) = asid;
 	}
 
@@ -153,16 +162,16 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu)
 		 * We had a valid ASID in a previous life, so try to re-use
 		 * it if possible.
 		 */
-		asid &= ~ASID_MASK;
-		if (!__test_and_set_bit(asid, asid_map))
+		if (!__test_and_set_bit(asid2idx(asid), asid_map))
 			return newasid;
 	}
 
 	/*
 	 * Allocate a free ASID. If we can't find one, take a note of the
-	 * currently active ASIDs and mark the TLBs as requiring flushes.
-	 * We always count from ASID #1, as we use ASID #0 when setting a
-	 * reserved TTBR0 for the init_mm.
+	 * currently active ASIDs and mark the TLBs as requiring flushes.  We
+	 * always count from ASID #2 (index 1), as we use ASID #0 when setting
+	 * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd
+	 * pairs.
 	 */
 	asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, cur_idx);
 	if (asid != NUM_USER_ASIDS)
@@ -179,7 +188,7 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu)
 set_asid:
 	__set_bit(asid, asid_map);
 	cur_idx = asid;
-	return asid | generation;
+	return idx2asid(asid) | generation;
 }
 
 void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)

From fc0e1299da548b32440051f58f08e0c1eb7edd0b Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 14 Nov 2017 13:58:08 +0000
Subject: [PATCH 008/104] arm64: mm: Add arm64_kernel_unmapped_at_el0 helper

In order for code such as TLB invalidation to operate efficiently when
the decision to map the kernel at EL0 is determined at runtime, this
patch introduces a helper function, arm64_kernel_unmapped_at_el0, to
determine whether or not the kernel is mapped whilst running in userspace.

Currently, this just reports the value of CONFIG_UNMAP_KERNEL_AT_EL0,
but will later be hooked up to a fake CPU capability using a static key.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/mmu.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 01bfb184f2a8..c07954638658 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -19,6 +19,8 @@
 #define MMCF_AARCH32	0x1	/* mm context flag for AArch32 executables */
 #define USER_ASID_FLAG	(UL(1) << 48)
 
+#ifndef __ASSEMBLY__
+
 typedef struct {
 	atomic64_t	id;
 	void		*vdso;
@@ -32,6 +34,11 @@ typedef struct {
  */
 #define ASID(mm)	((mm)->context.id.counter & 0xffff)
 
+static inline bool arm64_kernel_unmapped_at_el0(void)
+{
+	return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0);
+}
+
 extern void paging_init(void);
 extern void bootmem_init(void);
 extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
@@ -42,4 +49,5 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 extern void *fixmap_remap_fdt(phys_addr_t dt_phys);
 extern void mark_linear_text_alias_ro(void);
 
+#endif	/* !__ASSEMBLY__ */
 #endif

From 9b0de864b5bc298ea53005ad812f3386f81aee9c Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 10 Aug 2017 14:13:33 +0100
Subject: [PATCH 009/104] arm64: mm: Invalidate both kernel and user ASIDs when
 performing TLBI

Since an mm has both a kernel and a user ASID, we need to ensure that
broadcast TLB maintenance targets both address spaces so that things
like CoW continue to work with the uaccess primitives in the kernel.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/tlbflush.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index af1c76981911..9e82dd79c7db 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -23,6 +23,7 @@
 
 #include <linux/sched.h>
 #include <asm/cputype.h>
+#include <asm/mmu.h>
 
 /*
  * Raw TLBI operations.
@@ -54,6 +55,11 @@
 
 #define __tlbi(op, ...)		__TLBI_N(op, ##__VA_ARGS__, 1, 0)
 
+#define __tlbi_user(op, arg) do {						\
+	if (arm64_kernel_unmapped_at_el0())					\
+		__tlbi(op, (arg) | USER_ASID_FLAG);				\
+} while (0)
+
 /*
  *	TLB Management
  *	==============
@@ -115,6 +121,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
 
 	dsb(ishst);
 	__tlbi(aside1is, asid);
+	__tlbi_user(aside1is, asid);
 	dsb(ish);
 }
 
@@ -125,6 +132,7 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
 
 	dsb(ishst);
 	__tlbi(vale1is, addr);
+	__tlbi_user(vale1is, addr);
 	dsb(ish);
 }
 
@@ -151,10 +159,13 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
 
 	dsb(ishst);
 	for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) {
-		if (last_level)
+		if (last_level) {
 			__tlbi(vale1is, addr);
-		else
+			__tlbi_user(vale1is, addr);
+		} else {
 			__tlbi(vae1is, addr);
+			__tlbi_user(vae1is, addr);
+		}
 	}
 	dsb(ish);
 }
@@ -194,6 +205,7 @@ static inline void __flush_tlb_pgtable(struct mm_struct *mm,
 	unsigned long addr = uaddr >> 12 | (ASID(mm) << 48);
 
 	__tlbi(vae1is, addr);
+	__tlbi_user(vae1is, addr);
 	dsb(ish);
 }
 

From c7b9adaf85f818d747eeff5145eb4095ccd587fb Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 14 Nov 2017 14:07:40 +0000
Subject: [PATCH 010/104] arm64: entry: Add exception trampoline page for
 exceptions from EL0

To allow unmapping of the kernel whilst running at EL0, we need to
point the exception vectors at an entry trampoline that can map/unmap
the kernel on entry/exit respectively.

This patch adds the trampoline page, although it is not yet plugged
into the vector table and is therefore unused.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/kernel/entry.S       | 86 +++++++++++++++++++++++++++++++++
 arch/arm64/kernel/vmlinux.lds.S | 17 +++++++
 2 files changed, 103 insertions(+)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index d454d8ed45e4..716b5ef42e29 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -28,6 +28,8 @@
 #include <asm/errno.h>
 #include <asm/esr.h>
 #include <asm/irq.h>
+#include <asm/memory.h>
+#include <asm/mmu.h>
 #include <asm/processor.h>
 #include <asm/ptrace.h>
 #include <asm/thread_info.h>
@@ -943,6 +945,90 @@ __ni_sys_trace:
 
 	.popsection				// .entry.text
 
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+/*
+ * Exception vectors trampoline.
+ */
+	.pushsection ".entry.tramp.text", "ax"
+
+	.macro tramp_map_kernel, tmp
+	mrs	\tmp, ttbr1_el1
+	sub	\tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE)
+	bic	\tmp, \tmp, #USER_ASID_FLAG
+	msr	ttbr1_el1, \tmp
+	.endm
+
+	.macro tramp_unmap_kernel, tmp
+	mrs	\tmp, ttbr1_el1
+	add	\tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE)
+	orr	\tmp, \tmp, #USER_ASID_FLAG
+	msr	ttbr1_el1, \tmp
+	/*
+	 * We avoid running the post_ttbr_update_workaround here because the
+	 * user and kernel ASIDs don't have conflicting mappings, so any
+	 * "blessing" as described in:
+	 *
+	 *   http://lkml.kernel.org/r/56BB848A.6060603@caviumnetworks.com
+	 *
+	 * will not hurt correctness. Whilst this may partially defeat the
+	 * point of using split ASIDs in the first place, it avoids
+	 * the hit of invalidating the entire I-cache on every return to
+	 * userspace.
+	 */
+	.endm
+
+	.macro tramp_ventry, regsize = 64
+	.align	7
+1:
+	.if	\regsize == 64
+	msr	tpidrro_el0, x30	// Restored in kernel_ventry
+	.endif
+	tramp_map_kernel	x30
+	ldr	x30, =vectors
+	prfm	plil1strm, [x30, #(1b - tramp_vectors)]
+	msr	vbar_el1, x30
+	add	x30, x30, #(1b - tramp_vectors)
+	isb
+	br	x30
+	.endm
+
+	.macro tramp_exit, regsize = 64
+	adr	x30, tramp_vectors
+	msr	vbar_el1, x30
+	tramp_unmap_kernel	x30
+	.if	\regsize == 64
+	mrs	x30, far_el1
+	.endif
+	eret
+	.endm
+
+	.align	11
+ENTRY(tramp_vectors)
+	.space	0x400
+
+	tramp_ventry
+	tramp_ventry
+	tramp_ventry
+	tramp_ventry
+
+	tramp_ventry	32
+	tramp_ventry	32
+	tramp_ventry	32
+	tramp_ventry	32
+END(tramp_vectors)
+
+ENTRY(tramp_exit_native)
+	tramp_exit
+END(tramp_exit_native)
+
+ENTRY(tramp_exit_compat)
+	tramp_exit	32
+END(tramp_exit_compat)
+
+	.ltorg
+	.popsection				// .entry.tramp.text
+#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
+
 /*
  * Special system call wrappers.
  */
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 7da3e5c366a0..6b4260f22aab 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -57,6 +57,17 @@ jiffies = jiffies_64;
 #define HIBERNATE_TEXT
 #endif
 
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+#define TRAMP_TEXT					\
+	. = ALIGN(PAGE_SIZE);				\
+	VMLINUX_SYMBOL(__entry_tramp_text_start) = .;	\
+	*(.entry.tramp.text)				\
+	. = ALIGN(PAGE_SIZE);				\
+	VMLINUX_SYMBOL(__entry_tramp_text_end) = .;
+#else
+#define TRAMP_TEXT
+#endif
+
 /*
  * The size of the PE/COFF section that covers the kernel image, which
  * runs from stext to _edata, must be a round multiple of the PE/COFF
@@ -113,6 +124,7 @@ SECTIONS
 			HYPERVISOR_TEXT
 			IDMAP_TEXT
 			HIBERNATE_TEXT
+			TRAMP_TEXT
 			*(.fixup)
 			*(.gnu.warning)
 		. = ALIGN(16);
@@ -214,6 +226,11 @@ SECTIONS
 	. += RESERVED_TTBR0_SIZE;
 #endif
 
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+	tramp_pg_dir = .;
+	. += PAGE_SIZE;
+#endif
+
 	__pecoff_data_size = ABSOLUTE(. - __initdata_begin);
 	_end = .;
 

From 51a0048beb449682d632d0af52a515adb9f9882e Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 14 Nov 2017 14:14:17 +0000
Subject: [PATCH 011/104] arm64: mm: Map entry trampoline into trampoline and
 kernel page tables

The exception entry trampoline needs to be mapped at the same virtual
address in both the trampoline page table (which maps nothing else)
and also the kernel page table, so that we can swizzle TTBR1_EL1 on
exceptions from and return to EL0.

This patch maps the trampoline at a fixed virtual address in the fixmap
area of the kernel virtual address space, which allows the kernel proper
to be randomized with respect to the trampoline when KASLR is enabled.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/fixmap.h  |  4 ++++
 arch/arm64/include/asm/pgtable.h |  1 +
 arch/arm64/kernel/asm-offsets.c  |  6 +++++-
 arch/arm64/mm/mmu.c              | 23 +++++++++++++++++++++++
 4 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h
index 4052ec39e8db..8119b49be98d 100644
--- a/arch/arm64/include/asm/fixmap.h
+++ b/arch/arm64/include/asm/fixmap.h
@@ -58,6 +58,10 @@ enum fixed_addresses {
 	FIX_APEI_GHES_NMI,
 #endif /* CONFIG_ACPI_APEI_GHES */
 
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+	FIX_ENTRY_TRAMP_TEXT,
+#define TRAMP_VALIAS		(__fix_to_virt(FIX_ENTRY_TRAMP_TEXT))
+#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
 	__end_of_permanent_fixed_addresses,
 
 	/*
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 149d05fb9421..774003b247ad 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -680,6 +680,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 
 extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
 extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
+extern pgd_t tramp_pg_dir[PTRS_PER_PGD];
 
 /*
  * Encode and decode a swap entry:
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 71bf088f1e4b..af247d10252f 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -24,6 +24,7 @@
 #include <linux/kvm_host.h>
 #include <linux/suspend.h>
 #include <asm/cpufeature.h>
+#include <asm/fixmap.h>
 #include <asm/thread_info.h>
 #include <asm/memory.h>
 #include <asm/smp_plat.h>
@@ -148,11 +149,14 @@ int main(void)
   DEFINE(ARM_SMCCC_RES_X2_OFFS,		offsetof(struct arm_smccc_res, a2));
   DEFINE(ARM_SMCCC_QUIRK_ID_OFFS,	offsetof(struct arm_smccc_quirk, id));
   DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS,	offsetof(struct arm_smccc_quirk, state));
-
   BLANK();
   DEFINE(HIBERN_PBE_ORIG,	offsetof(struct pbe, orig_address));
   DEFINE(HIBERN_PBE_ADDR,	offsetof(struct pbe, address));
   DEFINE(HIBERN_PBE_NEXT,	offsetof(struct pbe, next));
   DEFINE(ARM64_FTR_SYSVAL,	offsetof(struct arm64_ftr_reg, sys_val));
+  BLANK();
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+  DEFINE(TRAMP_VALIAS,		TRAMP_VALIAS);
+#endif
   return 0;
 }
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 267d2b79d52d..fe68a48c64cb 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -525,6 +525,29 @@ static int __init parse_rodata(char *arg)
 }
 early_param("rodata", parse_rodata);
 
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+static int __init map_entry_trampoline(void)
+{
+	extern char __entry_tramp_text_start[];
+
+	pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
+	phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start);
+
+	/* The trampoline is always mapped and can therefore be global */
+	pgprot_val(prot) &= ~PTE_NG;
+
+	/* Map only the text into the trampoline page table */
+	memset(tramp_pg_dir, 0, PGD_SIZE);
+	__create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE,
+			     prot, pgd_pgtable_alloc, 0);
+
+	/* ...as well as the kernel page table */
+	__set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot);
+	return 0;
+}
+core_initcall(map_entry_trampoline);
+#endif
+
 /*
  * Create fine-grained mappings for the kernel.
  */

From 5b1f7fe41909cde40decad9f0e8ee585777a0538 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 14 Nov 2017 14:20:21 +0000
Subject: [PATCH 012/104] arm64: entry: Explicitly pass exception level to
 kernel_ventry macro

We will need to treat exceptions from EL0 differently in kernel_ventry,
so rework the macro to take the exception level as an argument and
construct the branch target using that.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/kernel/entry.S | 46 +++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 716b5ef42e29..b99fc928119c 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -71,7 +71,7 @@
 #define BAD_FIQ		2
 #define BAD_ERROR	3
 
-	.macro kernel_ventry	label
+	.macro kernel_ventry, el, label, regsize = 64
 	.align 7
 	sub	sp, sp, #S_FRAME_SIZE
 #ifdef CONFIG_VMAP_STACK
@@ -84,7 +84,7 @@
 	tbnz	x0, #THREAD_SHIFT, 0f
 	sub	x0, sp, x0			// x0'' = sp' - x0' = (sp + x0) - sp = x0
 	sub	sp, sp, x0			// sp'' = sp' - x0 = (sp + x0) - x0 = sp
-	b	\label
+	b	el\()\el\()_\label
 
 0:
 	/*
@@ -116,7 +116,7 @@
 	sub	sp, sp, x0
 	mrs	x0, tpidrro_el0
 #endif
-	b	\label
+	b	el\()\el\()_\label
 	.endm
 
 	.macro	kernel_entry, el, regsize = 64
@@ -369,31 +369,31 @@ tsk	.req	x28		// current thread_info
 
 	.align	11
 ENTRY(vectors)
-	kernel_ventry	el1_sync_invalid		// Synchronous EL1t
-	kernel_ventry	el1_irq_invalid			// IRQ EL1t
-	kernel_ventry	el1_fiq_invalid			// FIQ EL1t
-	kernel_ventry	el1_error_invalid		// Error EL1t
+	kernel_ventry	1, sync_invalid			// Synchronous EL1t
+	kernel_ventry	1, irq_invalid			// IRQ EL1t
+	kernel_ventry	1, fiq_invalid			// FIQ EL1t
+	kernel_ventry	1, error_invalid		// Error EL1t
 
-	kernel_ventry	el1_sync			// Synchronous EL1h
-	kernel_ventry	el1_irq				// IRQ EL1h
-	kernel_ventry	el1_fiq_invalid			// FIQ EL1h
-	kernel_ventry	el1_error			// Error EL1h
+	kernel_ventry	1, sync				// Synchronous EL1h
+	kernel_ventry	1, irq				// IRQ EL1h
+	kernel_ventry	1, fiq_invalid			// FIQ EL1h
+	kernel_ventry	1, error			// Error EL1h
 
-	kernel_ventry	el0_sync			// Synchronous 64-bit EL0
-	kernel_ventry	el0_irq				// IRQ 64-bit EL0
-	kernel_ventry	el0_fiq_invalid			// FIQ 64-bit EL0
-	kernel_ventry	el0_error			// Error 64-bit EL0
+	kernel_ventry	0, sync				// Synchronous 64-bit EL0
+	kernel_ventry	0, irq				// IRQ 64-bit EL0
+	kernel_ventry	0, fiq_invalid			// FIQ 64-bit EL0
+	kernel_ventry	0, error			// Error 64-bit EL0
 
 #ifdef CONFIG_COMPAT
-	kernel_ventry	el0_sync_compat			// Synchronous 32-bit EL0
-	kernel_ventry	el0_irq_compat			// IRQ 32-bit EL0
-	kernel_ventry	el0_fiq_invalid_compat		// FIQ 32-bit EL0
-	kernel_ventry	el0_error_compat		// Error 32-bit EL0
+	kernel_ventry	0, sync_compat, 32		// Synchronous 32-bit EL0
+	kernel_ventry	0, irq_compat, 32		// IRQ 32-bit EL0
+	kernel_ventry	0, fiq_invalid_compat, 32	// FIQ 32-bit EL0
+	kernel_ventry	0, error_compat, 32		// Error 32-bit EL0
 #else
-	kernel_ventry	el0_sync_invalid		// Synchronous 32-bit EL0
-	kernel_ventry	el0_irq_invalid			// IRQ 32-bit EL0
-	kernel_ventry	el0_fiq_invalid			// FIQ 32-bit EL0
-	kernel_ventry	el0_error_invalid		// Error 32-bit EL0
+	kernel_ventry	0, sync_invalid, 32		// Synchronous 32-bit EL0
+	kernel_ventry	0, irq_invalid, 32		// IRQ 32-bit EL0
+	kernel_ventry	0, fiq_invalid, 32		// FIQ 32-bit EL0
+	kernel_ventry	0, error_invalid, 32		// Error 32-bit EL0
 #endif
 END(vectors)
 

From 4bf3286d29f3a88425d8d8cd53428cbb8f865f04 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 14 Nov 2017 14:24:29 +0000
Subject: [PATCH 013/104] arm64: entry: Hook up entry trampoline to exception
 vectors

Hook up the entry trampoline to our exception vectors so that all
exceptions from and returns to EL0 go via the trampoline, which swizzles
the vector base register accordingly. Transitioning to and from the
kernel clobbers x30, so we use tpidrro_el0 and far_el1 as scratch
registers for native tasks.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/kernel/entry.S | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index b99fc928119c..39e3873b8d5a 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -73,6 +73,17 @@
 
 	.macro kernel_ventry, el, label, regsize = 64
 	.align 7
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+	.if	\el == 0
+	.if	\regsize == 64
+	mrs	x30, tpidrro_el0
+	msr	tpidrro_el0, xzr
+	.else
+	mov	x30, xzr
+	.endif
+	.endif
+#endif
+
 	sub	sp, sp, #S_FRAME_SIZE
 #ifdef CONFIG_VMAP_STACK
 	/*
@@ -119,6 +130,11 @@
 	b	el\()\el\()_\label
 	.endm
 
+	.macro tramp_alias, dst, sym
+	mov_q	\dst, TRAMP_VALIAS
+	add	\dst, \dst, #(\sym - .entry.tramp.text)
+	.endm
+
 	.macro	kernel_entry, el, regsize = 64
 	.if	\regsize == 32
 	mov	w0, w0				// zero upper 32 bits of x0
@@ -271,18 +287,20 @@ alternative_else_nop_endif
 	.if	\el == 0
 	ldr	x23, [sp, #S_SP]		// load return stack pointer
 	msr	sp_el0, x23
+	tst	x22, #PSR_MODE32_BIT		// native task?
+	b.eq	3f
+
 #ifdef CONFIG_ARM64_ERRATUM_845719
 alternative_if ARM64_WORKAROUND_845719
-	tbz	x22, #4, 1f
 #ifdef CONFIG_PID_IN_CONTEXTIDR
 	mrs	x29, contextidr_el1
 	msr	contextidr_el1, x29
 #else
 	msr contextidr_el1, xzr
 #endif
-1:
 alternative_else_nop_endif
 #endif
+3:
 	.endif
 
 	msr	elr_el1, x21			// set up the return data
@@ -304,7 +322,22 @@ alternative_else_nop_endif
 	ldp	x28, x29, [sp, #16 * 14]
 	ldr	lr, [sp, #S_LR]
 	add	sp, sp, #S_FRAME_SIZE		// restore sp
-	eret					// return to kernel
+
+#ifndef CONFIG_UNMAP_KERNEL_AT_EL0
+	eret
+#else
+	.if	\el == 0
+	bne	4f
+	msr	far_el1, x30
+	tramp_alias	x30, tramp_exit_native
+	br	x30
+4:
+	tramp_alias	x30, tramp_exit_compat
+	br	x30
+	.else
+	eret
+	.endif
+#endif
 	.endm
 
 	.macro	irq_stack_entry

From d1777e686ad10ba7c594304429c6045fb79255a1 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 14 Nov 2017 14:29:19 +0000
Subject: [PATCH 014/104] arm64: erratum: Work around Falkor erratum #E1003 in
 trampoline code

We rely on an atomic swizzling of TTBR1 when transitioning from the entry
trampoline to the kernel proper on an exception. We can't rely on this
atomicity in the face of Falkor erratum #E1003, so on affected cores we
can issue a TLB invalidation to invalidate the walk cache prior to
jumping into the kernel. There is still the possibility of a TLB conflict
here due to conflicting walk cache entries prior to the invalidation, but
this doesn't appear to be the case on these CPUs in practice.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/Kconfig        | 17 +++++------------
 arch/arm64/kernel/entry.S | 12 ++++++++++++
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a93339f5178f..fdcc7b9bb15d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -522,20 +522,13 @@ config CAVIUM_ERRATUM_30115
 config QCOM_FALKOR_ERRATUM_1003
 	bool "Falkor E1003: Incorrect translation due to ASID change"
 	default y
-	select ARM64_PAN if ARM64_SW_TTBR0_PAN
 	help
 	  On Falkor v1, an incorrect ASID may be cached in the TLB when ASID
-	  and BADDR are changed together in TTBRx_EL1. The workaround for this
-	  issue is to use a reserved ASID in cpu_do_switch_mm() before
-	  switching to the new ASID. Saying Y here selects ARM64_PAN if
-	  ARM64_SW_TTBR0_PAN is selected. This is done because implementing and
-	  maintaining the E1003 workaround in the software PAN emulation code
-	  would be an unnecessary complication. The affected Falkor v1 CPU
-	  implements ARMv8.1 hardware PAN support and using hardware PAN
-	  support versus software PAN emulation is mutually exclusive at
-	  runtime.
-
-	  If unsure, say Y.
+	  and BADDR are changed together in TTBRx_EL1. Since we keep the ASID
+	  in TTBR1_EL1, this situation only occurs in the entry trampoline and
+	  then only for entries in the walk cache, since the leaf translation
+	  is unchanged. Work around the erratum by invalidating the walk cache
+	  entries for the trampoline before entering the kernel proper.
 
 config QCOM_FALKOR_ERRATUM_1009
 	bool "Falkor E1009: Prematurely complete a DSB after a TLBI"
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 39e3873b8d5a..ce56592b5f70 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -989,6 +989,18 @@ __ni_sys_trace:
 	sub	\tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE)
 	bic	\tmp, \tmp, #USER_ASID_FLAG
 	msr	ttbr1_el1, \tmp
+#ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003
+alternative_if ARM64_WORKAROUND_QCOM_FALKOR_E1003
+	/* ASID already in \tmp[63:48] */
+	movk	\tmp, #:abs_g2_nc:(TRAMP_VALIAS >> 12)
+	movk	\tmp, #:abs_g1_nc:(TRAMP_VALIAS >> 12)
+	/* 2MB boundary containing the vectors, so we nobble the walk cache */
+	movk	\tmp, #:abs_g0_nc:((TRAMP_VALIAS & ~(SZ_2M - 1)) >> 12)
+	isb
+	tlbi	vae1, \tmp
+	dsb	nsh
+alternative_else_nop_endif
+#endif /* CONFIG_QCOM_FALKOR_ERRATUM_1003 */
 	.endm
 
 	.macro tramp_unmap_kernel, tmp

From 18011eac28c7cb31c87b86b7d0e5b01894405c7f Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 14 Nov 2017 14:33:28 +0000
Subject: [PATCH 015/104] arm64: tls: Avoid unconditional zeroing of
 tpidrro_el0 for native tasks

When unmapping the kernel at EL0, we use tpidrro_el0 as a scratch register
during exception entry from native tasks and subsequently zero it in
the kernel_ventry macro. We can therefore avoid zeroing tpidrro_el0
in the context-switch path for native tasks using the entry trampoline.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/kernel/process.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 6b7dcf4310ac..583fd8154695 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -370,16 +370,14 @@ void tls_preserve_current_state(void)
 
 static void tls_thread_switch(struct task_struct *next)
 {
-	unsigned long tpidr, tpidrro;
-
 	tls_preserve_current_state();
 
-	tpidr = *task_user_tls(next);
-	tpidrro = is_compat_thread(task_thread_info(next)) ?
-		  next->thread.tp_value : 0;
+	if (is_compat_thread(task_thread_info(next)))
+		write_sysreg(next->thread.tp_value, tpidrro_el0);
+	else if (!arm64_kernel_unmapped_at_el0())
+		write_sysreg(0, tpidrro_el0);
 
-	write_sysreg(tpidr, tpidr_el0);
-	write_sysreg(tpidrro, tpidrro_el0);
+	write_sysreg(*task_user_tls(next), tpidr_el0);
 }
 
 /* Restore the UAO state depending on next's addr_limit */

From ea1e3de85e94d711f63437c04624aa0e8de5c8b3 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 14 Nov 2017 14:38:19 +0000
Subject: [PATCH 016/104] arm64: entry: Add fake CPU feature for unmapping the
 kernel at EL0

Allow explicit disabling of the entry trampoline on the kernel command
line (kpti=off) by adding a fake CPU feature (ARM64_UNMAP_KERNEL_AT_EL0)
that can be used to toggle the alternative sequences in our entry code and
avoid use of the trampoline altogether if desired. This also allows us to
make use of a static key in arm64_kernel_unmapped_at_el0().

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/cpucaps.h |  3 ++-
 arch/arm64/include/asm/mmu.h     |  3 ++-
 arch/arm64/kernel/cpufeature.c   | 41 ++++++++++++++++++++++++++++++++
 arch/arm64/kernel/entry.S        |  9 +++----
 4 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 2ff7c5e8efab..b4537ffd1018 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -41,7 +41,8 @@
 #define ARM64_WORKAROUND_CAVIUM_30115		20
 #define ARM64_HAS_DCPOP				21
 #define ARM64_SVE				22
+#define ARM64_UNMAP_KERNEL_AT_EL0		23
 
-#define ARM64_NCAPS				23
+#define ARM64_NCAPS				24
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index c07954638658..da6f12e40714 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -36,7 +36,8 @@ typedef struct {
 
 static inline bool arm64_kernel_unmapped_at_el0(void)
 {
-	return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0);
+	return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) &&
+	       cpus_have_const_cap(ARM64_UNMAP_KERNEL_AT_EL0);
 }
 
 extern void paging_init(void);
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index c5ba0097887f..9f0545dfe497 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -845,6 +845,40 @@ static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unus
 					ID_AA64PFR0_FP_SHIFT) < 0;
 }
 
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */
+
+static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry,
+				int __unused)
+{
+	/* Forced on command line? */
+	if (__kpti_forced) {
+		pr_info_once("kernel page table isolation forced %s by command line option\n",
+			     __kpti_forced > 0 ? "ON" : "OFF");
+		return __kpti_forced > 0;
+	}
+
+	/* Useful for KASLR robustness */
+	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
+		return true;
+
+	return false;
+}
+
+static int __init parse_kpti(char *str)
+{
+	bool enabled;
+	int ret = strtobool(str, &enabled);
+
+	if (ret)
+		return ret;
+
+	__kpti_forced = enabled ? 1 : -1;
+	return 0;
+}
+__setup("kpti=", parse_kpti);
+#endif	/* CONFIG_UNMAP_KERNEL_AT_EL0 */
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
 	{
 		.desc = "GIC system register CPU interface",
@@ -931,6 +965,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.def_scope = SCOPE_SYSTEM,
 		.matches = hyp_offset_low,
 	},
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+	{
+		.capability = ARM64_UNMAP_KERNEL_AT_EL0,
+		.def_scope = SCOPE_SYSTEM,
+		.matches = unmap_kernel_at_el0,
+	},
+#endif
 	{
 		/* FP/SIMD is not implemented */
 		.capability = ARM64_HAS_NO_FPSIMD,
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index ce56592b5f70..5d51bdbb2131 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -74,6 +74,7 @@
 	.macro kernel_ventry, el, label, regsize = 64
 	.align 7
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+alternative_if ARM64_UNMAP_KERNEL_AT_EL0
 	.if	\el == 0
 	.if	\regsize == 64
 	mrs	x30, tpidrro_el0
@@ -82,6 +83,7 @@
 	mov	x30, xzr
 	.endif
 	.endif
+alternative_else_nop_endif
 #endif
 
 	sub	sp, sp, #S_FRAME_SIZE
@@ -323,10 +325,9 @@ alternative_else_nop_endif
 	ldr	lr, [sp, #S_LR]
 	add	sp, sp, #S_FRAME_SIZE		// restore sp
 
-#ifndef CONFIG_UNMAP_KERNEL_AT_EL0
-	eret
-#else
 	.if	\el == 0
+alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 	bne	4f
 	msr	far_el1, x30
 	tramp_alias	x30, tramp_exit_native
@@ -334,10 +335,10 @@ alternative_else_nop_endif
 4:
 	tramp_alias	x30, tramp_exit_compat
 	br	x30
+#endif
 	.else
 	eret
 	.endif
-#endif
 	.endm
 
 	.macro	irq_stack_entry

From 084eb77cd3a81134d02500977dc0ecc9277dc97d Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 14 Nov 2017 14:41:01 +0000
Subject: [PATCH 017/104] arm64: Kconfig: Add CONFIG_UNMAP_KERNEL_AT_EL0

Add a Kconfig entry to control use of the entry trampoline, which allows
us to unmap the kernel whilst running in userspace and improve the
robustness of KASLR.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/Kconfig | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index fdcc7b9bb15d..3af1657fcac3 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -833,6 +833,19 @@ config FORCE_MAX_ZONEORDER
 	  However for 4K, we choose a higher default value, 11 as opposed to 10, giving us
 	  4M allocations matching the default size used by generic code.
 
+config UNMAP_KERNEL_AT_EL0
+	bool "Unmap kernel when running in userspace (aka \"KAISER\")"
+	default y
+	help
+	  Some attacks against KASLR make use of the timing difference between
+	  a permission fault which could arise from a page table entry that is
+	  present in the TLB, and a translation fault which always requires a
+	  page table walk. This option defends against these attacks by unmapping
+	  the kernel whilst running in userspace, therefore forcing translation
+	  faults for all of kernel space.
+
+	  If unsure, say Y.
+
 menuconfig ARMV8_DEPRECATED
 	bool "Emulate deprecated/obsolete ARMv8 instructions"
 	depends on COMPAT

From 7a4a0c1555b824e0d3dd72942481b1190abea604 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Mon, 27 Nov 2017 15:49:53 +0000
Subject: [PATCH 018/104] perf: arm_spe: Fail device probe when
 arm64_kernel_unmapped_at_el0()

When running with the kernel unmapped whilst at EL0, the virtually-addressed
SPE buffer is also unmapped, which can lead to buffer faults if userspace
profiling is enabled and potentially also when writing back kernel samples
unless an expensive drain operation is performed on exception return.

For now, fail the SPE driver probe when arm64_kernel_unmapped_at_el0().

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/perf/arm_spe_pmu.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
index 8ce262fc2561..51b40aecb776 100644
--- a/drivers/perf/arm_spe_pmu.c
+++ b/drivers/perf/arm_spe_pmu.c
@@ -1164,6 +1164,15 @@ static int arm_spe_pmu_device_dt_probe(struct platform_device *pdev)
 	struct arm_spe_pmu *spe_pmu;
 	struct device *dev = &pdev->dev;
 
+	/*
+	 * If kernelspace is unmapped when running at EL0, then the SPE
+	 * buffer will fault and prematurely terminate the AUX session.
+	 */
+	if (arm64_kernel_unmapped_at_el0()) {
+		dev_warn_once(dev, "profiling buffer inaccessible. Try passing \"kpti=off\" on the kernel command line\n");
+		return -EPERM;
+	}
+
 	spe_pmu = devm_kzalloc(dev, sizeof(*spe_pmu), GFP_KERNEL);
 	if (!spe_pmu) {
 		dev_err(dev, "failed to allocate spe_pmu\n");

From b519538dfefc2f8478a1bcb458459c861d431784 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Fri, 1 Dec 2017 17:33:48 +0000
Subject: [PATCH 019/104] arm64: mm: Introduce TTBR_ASID_MASK for getting at
 the ASID in the TTBR

There are now a handful of open-coded masks to extract the ASID from a
TTBR value, so introduce a TTBR_ASID_MASK and use that instead.

Suggested-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/asm-uaccess.h | 3 ++-
 arch/arm64/include/asm/mmu.h         | 1 +
 arch/arm64/include/asm/uaccess.h     | 4 ++--
 arch/arm64/kernel/entry.S            | 2 +-
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 21b8cf304028..f4f234b6155e 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -4,6 +4,7 @@
 
 #include <asm/alternative.h>
 #include <asm/kernel-pgtable.h>
+#include <asm/mmu.h>
 #include <asm/sysreg.h>
 #include <asm/assembler.h>
 
@@ -17,7 +18,7 @@
 	msr	ttbr0_el1, \tmp1		// set reserved TTBR0_EL1
 	isb
 	sub	\tmp1, \tmp1, #SWAPPER_DIR_SIZE
-	bic	\tmp1, \tmp1, #(0xffff << 48)
+	bic	\tmp1, \tmp1, #TTBR_ASID_MASK
 	msr	ttbr1_el1, \tmp1		// set reserved ASID
 	isb
 	.endm
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index da6f12e40714..6f7bdb89817f 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -18,6 +18,7 @@
 
 #define MMCF_AARCH32	0x1	/* mm context flag for AArch32 executables */
 #define USER_ASID_FLAG	(UL(1) << 48)
+#define TTBR_ASID_MASK	(UL(0xffff) << 48)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 750a3b76a01c..6eadf55ebaf0 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -112,7 +112,7 @@ static inline void __uaccess_ttbr0_disable(void)
 	write_sysreg(ttbr + SWAPPER_DIR_SIZE, ttbr0_el1);
 	isb();
 	/* Set reserved ASID */
-	ttbr &= ~(0xffffUL << 48);
+	ttbr &= ~TTBR_ASID_MASK;
 	write_sysreg(ttbr, ttbr1_el1);
 	isb();
 }
@@ -131,7 +131,7 @@ static inline void __uaccess_ttbr0_enable(void)
 
 	/* Restore active ASID */
 	ttbr1 = read_sysreg(ttbr1_el1);
-	ttbr1 |= ttbr0 & (0xffffUL << 48);
+	ttbr1 |= ttbr0 & TTBR_ASID_MASK;
 	write_sysreg(ttbr1, ttbr1_el1);
 	isb();
 
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 5d51bdbb2131..3eabcb194c87 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -205,7 +205,7 @@ alternative_else_nop_endif
 
 	.if	\el != 0
 	mrs	x21, ttbr1_el1
-	tst	x21, #0xffff << 48		// Check for the reserved ASID
+	tst	x21, #TTBR_ASID_MASK		// Check for the reserved ASID
 	orr	x23, x23, #PSR_PAN_BIT		// Set the emulated PAN in the saved SPSR
 	b.eq	1f				// TTBR0 access already disabled
 	and	x23, x23, #~PSR_PAN_BIT		// Clear the emulated PAN in the saved SPSR

From 6c27c4082f4f70b9f41df4d0adf51128b40351df Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Wed, 6 Dec 2017 11:24:02 +0000
Subject: [PATCH 020/104] arm64: kaslr: Put kernel vectors address in separate
 data page

The literal pool entry for identifying the vectors base is the only piece
of information in the trampoline page that identifies the true location
of the kernel.

This patch moves it into a page-aligned region of the .rodata section
and maps this adjacent to the trampoline text via an additional fixmap
entry, which protects against any accidental leakage of the trampoline
contents.

Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Laura Abbott <labbott@redhat.com>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/fixmap.h |  1 +
 arch/arm64/kernel/entry.S       | 14 ++++++++++++++
 arch/arm64/kernel/vmlinux.lds.S |  5 ++++-
 arch/arm64/mm/mmu.c             | 10 +++++++++-
 4 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h
index 8119b49be98d..ec1e6d6fa14c 100644
--- a/arch/arm64/include/asm/fixmap.h
+++ b/arch/arm64/include/asm/fixmap.h
@@ -59,6 +59,7 @@ enum fixed_addresses {
 #endif /* CONFIG_ACPI_APEI_GHES */
 
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+	FIX_ENTRY_TRAMP_DATA,
 	FIX_ENTRY_TRAMP_TEXT,
 #define TRAMP_VALIAS		(__fix_to_virt(FIX_ENTRY_TRAMP_TEXT))
 #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 3eabcb194c87..031392ee5f47 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -1030,7 +1030,13 @@ alternative_else_nop_endif
 	msr	tpidrro_el0, x30	// Restored in kernel_ventry
 	.endif
 	tramp_map_kernel	x30
+#ifdef CONFIG_RANDOMIZE_BASE
+	adr	x30, tramp_vectors + PAGE_SIZE
+alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003
+	ldr	x30, [x30]
+#else
 	ldr	x30, =vectors
+#endif
 	prfm	plil1strm, [x30, #(1b - tramp_vectors)]
 	msr	vbar_el1, x30
 	add	x30, x30, #(1b - tramp_vectors)
@@ -1073,6 +1079,14 @@ END(tramp_exit_compat)
 
 	.ltorg
 	.popsection				// .entry.tramp.text
+#ifdef CONFIG_RANDOMIZE_BASE
+	.pushsection ".rodata", "a"
+	.align PAGE_SHIFT
+	.globl	__entry_tramp_data_start
+__entry_tramp_data_start:
+	.quad	vectors
+	.popsection				// .rodata
+#endif /* CONFIG_RANDOMIZE_BASE */
 #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
 
 /*
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 6b4260f22aab..ddfd3c0942f7 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -251,7 +251,10 @@ ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
 ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1))
 	<= SZ_4K, "Hibernate exit text too big or misaligned")
 #endif
-
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE,
+	"Entry trampoline text too big")
+#endif
 /*
  * If padding is applied before .head.text, virt<->phys conversions will fail.
  */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index fe68a48c64cb..916d9ced1c3f 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -541,8 +541,16 @@ static int __init map_entry_trampoline(void)
 	__create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE,
 			     prot, pgd_pgtable_alloc, 0);
 
-	/* ...as well as the kernel page table */
+	/* Map both the text and data into the kernel page table */
 	__set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot);
+	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
+		extern char __entry_tramp_data_start[];
+
+		__set_fixmap(FIX_ENTRY_TRAMP_DATA,
+			     __pa_symbol(__entry_tramp_data_start),
+			     PAGE_KERNEL_RO);
+	}
+
 	return 0;
 }
 core_initcall(map_entry_trampoline);

From 982aa7c5f0861bf56b2412ca341a13f44c238ba4 Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Wed, 13 Dec 2017 17:07:16 +0000
Subject: [PATCH 021/104] arm64: add kconfig symbol to configure physical
 address size

ARMv8.2 introduces support for 52-bit physical addresses. To prepare for
supporting this, add a new kconfig symbol to configure the physical
address space size. The symbols will be used in subsequent patches.
Currently the only choice is 48, a later patch will add the option of 52
once the required code is in place.

Tested-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Tested-by: Bob Picco <bob.picco@oracle.com>
Reviewed-by: Bob Picco <bob.picco@oracle.com>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
[catalin.marinas@arm.com: folded minor patches into this one]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/Kconfig                     | 16 ++++++++++++++++
 arch/arm64/include/asm/pgtable-hwdef.h |  2 +-
 arch/arm64/include/asm/sparsemem.h     |  2 +-
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a93339f5178f..8dc937823eeb 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -646,6 +646,22 @@ config ARM64_VA_BITS
 	default 47 if ARM64_VA_BITS_47
 	default 48 if ARM64_VA_BITS_48
 
+choice
+	prompt "Physical address space size"
+	default ARM64_PA_BITS_48
+	help
+	  Choose the maximum physical address range that the kernel will
+	  support.
+
+config ARM64_PA_BITS_48
+	bool "48-bit"
+
+endchoice
+
+config ARM64_PA_BITS
+	int
+	default 48 if ARM64_PA_BITS_48
+
 config CPU_BIG_ENDIAN
        bool "Build big-endian kernel"
        help
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index eb0c2bd90de9..c1de9f67980b 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -196,7 +196,7 @@
 /*
  * Highest possible physical address supported.
  */
-#define PHYS_MASK_SHIFT		(48)
+#define PHYS_MASK_SHIFT		(CONFIG_ARM64_PA_BITS)
 #define PHYS_MASK		((UL(1) << PHYS_MASK_SHIFT) - 1)
 
 /*
diff --git a/arch/arm64/include/asm/sparsemem.h b/arch/arm64/include/asm/sparsemem.h
index 74a9d301819f..b299929fe56c 100644
--- a/arch/arm64/include/asm/sparsemem.h
+++ b/arch/arm64/include/asm/sparsemem.h
@@ -17,7 +17,7 @@
 #define __ASM_SPARSEMEM_H
 
 #ifdef CONFIG_SPARSEMEM
-#define MAX_PHYSMEM_BITS	48
+#define MAX_PHYSMEM_BITS	CONFIG_ARM64_PA_BITS
 #define SECTION_SIZE_BITS	30
 #endif
 

From 787fd1d019b269af7912249231dfe34a5fe3e7c8 Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Wed, 13 Dec 2017 17:07:17 +0000
Subject: [PATCH 022/104] arm64: limit PA size to supported range

We currently copy the physical address size from
ID_AA64MMFR0_EL1.PARange directly into TCR.(I)PS. This will not work for
4k and 16k granule kernels on systems that support 52-bit physical
addresses, since 52-bit addresses are only permitted with the 64k
granule.

To fix this, fall back to 48 bits when configuring the PA size when the
kernel does not support 52-bit PAs. When it does, fall back to 52, to
avoid similar problems in the future if the PA size is ever increased
above 52.

Tested-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Bob Picco <bob.picco@oracle.com>
Reviewed-by: Bob Picco <bob.picco@oracle.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
[catalin.marinas@arm.com: tcr_set_pa_size macro renamed to tcr_compute_pa_size]
[catalin.marinas@arm.com: comments added to tcr_compute_pa_size]
[catalin.marinas@arm.com: definitions added for TCR_*PS_SHIFT]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/assembler.h     | 18 ++++++++++++++++++
 arch/arm64/include/asm/pgtable-hwdef.h |  2 ++
 arch/arm64/include/asm/sysreg.h        |  8 ++++++++
 arch/arm64/kvm/hyp-init.S              |  6 ++----
 arch/arm64/kvm/hyp/s2-setup.c          |  2 ++
 arch/arm64/mm/proc.S                   |  6 ++----
 6 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index aef72d886677..04a92307e6c1 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -350,6 +350,24 @@ alternative_endif
 #endif
 	.endm
 
+/*
+ * tcr_compute_pa_size - set TCR.(I)PS to the highest supported
+ * ID_AA64MMFR0_EL1.PARange value
+ *
+ *	tcr:		register with the TCR_ELx value to be updated
+ *	pos:		PARange bitfield position
+ *	tmp{0,1}:	temporary registers
+ */
+	.macro	tcr_compute_pa_size, tcr, pos, tmp0, tmp1
+	mrs	\tmp0, ID_AA64MMFR0_EL1
+	// Narrow PARange to fit the PS field in TCR_ELx
+	ubfx	\tmp0, \tmp0, #ID_AA64MMFR0_PARANGE_SHIFT, #3
+	mov	\tmp1, #ID_AA64MMFR0_PARANGE_MAX
+	cmp	\tmp0, \tmp1
+	csel	\tmp0, \tmp1, \tmp0, hi
+	bfi	\tcr, \tmp0, \pos, #3
+	.endm
+
 /*
  * Macro to perform a data cache maintenance for the interval
  * [kaddr, kaddr + size)
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index c1de9f67980b..9be2e9371c52 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -272,6 +272,8 @@
 #define TCR_TG1_4K		(UL(2) << TCR_TG1_SHIFT)
 #define TCR_TG1_64K		(UL(3) << TCR_TG1_SHIFT)
 
+#define TCR_IPS_SHIFT		32
+#define TCR_IPS_MASK		(UL(7) << TCR_IPS_SHIFT)
 #define TCR_ASID16		(UL(1) << 36)
 #define TCR_TBI0		(UL(1) << 37)
 #define TCR_HA			(UL(1) << 39)
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 08cc88574659..ec144f480b39 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -471,6 +471,14 @@
 #define ID_AA64MMFR0_TGRAN64_SUPPORTED	0x0
 #define ID_AA64MMFR0_TGRAN16_NI		0x0
 #define ID_AA64MMFR0_TGRAN16_SUPPORTED	0x1
+#define ID_AA64MMFR0_PARANGE_48		0x5
+#define ID_AA64MMFR0_PARANGE_52		0x6
+
+#ifdef CONFIG_ARM64_PA_BITS_52
+#define ID_AA64MMFR0_PARANGE_MAX	ID_AA64MMFR0_PARANGE_52
+#else
+#define ID_AA64MMFR0_PARANGE_MAX	ID_AA64MMFR0_PARANGE_48
+#endif
 
 /* id_aa64mmfr1 */
 #define ID_AA64MMFR1_PAN_SHIFT		20
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index 3f9615582377..e2d1fe03662a 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -90,11 +90,9 @@ __do_hyp_init:
 	bfi	x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
 #endif
 	/*
-	 * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in
-	 * TCR_EL2.
+	 * Set the PS bits in TCR_EL2.
 	 */
-	mrs	x5, ID_AA64MMFR0_EL1
-	bfi	x4, x5, #16, #3
+	tcr_compute_pa_size x4, #TCR_EL2_PS_SHIFT, x5, x6
 
 	msr	tcr_el2, x4
 
diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c
index a81f5e10fc8c..603e1ee83e89 100644
--- a/arch/arm64/kvm/hyp/s2-setup.c
+++ b/arch/arm64/kvm/hyp/s2-setup.c
@@ -32,6 +32,8 @@ u32 __hyp_text __init_stage2_translation(void)
 	 * PS is only 3. Fortunately, bit 19 is RES0 in VTCR_EL2...
 	 */
 	parange = read_sysreg(id_aa64mmfr0_el1) & 7;
+	if (parange > ID_AA64MMFR0_PARANGE_MAX)
+		parange = ID_AA64MMFR0_PARANGE_MAX;
 	val |= parange << 16;
 
 	/* Compute the actual PARange... */
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 95233dfc4c39..4f133cb340dc 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -228,11 +228,9 @@ ENTRY(__cpu_setup)
 	tcr_set_idmap_t0sz	x10, x9
 
 	/*
-	 * Read the PARange bits from ID_AA64MMFR0_EL1 and set the IPS bits in
-	 * TCR_EL1.
+	 * Set the IPS bits in TCR_EL1.
 	 */
-	mrs	x9, ID_AA64MMFR0_EL1
-	bfi	x10, x9, #32, #3
+	tcr_compute_pa_size x10, #TCR_IPS_SHIFT, x5, x6
 #ifdef CONFIG_ARM64_HW_AFDBM
 	/*
 	 * Hardware update of the Access and Dirty bits.

From 529c4b05a3cb2f324aac347042ee6d641478e946 Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Wed, 13 Dec 2017 17:07:18 +0000
Subject: [PATCH 023/104] arm64: handle 52-bit addresses in TTBR

The top 4 bits of a 52-bit physical address are positioned at bits 2..5
in the TTBR registers. Introduce a couple of macros to move the bits
there, and change all TTBR writers to use them.

Leave TTBR0 PAN code unchanged, to avoid complicating it. A system with
52-bit PA will have PAN anyway (because it's ARMv8.1 or later), and a
system without 52-bit PA can only use up to 48-bit PAs. A later patch in
this series will add a kconfig dependency to ensure PAN is configured.

In addition, when using 52-bit PA there is a special alignment
requirement on the top-level table. We don't currently have any VA_BITS
configuration that would violate the requirement, but one could be added
in the future, so add a compile-time BUG_ON to check for it.

Tested-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Bob Picco <bob.picco@oracle.com>
Reviewed-by: Bob Picco <bob.picco@oracle.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
[catalin.marinas@arm.com: added TTBR_BADD_MASK_52 comment]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm/include/asm/kvm_mmu.h         |  2 ++
 arch/arm64/include/asm/assembler.h     | 16 ++++++++++++++++
 arch/arm64/include/asm/kvm_mmu.h       |  2 ++
 arch/arm64/include/asm/mmu_context.h   |  2 +-
 arch/arm64/include/asm/pgtable-hwdef.h | 13 +++++++++++++
 arch/arm64/include/asm/pgtable.h       |  6 ++++++
 arch/arm64/kernel/head.S               |  6 ++++--
 arch/arm64/kernel/hibernate-asm.S      | 12 +++++++-----
 arch/arm64/kernel/hibernate.c          |  2 +-
 arch/arm64/kvm/hyp-init.S              |  3 ++-
 arch/arm64/mm/pgd.c                    |  8 ++++++++
 arch/arm64/mm/proc.S                   | 13 ++++++++-----
 virt/kvm/arm/arm.c                     |  2 +-
 13 files changed, 71 insertions(+), 16 deletions(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index fa6f2174276b..8dbec683638b 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -221,6 +221,8 @@ static inline unsigned int kvm_get_vmid_bits(void)
 	return 8;
 }
 
+#define kvm_phys_to_vttbr(addr)		(addr)
+
 #endif	/* !__ASSEMBLY__ */
 
 #endif /* __ARM_KVM_MMU_H__ */
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 04a92307e6c1..49ea3def4bd1 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -530,4 +530,20 @@ alternative_else_nop_endif
 #endif
 	.endm
 
+/*
+ * Arrange a physical address in a TTBR register, taking care of 52-bit
+ * addresses.
+ *
+ * 	phys:	physical address, preserved
+ * 	ttbr:	returns the TTBR value
+ */
+	.macro	phys_to_ttbr, phys, ttbr
+#ifdef CONFIG_ARM64_PA_BITS_52
+	orr	\ttbr, \phys, \phys, lsr #46
+	and	\ttbr, \ttbr, #TTBR_BADDR_MASK_52
+#else
+	mov	\ttbr, \phys
+#endif
+	.endm
+
 #endif	/* __ASM_ASSEMBLER_H */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 672c8684d5c2..747bfff92948 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -309,5 +309,7 @@ static inline unsigned int kvm_get_vmid_bits(void)
 	return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8;
 }
 
+#define kvm_phys_to_vttbr(addr)		phys_to_ttbr(addr)
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ARM64_KVM_MMU_H__ */
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 9d155fa9a507..accc2ff32a0e 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -51,7 +51,7 @@ static inline void contextidr_thread_switch(struct task_struct *next)
  */
 static inline void cpu_set_reserved_ttbr0(void)
 {
-	unsigned long ttbr = __pa_symbol(empty_zero_page);
+	unsigned long ttbr = phys_to_ttbr(__pa_symbol(empty_zero_page));
 
 	write_sysreg(ttbr, ttbr0_el1);
 	isb();
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 9be2e9371c52..f92be11a209a 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -16,6 +16,8 @@
 #ifndef __ASM_PGTABLE_HWDEF_H
 #define __ASM_PGTABLE_HWDEF_H
 
+#include <asm/memory.h>
+
 /*
  * Number of page-table levels required to address 'va_bits' wide
  * address, without section mapping. We resolve the top (va_bits - PAGE_SHIFT)
@@ -279,4 +281,15 @@
 #define TCR_HA			(UL(1) << 39)
 #define TCR_HD			(UL(1) << 40)
 
+/*
+ * TTBR.
+ */
+#ifdef CONFIG_ARM64_PA_BITS_52
+/*
+ * This should be GENMASK_ULL(47, 2).
+ * TTBR_ELx[1] is RES0 in this configuration.
+ */
+#define TTBR_BADDR_MASK_52	(((UL(1) << 46) - 1) << 2)
+#endif
+
 #endif
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 149d05fb9421..93677b9db947 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -733,6 +733,12 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
 #define kc_vaddr_to_offset(v)	((v) & ~VA_START)
 #define kc_offset_to_vaddr(o)	((o) | VA_START)
 
+#ifdef CONFIG_ARM64_PA_BITS_52
+#define phys_to_ttbr(addr)	(((addr) | ((addr) >> 46)) & TTBR_BADDR_MASK_52)
+#else
+#define phys_to_ttbr(addr)	(addr)
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ASM_PGTABLE_H */
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 67e86a0f57ac..0addea3760a6 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -679,8 +679,10 @@ ENTRY(__enable_mmu)
 	update_early_cpu_boot_status 0, x1, x2
 	adrp	x1, idmap_pg_dir
 	adrp	x2, swapper_pg_dir
-	msr	ttbr0_el1, x1			// load TTBR0
-	msr	ttbr1_el1, x2			// load TTBR1
+	phys_to_ttbr x1, x3
+	phys_to_ttbr x2, x4
+	msr	ttbr0_el1, x3			// load TTBR0
+	msr	ttbr1_el1, x4			// load TTBR1
 	isb
 	msr	sctlr_el1, x0
 	isb
diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S
index e56d848b6466..84f5d52fddda 100644
--- a/arch/arm64/kernel/hibernate-asm.S
+++ b/arch/arm64/kernel/hibernate-asm.S
@@ -33,12 +33,14 @@
  * Even switching to our copied tables will cause a changed output address at
  * each stage of the walk.
  */
-.macro break_before_make_ttbr_switch zero_page, page_table
-	msr	ttbr1_el1, \zero_page
+.macro break_before_make_ttbr_switch zero_page, page_table, tmp
+	phys_to_ttbr \zero_page, \tmp
+	msr	ttbr1_el1, \tmp
 	isb
 	tlbi	vmalle1
 	dsb	nsh
-	msr	ttbr1_el1, \page_table
+	phys_to_ttbr \page_table, \tmp
+	msr	ttbr1_el1, \tmp
 	isb
 .endm
 
@@ -78,7 +80,7 @@ ENTRY(swsusp_arch_suspend_exit)
 	 * We execute from ttbr0, change ttbr1 to our copied linear map tables
 	 * with a break-before-make via the zero page
 	 */
-	break_before_make_ttbr_switch	x5, x0
+	break_before_make_ttbr_switch	x5, x0, x6
 
 	mov	x21, x1
 	mov	x30, x2
@@ -109,7 +111,7 @@ ENTRY(swsusp_arch_suspend_exit)
 	dsb	ish		/* wait for PoU cleaning to finish */
 
 	/* switch to the restored kernels page tables */
-	break_before_make_ttbr_switch	x25, x21
+	break_before_make_ttbr_switch	x25, x21, x6
 
 	ic	ialluis
 	dsb	ish
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 3009b8b80f08..efbf6dbd93c8 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -264,7 +264,7 @@ static int create_safe_exec_page(void *src_start, size_t length,
 	 */
 	cpu_set_reserved_ttbr0();
 	local_flush_tlb_all();
-	write_sysreg(virt_to_phys(pgd), ttbr0_el1);
+	write_sysreg(phys_to_ttbr(virt_to_phys(pgd)), ttbr0_el1);
 	isb();
 
 	*phys_dst_addr = virt_to_phys((void *)dst);
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index e2d1fe03662a..f9681cc00973 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -63,7 +63,8 @@ __do_hyp_init:
 	cmp	x0, #HVC_STUB_HCALL_NR
 	b.lo	__kvm_handle_stub_hvc
 
-	msr	ttbr0_el2, x0
+	phys_to_ttbr x0, x4
+	msr	ttbr0_el2, x4
 
 	mrs	x4, tcr_el1
 	ldr	x5, =TCR_EL2_MASK
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c
index 051e71ec3335..289f9113a27a 100644
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -49,6 +49,14 @@ void __init pgd_cache_init(void)
 	if (PGD_SIZE == PAGE_SIZE)
 		return;
 
+#ifdef CONFIG_ARM64_PA_BITS_52
+	/*
+	 * With 52-bit physical addresses, the architecture requires the
+	 * top-level table to be aligned to at least 64 bytes.
+	 */
+	BUILD_BUG_ON(PGD_SIZE < 64);
+#endif
+
 	/*
 	 * Naturally aligned pgds required by the architecture.
 	 */
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 4f133cb340dc..e79db5a7576a 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -138,10 +138,11 @@ ENDPROC(cpu_do_resume)
  *	- pgd_phys - physical address of new TTB
  */
 ENTRY(cpu_do_switch_mm)
-	pre_ttbr0_update_workaround x0, x2, x3
+	phys_to_ttbr x0, x2
+	pre_ttbr0_update_workaround x2, x3, x4
 	mmid	x1, x1				// get mm->context.id
-	bfi	x0, x1, #48, #16		// set the ASID
-	msr	ttbr0_el1, x0			// set TTBR0
+	bfi	x2, x1, #48, #16		// set the ASID
+	msr	ttbr0_el1, x2			// set TTBR0
 	isb
 	post_ttbr0_update_workaround
 	ret
@@ -158,14 +159,16 @@ ENTRY(idmap_cpu_replace_ttbr1)
 	save_and_disable_daif flags=x2
 
 	adrp	x1, empty_zero_page
-	msr	ttbr1_el1, x1
+	phys_to_ttbr x1, x3
+	msr	ttbr1_el1, x3
 	isb
 
 	tlbi	vmalle1
 	dsb	nsh
 	isb
 
-	msr	ttbr1_el1, x0
+	phys_to_ttbr x0, x3
+	msr	ttbr1_el1, x3
 	isb
 
 	restore_daif x2
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 6b60c98a6e22..c8d49879307f 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -509,7 +509,7 @@ static void update_vttbr(struct kvm *kvm)
 	pgd_phys = virt_to_phys(kvm->arch.pgd);
 	BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK);
 	vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits);
-	kvm->arch.vttbr = pgd_phys | vmid;
+	kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid;
 
 	spin_unlock(&kvm_vmid_lock);
 }

From e6d588a8e3da24ea51321279a064b97feb502ef0 Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Wed, 13 Dec 2017 17:07:19 +0000
Subject: [PATCH 024/104] arm64: head.S: handle 52-bit PAs in PTEs in early
 page table setup

The top 4 bits of a 52-bit physical address are positioned at bits
12..15 in page table entries. Introduce a macro to move the bits there,
and change the early ID map and swapper table setup code to use it.

Tested-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Bob Picco <bob.picco@oracle.com>
Reviewed-by: Bob Picco <bob.picco@oracle.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
[catalin.marinas@arm.com: additional comments for clarification]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/pgtable-hwdef.h |  6 ++++
 arch/arm64/kernel/head.S               | 40 ++++++++++++++++++++------
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index f92be11a209a..5513ccd687f4 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -168,6 +168,12 @@
 #define PTE_UXN			(_AT(pteval_t, 1) << 54)	/* User XN */
 #define PTE_HYP_XN		(_AT(pteval_t, 1) << 54)	/* HYP XN */
 
+#ifdef CONFIG_ARM64_PA_BITS_52
+#define PTE_ADDR_LOW		(((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
+#define PTE_ADDR_HIGH		(_AT(pteval_t, 0xf) << 12)
+#define PTE_ADDR_MASK_52	(PTE_ADDR_LOW | PTE_ADDR_HIGH)
+#endif
+
 /*
  * AttrIndx[2:0] encoding (mapping attributes defined in the MAIR* registers).
  */
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 0addea3760a6..bb06223691ba 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -147,6 +147,26 @@ preserve_boot_args:
 	b	__inval_dcache_area		// tail call
 ENDPROC(preserve_boot_args)
 
+/*
+ * Macro to arrange a physical address in a page table entry, taking care of
+ * 52-bit addresses.
+ *
+ * Preserves:	phys
+ * Returns:	pte
+ */
+	.macro	phys_to_pte, phys, pte
+#ifdef CONFIG_ARM64_PA_BITS_52
+	/*
+	 * We assume \phys is 64K aligned and this is guaranteed by only
+	 * supporting this configuration with 64K pages.
+	 */
+	orr	\pte, \phys, \phys, lsr #36
+	and	\pte, \pte, #PTE_ADDR_MASK_52
+#else
+	mov	\pte, \phys
+#endif
+	.endm
+
 /*
  * Macro to create a table entry to the next page.
  *
@@ -160,10 +180,11 @@ ENDPROC(preserve_boot_args)
  * Returns:	tbl -> next level table page address
  */
 	.macro	create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2
+	add	\tmp1, \tbl, #PAGE_SIZE
+	phys_to_pte \tmp1, \tmp2
+	orr	\tmp2, \tmp2, #PMD_TYPE_TABLE	// address of next table and entry type
 	lsr	\tmp1, \virt, #\shift
 	and	\tmp1, \tmp1, #\ptrs - 1	// table index
-	add	\tmp2, \tbl, #PAGE_SIZE
-	orr	\tmp2, \tmp2, #PMD_TYPE_TABLE	// address of next table and entry type
 	str	\tmp2, [\tbl, \tmp1, lsl #3]
 	add	\tbl, \tbl, #PAGE_SIZE		// next level table page
 	.endm
@@ -190,16 +211,17 @@ ENDPROC(preserve_boot_args)
  * virtual range (inclusive).
  *
  * Preserves:	tbl, flags
- * Corrupts:	phys, start, end, pstate
+ * Corrupts:	phys, start, end, tmp, pstate
  */
-	.macro	create_block_map, tbl, flags, phys, start, end
-	lsr	\phys, \phys, #SWAPPER_BLOCK_SHIFT
+	.macro	create_block_map, tbl, flags, phys, start, end, tmp
 	lsr	\start, \start, #SWAPPER_BLOCK_SHIFT
 	and	\start, \start, #PTRS_PER_PTE - 1	// table index
-	orr	\phys, \flags, \phys, lsl #SWAPPER_BLOCK_SHIFT	// table entry
+	bic	\phys, \phys, #SWAPPER_BLOCK_SIZE - 1
 	lsr	\end, \end, #SWAPPER_BLOCK_SHIFT
 	and	\end, \end, #PTRS_PER_PTE - 1		// table end index
-9999:	str	\phys, [\tbl, \start, lsl #3]		// store the entry
+9999:	phys_to_pte \phys, \tmp
+	orr	\tmp, \tmp, \flags			// table entry
+	str	\tmp, [\tbl, \start, lsl #3]		// store the entry
 	add	\start, \start, #1			// next entry
 	add	\phys, \phys, #SWAPPER_BLOCK_SIZE		// next block
 	cmp	\start, \end
@@ -286,7 +308,7 @@ __create_page_tables:
 	create_pgd_entry x0, x3, x5, x6
 	mov	x5, x3				// __pa(__idmap_text_start)
 	adr_l	x6, __idmap_text_end		// __pa(__idmap_text_end)
-	create_block_map x0, x7, x3, x5, x6
+	create_block_map x0, x7, x3, x5, x6, x4
 
 	/*
 	 * Map the kernel image (starting with PHYS_OFFSET).
@@ -299,7 +321,7 @@ __create_page_tables:
 	adrp	x3, _text			// runtime __pa(_text)
 	sub	x6, x6, x3			// _end - _text
 	add	x6, x6, x5			// runtime __va(_end)
-	create_block_map x0, x7, x3, x5, x6
+	create_block_map x0, x7, x3, x5, x6, x4
 
 	/*
 	 * Since the page tables have been populated with non-cacheable

From 193383043f14a398393dc18bae8380f7fe665ec3 Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Wed, 13 Dec 2017 17:07:20 +0000
Subject: [PATCH 025/104] arm64: don't open code page table entry creation

Instead of open coding the generation of page table entries, use the
macros/functions that exist for this - pfn_p*d and p*d_populate. Most
code in the kernel already uses these macros, this patch tries to fix
up the few places that don't. This is useful for the next patch in this
series, which needs to change the page table entry logic, and it's
better to have that logic in one place.

The KVM extended ID map is special, since we're creating a level above
CONFIG_PGTABLE_LEVELS and the required function isn't available. Leave
it as is and add a comment to explain it. (The normal kernel ID map code
doesn't need this change because its page tables are created in assembly
(__create_page_tables)).

Tested-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Bob Picco <bob.picco@oracle.com>
Reviewed-by: Bob Picco <bob.picco@oracle.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/kvm_mmu.h |  5 +++++
 arch/arm64/include/asm/pgtable.h |  1 +
 arch/arm64/kernel/hibernate.c    |  3 +--
 arch/arm64/mm/mmu.c              | 14 +++++++++-----
 4 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 747bfff92948..9810ebf949b3 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -276,6 +276,11 @@ static inline bool __kvm_cpu_uses_extended_idmap(void)
 	return __cpu_uses_extended_idmap();
 }
 
+/*
+ * Can't use pgd_populate here, because the extended idmap adds an extra level
+ * above CONFIG_PGTABLE_LEVELS (which is 2 or 3 if we're using the extended
+ * idmap), and pgd_populate is only available if CONFIG_PGTABLE_LEVELS = 4.
+ */
 static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
 				       pgd_t *hyp_pgd,
 				       pgd_t *merged_hyp_pgd,
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 93677b9db947..5d9554fb2692 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -355,6 +355,7 @@ static inline int pmd_protnone(pmd_t pmd)
 
 #define pud_write(pud)		pte_write(pud_pte(pud))
 #define pud_pfn(pud)		(((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT)
+#define pfn_pud(pfn,prot)	(__pud(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)))
 
 #define set_pmd_at(mm, addr, pmdp, pmd)	set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd))
 
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index efbf6dbd93c8..f20cf7e99249 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -247,8 +247,7 @@ static int create_safe_exec_page(void *src_start, size_t length,
 	}
 
 	pte = pte_offset_kernel(pmd, dst_addr);
-	set_pte(pte, __pte(virt_to_phys((void *)dst) |
-			 pgprot_val(PAGE_KERNEL_EXEC)));
+	set_pte(pte, pfn_pte(virt_to_pfn(dst), PAGE_KERNEL_EXEC));
 
 	/*
 	 * Load our new page tables. A strict BBM approach requires that we
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 267d2b79d52d..0c631a17ae1d 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -570,8 +570,8 @@ static void __init map_kernel(pgd_t *pgd)
 		 * entry instead.
 		 */
 		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
-		set_pud(pud_set_fixmap_offset(pgd, FIXADDR_START),
-			__pud(__pa_symbol(bm_pmd) | PUD_TYPE_TABLE));
+		pud_populate(&init_mm, pud_set_fixmap_offset(pgd, FIXADDR_START),
+			     lm_alias(bm_pmd));
 		pud_clear_fixmap();
 	} else {
 		BUG();
@@ -686,7 +686,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
 			if (!p)
 				return -ENOMEM;
 
-			set_pmd(pmd, __pmd(__pa(p) | PROT_SECT_NORMAL));
+			pmd_set_huge(pmd, __pa(p), __pgprot(PROT_SECT_NORMAL));
 		} else
 			vmemmap_verify((pte_t *)pmd, node, addr, next);
 	} while (addr = next, addr != end);
@@ -879,15 +879,19 @@ int __init arch_ioremap_pmd_supported(void)
 
 int pud_set_huge(pud_t *pud, phys_addr_t phys, pgprot_t prot)
 {
+	pgprot_t sect_prot = __pgprot(PUD_TYPE_SECT |
+					pgprot_val(mk_sect_prot(prot)));
 	BUG_ON(phys & ~PUD_MASK);
-	set_pud(pud, __pud(phys | PUD_TYPE_SECT | pgprot_val(mk_sect_prot(prot))));
+	set_pud(pud, pfn_pud(__phys_to_pfn(phys), sect_prot));
 	return 1;
 }
 
 int pmd_set_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot)
 {
+	pgprot_t sect_prot = __pgprot(PMD_TYPE_SECT |
+					pgprot_val(mk_sect_prot(prot)));
 	BUG_ON(phys & ~PMD_MASK);
-	set_pmd(pmd, __pmd(phys | PMD_TYPE_SECT | pgprot_val(mk_sect_prot(prot))));
+	set_pmd(pmd, pfn_pmd(__phys_to_pfn(phys), sect_prot));
 	return 1;
 }
 

From 75387b92635e7dca410c1ef92cfe510019248f76 Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Wed, 13 Dec 2017 17:07:21 +0000
Subject: [PATCH 026/104] arm64: handle 52-bit physical addresses in page table
 entries

The top 4 bits of a 52-bit physical address are positioned at bits
12..15 of a page table entry. Introduce macros to convert between a
physical address and its placement in a table entry, and change all
macros/functions that access PTEs to use them.

Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Tested-by: Bob Picco <bob.picco@oracle.com>
Reviewed-by: Bob Picco <bob.picco@oracle.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
[catalin.marinas@arm.com: some long lines wrapped]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/kvm_mmu.h       |  7 ++--
 arch/arm64/include/asm/pgalloc.h       |  6 ++--
 arch/arm64/include/asm/pgtable-hwdef.h |  6 ++--
 arch/arm64/include/asm/pgtable.h       | 50 +++++++++++++++++++-------
 arch/arm64/kernel/head.S               |  2 +-
 5 files changed, 51 insertions(+), 20 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 9810ebf949b3..b3f7b68b042d 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -287,6 +287,7 @@ static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
 				       unsigned long hyp_idmap_start)
 {
 	int idmap_idx;
+	u64 pgd_addr;
 
 	/*
 	 * Use the first entry to access the HYP mappings. It is
@@ -294,7 +295,8 @@ static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
 	 * extended idmap.
 	 */
 	VM_BUG_ON(pgd_val(merged_hyp_pgd[0]));
-	merged_hyp_pgd[0] = __pgd(__pa(hyp_pgd) | PMD_TYPE_TABLE);
+	pgd_addr = __phys_to_pgd_val(__pa(hyp_pgd));
+	merged_hyp_pgd[0] = __pgd(pgd_addr | PMD_TYPE_TABLE);
 
 	/*
 	 * Create another extended level entry that points to the boot HYP map,
@@ -304,7 +306,8 @@ static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
 	 */
 	idmap_idx = hyp_idmap_start >> VA_BITS;
 	VM_BUG_ON(pgd_val(merged_hyp_pgd[idmap_idx]));
-	merged_hyp_pgd[idmap_idx] = __pgd(__pa(boot_hyp_pgd) | PMD_TYPE_TABLE);
+	pgd_addr = __phys_to_pgd_val(__pa(boot_hyp_pgd));
+	merged_hyp_pgd[idmap_idx] = __pgd(pgd_addr | PMD_TYPE_TABLE);
 }
 
 static inline unsigned int kvm_get_vmid_bits(void)
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 5ca6a573a701..e9d9f1b006ef 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -44,7 +44,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 
 static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot)
 {
-	set_pud(pud, __pud(pmd | prot));
+	set_pud(pud, __pud(__phys_to_pud_val(pmd) | prot));
 }
 
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
@@ -73,7 +73,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 
 static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot)
 {
-	set_pgd(pgdp, __pgd(pud | prot));
+	set_pgd(pgdp, __pgd(__phys_to_pgd_val(pud) | prot));
 }
 
 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
@@ -129,7 +129,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
 static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte,
 				  pmdval_t prot)
 {
-	set_pmd(pmdp, __pmd(pte | prot));
+	set_pmd(pmdp, __pmd(__phys_to_pmd_val(pte) | prot));
 }
 
 /*
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 5513ccd687f4..85069f37ae37 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -168,10 +168,12 @@
 #define PTE_UXN			(_AT(pteval_t, 1) << 54)	/* User XN */
 #define PTE_HYP_XN		(_AT(pteval_t, 1) << 54)	/* HYP XN */
 
-#ifdef CONFIG_ARM64_PA_BITS_52
 #define PTE_ADDR_LOW		(((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
+#ifdef CONFIG_ARM64_PA_BITS_52
 #define PTE_ADDR_HIGH		(_AT(pteval_t, 0xf) << 12)
-#define PTE_ADDR_MASK_52	(PTE_ADDR_LOW | PTE_ADDR_HIGH)
+#define PTE_ADDR_MASK		(PTE_ADDR_LOW | PTE_ADDR_HIGH)
+#else
+#define PTE_ADDR_MASK		PTE_ADDR_LOW
 #endif
 
 /*
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 5d9554fb2692..4fd8af303e2c 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -57,9 +57,22 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 
 #define pte_ERROR(pte)		__pte_error(__FILE__, __LINE__, pte_val(pte))
 
-#define pte_pfn(pte)		((pte_val(pte) & PHYS_MASK) >> PAGE_SHIFT)
+/*
+ * Macros to convert between a physical address and its placement in a
+ * page table entry, taking care of 52-bit addresses.
+ */
+#ifdef CONFIG_ARM64_PA_BITS_52
+#define __pte_to_phys(pte)	\
+	((pte_val(pte) & PTE_ADDR_LOW) | ((pte_val(pte) & PTE_ADDR_HIGH) << 36))
+#define __phys_to_pte_val(phys)	(((phys) | ((phys) >> 36)) & PTE_ADDR_MASK)
+#else
+#define __pte_to_phys(pte)	(pte_val(pte) & PTE_ADDR_MASK)
+#define __phys_to_pte_val(phys)	(phys)
+#endif
 
-#define pfn_pte(pfn,prot)	(__pte(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)))
+#define pte_pfn(pte)		(__pte_to_phys(pte) >> PAGE_SHIFT)
+#define pfn_pte(pfn,prot)	\
+	__pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
 
 #define pte_none(pte)		(!pte_val(pte))
 #define pte_clear(mm,addr,ptep)	set_pte(ptep, __pte(0))
@@ -284,6 +297,11 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b)
 
 #define __HAVE_ARCH_PTE_SPECIAL
 
+static inline pte_t pgd_pte(pgd_t pgd)
+{
+	return __pte(pgd_val(pgd));
+}
+
 static inline pte_t pud_pte(pud_t pud)
 {
 	return __pte(pud_val(pud));
@@ -349,16 +367,24 @@ static inline int pmd_protnone(pmd_t pmd)
 
 #define pmd_mkhuge(pmd)		(__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
 
-#define pmd_pfn(pmd)		(((pmd_val(pmd) & PMD_MASK) & PHYS_MASK) >> PAGE_SHIFT)
-#define pfn_pmd(pfn,prot)	(__pmd(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)))
+#define __pmd_to_phys(pmd)	__pte_to_phys(pmd_pte(pmd))
+#define __phys_to_pmd_val(phys)	__phys_to_pte_val(phys)
+#define pmd_pfn(pmd)		((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT)
+#define pfn_pmd(pfn,prot)	__pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
 #define mk_pmd(page,prot)	pfn_pmd(page_to_pfn(page),prot)
 
 #define pud_write(pud)		pte_write(pud_pte(pud))
-#define pud_pfn(pud)		(((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT)
-#define pfn_pud(pfn,prot)	(__pud(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)))
+
+#define __pud_to_phys(pud)	__pte_to_phys(pud_pte(pud))
+#define __phys_to_pud_val(phys)	__phys_to_pte_val(phys)
+#define pud_pfn(pud)		((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT)
+#define pfn_pud(pfn,prot)	__pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
 
 #define set_pmd_at(mm, addr, pmdp, pmd)	set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd))
 
+#define __pgd_to_phys(pgd)	__pte_to_phys(pgd_pte(pgd))
+#define __phys_to_pgd_val(phys)	__phys_to_pte_val(phys)
+
 #define __pgprot_modify(prot,mask,bits) \
 	__pgprot((pgprot_val(prot) & ~(mask)) | (bits))
 
@@ -409,7 +435,7 @@ static inline void pmd_clear(pmd_t *pmdp)
 
 static inline phys_addr_t pmd_page_paddr(pmd_t pmd)
 {
-	return pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK;
+	return __pmd_to_phys(pmd);
 }
 
 /* Find an entry in the third-level page table. */
@@ -427,7 +453,7 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd)
 #define pte_set_fixmap_offset(pmd, addr)	pte_set_fixmap(pte_offset_phys(pmd, addr))
 #define pte_clear_fixmap()		clear_fixmap(FIX_PTE)
 
-#define pmd_page(pmd)		pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK))
+#define pmd_page(pmd)		pfn_to_page(__phys_to_pfn(__pmd_to_phys(pmd)))
 
 /* use ONLY for statically allocated translation tables */
 #define pte_offset_kimg(dir,addr)	((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr))))
@@ -460,7 +486,7 @@ static inline void pud_clear(pud_t *pudp)
 
 static inline phys_addr_t pud_page_paddr(pud_t pud)
 {
-	return pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK;
+	return __pud_to_phys(pud);
 }
 
 /* Find an entry in the second-level page table. */
@@ -473,7 +499,7 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
 #define pmd_set_fixmap_offset(pud, addr)	pmd_set_fixmap(pmd_offset_phys(pud, addr))
 #define pmd_clear_fixmap()		clear_fixmap(FIX_PMD)
 
-#define pud_page(pud)		pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK))
+#define pud_page(pud)		pfn_to_page(__phys_to_pfn(__pud_to_phys(pud)))
 
 /* use ONLY for statically allocated translation tables */
 #define pmd_offset_kimg(dir,addr)	((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr))))
@@ -512,7 +538,7 @@ static inline void pgd_clear(pgd_t *pgdp)
 
 static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
 {
-	return pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK;
+	return __pgd_to_phys(pgd);
 }
 
 /* Find an entry in the frst-level page table. */
@@ -525,7 +551,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
 #define pud_set_fixmap_offset(pgd, addr)	pud_set_fixmap(pud_offset_phys(pgd, addr))
 #define pud_clear_fixmap()		clear_fixmap(FIX_PUD)
 
-#define pgd_page(pgd)		pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK))
+#define pgd_page(pgd)		pfn_to_page(__phys_to_pfn(__pgd_to_phys(pgd)))
 
 /* use ONLY for statically allocated translation tables */
 #define pud_offset_kimg(dir,addr)	((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr))))
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index bb06223691ba..eeec0001e204 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -161,7 +161,7 @@ ENDPROC(preserve_boot_args)
 	 * supporting this configuration with 64K pages.
 	 */
 	orr	\pte, \phys, \phys, lsr #36
-	and	\pte, \pte, #PTE_ADDR_MASK_52
+	and	\pte, \pte, #PTE_ADDR_MASK
 #else
 	mov	\pte, \phys
 #endif

From fa2a8445b1d3810c52f2a6b3a006456bd1aacb7e Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Wed, 13 Dec 2017 17:07:24 +0000
Subject: [PATCH 027/104] arm64: allow ID map to be extended to 52 bits

Currently, when using VA_BITS < 48, if the ID map text happens to be
placed in physical memory above VA_BITS, we increase the VA size (up to
48) and create a new table level, in order to map in the ID map text.
This is okay because the system always supports 48 bits of VA.

This patch extends the code such that if the system supports 52 bits of
VA, and the ID map text is placed that high up, then we increase the VA
size accordingly, up to 52.

One difference from the current implementation is that so far the
condition of VA_BITS < 48 has meant that the top level table is always
"full", with the maximum number of entries, and an extra table level is
always needed. Now, when VA_BITS = 48 (and using 64k pages), the top
level table is not full, and we simply need to increase the number of
entries in it, instead of creating a new table level.

Tested-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Bob Picco <bob.picco@oracle.com>
Reviewed-by: Bob Picco <bob.picco@oracle.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
[catalin.marinas@arm.com: reduce arguments to __create_hyp_mappings()]
[catalin.marinas@arm.com: reworked/renamed __cpu_uses_extended_idmap_level()]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm/include/asm/kvm_mmu.h       |  5 ++
 arch/arm64/include/asm/assembler.h   |  2 -
 arch/arm64/include/asm/kvm_mmu.h     |  7 ++-
 arch/arm64/include/asm/mmu_context.h | 10 ++++
 arch/arm64/kernel/head.S             | 80 ++++++++++++++++------------
 arch/arm64/kvm/hyp-init.S            | 17 +++---
 arch/arm64/mm/mmu.c                  |  1 +
 virt/kvm/arm/mmu.c                   | 10 +++-
 8 files changed, 85 insertions(+), 47 deletions(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 8dbec683638b..8c5643e2eea4 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -211,6 +211,11 @@ static inline bool __kvm_cpu_uses_extended_idmap(void)
 	return false;
 }
 
+static inline unsigned long __kvm_idmap_ptrs_per_pgd(void)
+{
+	return PTRS_PER_PGD;
+}
+
 static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
 				       pgd_t *hyp_pgd,
 				       pgd_t *merged_hyp_pgd,
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 49ea3def4bd1..942fdb5ef0ad 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -344,10 +344,8 @@ alternative_endif
  * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map
  */
 	.macro	tcr_set_idmap_t0sz, valreg, tmpreg
-#ifndef CONFIG_ARM64_VA_BITS_48
 	ldr_l	\tmpreg, idmap_t0sz
 	bfi	\valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH
-#endif
 	.endm
 
 /*
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index b3f7b68b042d..b33bdb5eeb3d 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -273,7 +273,12 @@ void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
 
 static inline bool __kvm_cpu_uses_extended_idmap(void)
 {
-	return __cpu_uses_extended_idmap();
+	return __cpu_uses_extended_idmap_level();
+}
+
+static inline unsigned long __kvm_idmap_ptrs_per_pgd(void)
+{
+	return idmap_ptrs_per_pgd;
 }
 
 /*
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index accc2ff32a0e..88451d47036b 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -63,6 +63,7 @@ static inline void cpu_set_reserved_ttbr0(void)
  * physical memory, in which case it will be smaller.
  */
 extern u64 idmap_t0sz;
+extern u64 idmap_ptrs_per_pgd;
 
 static inline bool __cpu_uses_extended_idmap(void)
 {
@@ -70,6 +71,15 @@ static inline bool __cpu_uses_extended_idmap(void)
 		unlikely(idmap_t0sz != TCR_T0SZ(VA_BITS)));
 }
 
+/*
+ * True if the extended ID map requires an extra level of translation table
+ * to be configured.
+ */
+static inline bool __cpu_uses_extended_idmap_level(void)
+{
+	return ARM64_HW_PGTABLE_LEVELS((64 - idmap_t0sz)) > CONFIG_PGTABLE_LEVELS;
+}
+
 /*
  * Set TCR.T0SZ to its default value (based on VA_BITS)
  */
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index eeec0001e204..66f01869e97c 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -176,7 +176,7 @@ ENDPROC(preserve_boot_args)
  *	ptrs:	#imm pointers per table page
  *
  * Preserves:	virt
- * Corrupts:	tmp1, tmp2
+ * Corrupts:	ptrs, tmp1, tmp2
  * Returns:	tbl -> next level table page address
  */
 	.macro	create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2
@@ -184,7 +184,8 @@ ENDPROC(preserve_boot_args)
 	phys_to_pte \tmp1, \tmp2
 	orr	\tmp2, \tmp2, #PMD_TYPE_TABLE	// address of next table and entry type
 	lsr	\tmp1, \virt, #\shift
-	and	\tmp1, \tmp1, #\ptrs - 1	// table index
+	sub	\ptrs, \ptrs, #1
+	and	\tmp1, \tmp1, \ptrs		// table index
 	str	\tmp2, [\tbl, \tmp1, lsl #3]
 	add	\tbl, \tbl, #PAGE_SIZE		// next level table page
 	.endm
@@ -194,15 +195,17 @@ ENDPROC(preserve_boot_args)
  * block entry in the next level (tbl) for the given virtual address.
  *
  * Preserves:	tbl, next, virt
- * Corrupts:	tmp1, tmp2
+ * Corrupts:	ptrs_per_pgd, tmp1, tmp2
  */
-	.macro	create_pgd_entry, tbl, virt, tmp1, tmp2
-	create_table_entry \tbl, \virt, PGDIR_SHIFT, PTRS_PER_PGD, \tmp1, \tmp2
+	.macro	create_pgd_entry, tbl, virt, ptrs_per_pgd, tmp1, tmp2
+	create_table_entry \tbl, \virt, PGDIR_SHIFT, \ptrs_per_pgd, \tmp1, \tmp2
 #if SWAPPER_PGTABLE_LEVELS > 3
-	create_table_entry \tbl, \virt, PUD_SHIFT, PTRS_PER_PUD, \tmp1, \tmp2
+	mov	\ptrs_per_pgd, PTRS_PER_PUD
+	create_table_entry \tbl, \virt, PUD_SHIFT, \ptrs_per_pgd, \tmp1, \tmp2
 #endif
 #if SWAPPER_PGTABLE_LEVELS > 2
-	create_table_entry \tbl, \virt, SWAPPER_TABLE_SHIFT, PTRS_PER_PTE, \tmp1, \tmp2
+	mov	\ptrs_per_pgd, PTRS_PER_PTE
+	create_table_entry \tbl, \virt, SWAPPER_TABLE_SHIFT, \ptrs_per_pgd, \tmp1, \tmp2
 #endif
 	.endm
 
@@ -266,16 +269,34 @@ __create_page_tables:
 	adrp	x0, idmap_pg_dir
 	adrp	x3, __idmap_text_start		// __pa(__idmap_text_start)
 
-#ifndef CONFIG_ARM64_VA_BITS_48
+	/*
+	 * VA_BITS may be too small to allow for an ID mapping to be created
+	 * that covers system RAM if that is located sufficiently high in the
+	 * physical address space. So for the ID map, use an extended virtual
+	 * range in that case, and configure an additional translation level
+	 * if needed.
+	 *
+	 * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
+	 * entire ID map region can be mapped. As T0SZ == (64 - #bits used),
+	 * this number conveniently equals the number of leading zeroes in
+	 * the physical address of __idmap_text_end.
+	 */
+	adrp	x5, __idmap_text_end
+	clz	x5, x5
+	cmp	x5, TCR_T0SZ(VA_BITS)	// default T0SZ small enough?
+	b.ge	1f			// .. then skip VA range extension
+
+	adr_l	x6, idmap_t0sz
+	str	x5, [x6]
+	dmb	sy
+	dc	ivac, x6		// Invalidate potentially stale cache line
+
+#if (VA_BITS < 48)
 #define EXTRA_SHIFT	(PGDIR_SHIFT + PAGE_SHIFT - 3)
-#define EXTRA_PTRS	(1 << (48 - EXTRA_SHIFT))
+#define EXTRA_PTRS	(1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT))
 
 	/*
-	 * If VA_BITS < 48, it may be too small to allow for an ID mapping to be
-	 * created that covers system RAM if that is located sufficiently high
-	 * in the physical address space. So for the ID map, use an extended
-	 * virtual range in that case, by configuring an additional translation
-	 * level.
+	 * If VA_BITS < 48, we have to configure an additional table level.
 	 * First, we have to verify our assumption that the current value of
 	 * VA_BITS was chosen such that all translation levels are fully
 	 * utilised, and that lowering T0SZ will always result in an additional
@@ -285,27 +306,19 @@ __create_page_tables:
 #error "Mismatch between VA_BITS and page size/number of translation levels"
 #endif
 
+	mov	x4, EXTRA_PTRS
+	create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6
+#else
 	/*
-	 * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
-	 * entire ID map region can be mapped. As T0SZ == (64 - #bits used),
-	 * this number conveniently equals the number of leading zeroes in
-	 * the physical address of __idmap_text_end.
+	 * If VA_BITS == 48, we don't have to configure an additional
+	 * translation level, but the top-level table has more entries.
 	 */
-	adrp	x5, __idmap_text_end
-	clz	x5, x5
-	cmp	x5, TCR_T0SZ(VA_BITS)	// default T0SZ small enough?
-	b.ge	1f			// .. then skip additional level
-
-	adr_l	x6, idmap_t0sz
-	str	x5, [x6]
-	dmb	sy
-	dc	ivac, x6		// Invalidate potentially stale cache line
-
-	create_table_entry x0, x3, EXTRA_SHIFT, EXTRA_PTRS, x5, x6
-1:
+	mov	x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT)
+	str_l	x4, idmap_ptrs_per_pgd, x5
 #endif
-
-	create_pgd_entry x0, x3, x5, x6
+1:
+	ldr_l	x4, idmap_ptrs_per_pgd
+	create_pgd_entry x0, x3, x4, x5, x6
 	mov	x5, x3				// __pa(__idmap_text_start)
 	adr_l	x6, __idmap_text_end		// __pa(__idmap_text_end)
 	create_block_map x0, x7, x3, x5, x6, x4
@@ -316,7 +329,8 @@ __create_page_tables:
 	adrp	x0, swapper_pg_dir
 	mov_q	x5, KIMAGE_VADDR + TEXT_OFFSET	// compile time __va(_text)
 	add	x5, x5, x23			// add KASLR displacement
-	create_pgd_entry x0, x5, x3, x6
+	mov	x4, PTRS_PER_PGD
+	create_pgd_entry x0, x5, x4, x3, x6
 	adrp	x6, _end			// runtime __pa(_end)
 	adrp	x3, _text			// runtime __pa(_text)
 	sub	x6, x6, x3			// _end - _text
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index f9681cc00973..33c40b3eea01 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -72,24 +72,23 @@ __do_hyp_init:
 	mov	x5, #TCR_EL2_RES1
 	orr	x4, x4, x5
 
-#ifndef CONFIG_ARM64_VA_BITS_48
 	/*
-	 * If we are running with VA_BITS < 48, we may be running with an extra
-	 * level of translation in the ID map. This is only the case if system
-	 * RAM is out of range for the currently configured page size and number
-	 * of translation levels, in which case we will also need the extra
-	 * level for the HYP ID map, or we won't be able to enable the EL2 MMU.
+	 * The ID map may be configured to use an extended virtual address
+	 * range. This is only the case if system RAM is out of range for the
+	 * currently configured page size and VA_BITS, in which case we will
+	 * also need the extended virtual range for the HYP ID map, or we won't
+	 * be able to enable the EL2 MMU.
 	 *
 	 * However, at EL2, there is only one TTBR register, and we can't switch
 	 * between translation tables *and* update TCR_EL2.T0SZ at the same
-	 * time. Bottom line: we need the extra level in *both* our translation
-	 * tables.
+	 * time. Bottom line: we need to use the extended range with *both* our
+	 * translation tables.
 	 *
 	 * So use the same T0SZ value we use for the ID map.
 	 */
 	ldr_l	x5, idmap_t0sz
 	bfi	x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
-#endif
+
 	/*
 	 * Set the PS bits in TCR_EL2.
 	 */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 0c631a17ae1d..baa34418c3bf 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -50,6 +50,7 @@
 #define NO_CONT_MAPPINGS	BIT(1)
 
 u64 idmap_t0sz = TCR_T0SZ(VA_BITS);
+u64 idmap_ptrs_per_pgd = PTRS_PER_PGD;
 
 u64 kimage_voffset __ro_after_init;
 EXPORT_SYMBOL(kimage_voffset);
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index b36945d49986..761787befd3b 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -629,14 +629,20 @@ static int __create_hyp_mappings(pgd_t *pgdp,
 {
 	pgd_t *pgd;
 	pud_t *pud;
-	unsigned long addr, next;
+	unsigned long addr, next, ptrs_per_pgd = PTRS_PER_PGD;
 	int err = 0;
 
+	/*
+	 * If it's not the hyp_pgd, fall back to the kvm idmap layout.
+	 */
+	if (pgdp != hyp_pgd)
+		ptrs_per_pgd = __kvm_idmap_ptrs_per_pgd();
+
 	mutex_lock(&kvm_hyp_pgd_mutex);
 	addr = start & PAGE_MASK;
 	end = PAGE_ALIGN(end);
 	do {
-		pgd = pgdp + pgd_index(addr);
+		pgd = pgdp + ((addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1));
 
 		if (pgd_none(*pgd)) {
 			pud = pud_alloc_one(NULL, addr);

From f77d281713d4188973bb34ecb10e51ae39ce6946 Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Wed, 13 Dec 2017 17:07:25 +0000
Subject: [PATCH 028/104] arm64: enable 52-bit physical address support

Now that 52-bit physical address support is in place, add the kconfig
symbol to enable it. As described in ARMv8.2, the larger addresses are
only supported with the 64k granule. Also ensure that PAN is configured
(or TTBR0 PAN is not), as explained in an earlier patch in this series.

Tested-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Bob Picco <bob.picco@oracle.com>
Reviewed-by: Bob Picco <bob.picco@oracle.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/Kconfig | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 8dc937823eeb..337a696c9b02 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -656,11 +656,24 @@ choice
 config ARM64_PA_BITS_48
 	bool "48-bit"
 
+config ARM64_PA_BITS_52
+	bool "52-bit (ARMv8.2)"
+	depends on ARM64_64K_PAGES
+	depends on ARM64_PAN || !ARM64_SW_TTBR0_PAN
+	help
+	  Enable support for a 52-bit physical address space, introduced as
+	  part of the ARMv8.2-LPA extension.
+
+	  With this enabled, the kernel will also continue to work on CPUs that
+	  do not support ARMv8.2-LPA, but with some added memory overhead (and
+	  minor performance overhead).
+
 endchoice
 
 config ARM64_PA_BITS
 	int
 	default 48 if ARM64_PA_BITS_48
+	default 52 if ARM64_PA_BITS_52
 
 config CPU_BIG_ENDIAN
        bool "Build big-endian kernel"

From db50a74d8193944dd1ee488fd2a813a364fbbaa7 Mon Sep 17 00:00:00 2001
From: Prashanth Prakash <pprakash@codeaurora.org>
Date: Wed, 15 Nov 2017 10:11:49 -0700
Subject: [PATCH 029/104] cpuidle: Add new macro to enter a retention idle
 state

If a CPU is entering a low power idle state where it doesn't lose any
context, then there is no need to call cpu_pm_enter()/cpu_pm_exit().
Add a new macro(CPU_PM_CPU_IDLE_ENTER_RETENTION) to be used by cpuidle
drivers when they are entering retention state. By not calling
cpu_pm_enter and cpu_pm_exit we reduce the latency involved in
entering and exiting the retention idle states.

CPU_PM_CPU_IDLE_ENTER_RETENTION assumes that no state is lost and
hence CPU PM notifiers will not be called. We may need a broader
change if we need to support partial retention states effeciently.

On ARM64 based Qualcomm Server Platform we measured below overhead for
for calling cpu_pm_enter and cpu_pm_exit for retention states.

workload: stress --hdd #CPUs --hdd-bytes 32M  -t 30
        Average overhead of cpu_pm_enter - 1.2us
        Average overhead of cpu_pm_exit  - 3.1us

Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Prashanth Prakash <pprakash@codeaurora.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/cpuidle.h | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 8f7788d23b57..871f9e21810c 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -257,22 +257,30 @@ static inline int cpuidle_register_governor(struct cpuidle_governor *gov)
 {return 0;}
 #endif
 
-#define CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx)	\
-({								\
-	int __ret;						\
-								\
-	if (!idx) {						\
-		cpu_do_idle();					\
-		return idx;					\
-	}							\
-								\
-	__ret = cpu_pm_enter();					\
-	if (!__ret) {						\
-		__ret = low_level_idle_enter(idx);		\
-		cpu_pm_exit();					\
-	}							\
-								\
-	__ret ? -1 : idx;					\
+#define __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, is_retention) \
+({									\
+	int __ret = 0;							\
+									\
+	if (!idx) {							\
+		cpu_do_idle();						\
+		return idx;						\
+	}								\
+									\
+	if (!is_retention)						\
+		__ret =  cpu_pm_enter();				\
+	if (!__ret) {							\
+		__ret = low_level_idle_enter(idx);			\
+		if (!is_retention)					\
+			cpu_pm_exit();					\
+	}								\
+									\
+	__ret ? -1 : idx;						\
 })
 
+#define CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx)	\
+	__CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, 0)
+
+#define CPU_PM_CPU_IDLE_ENTER_RETENTION(low_level_idle_enter, idx)	\
+	__CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, 1)
+
 #endif /* _LINUX_CPUIDLE_H */

From 8b9951ed7e5b517bdd5743d71ac662885d3c7bfc Mon Sep 17 00:00:00 2001
From: Prashanth Prakash <pprakash@codeaurora.org>
Date: Wed, 15 Nov 2017 10:11:50 -0700
Subject: [PATCH 030/104] ARM64 / cpuidle: Use new cpuidle macro for entering
 retention state

CPU_PM_CPU_IDLE_ENTER_RETENTION skips calling cpu_pm_enter() and
cpu_pm_exit(). By not calling cpu_pm functions in idle entry/exit
paths we can reduce the latency involved in entering and exiting
the low power idle state.

On ARM64 based Qualcomm server platform we measured below overhead
for calling cpu_pm_enter and cpu_pm_exit for retention states.

workload: stress --hdd #CPUs --hdd-bytes 32M  -t 30
	Average overhead of cpu_pm_enter - 1.2us
	Average overhead of cpu_pm_exit  - 3.1us

Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Prashanth Prakash <pprakash@codeaurora.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/cpuidle.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c
index fd691087dc9a..f2d13810daa8 100644
--- a/arch/arm64/kernel/cpuidle.c
+++ b/arch/arm64/kernel/cpuidle.c
@@ -47,6 +47,8 @@ int arm_cpuidle_suspend(int index)
 
 #include <acpi/processor.h>
 
+#define ARM64_LPI_IS_RETENTION_STATE(arch_flags) (!(arch_flags))
+
 int acpi_processor_ffh_lpi_probe(unsigned int cpu)
 {
 	return arm_cpuidle_init(cpu);
@@ -54,6 +56,10 @@ int acpi_processor_ffh_lpi_probe(unsigned int cpu)
 
 int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi)
 {
-	return CPU_PM_CPU_IDLE_ENTER(arm_cpuidle_suspend, lpi->index);
+	if (ARM64_LPI_IS_RETENTION_STATE(lpi->arch_flags))
+		return CPU_PM_CPU_IDLE_ENTER_RETENTION(arm_cpuidle_suspend,
+						lpi->index);
+	else
+		return CPU_PM_CPU_IDLE_ENTER(arm_cpuidle_suspend, lpi->index);
 }
 #endif

From f5ed22e2647cc89dd0a57936b38a582546527509 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 7 Nov 2017 11:24:04 +0900
Subject: [PATCH 031/104] arm64: make label allocation style consistent in
 tishift

This is entirely cosmetic, but somehow it was missed when sending
differing versions of this patch. This just makes the file a bit more
uniform.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/lib/tishift.S | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/lib/tishift.S b/arch/arm64/lib/tishift.S
index 0179a43cc045..d3db9b2cd479 100644
--- a/arch/arm64/lib/tishift.S
+++ b/arch/arm64/lib/tishift.S
@@ -38,19 +38,19 @@ ENTRY(__ashlti3)
 ENDPROC(__ashlti3)
 
 ENTRY(__ashrti3)
-	cbz	x2, 3f
+	cbz	x2, 1f
 	mov	x3, #64
 	sub	x3, x3, x2
 	cmp	x3, #0
-	b.le	4f
+	b.le	2f
 	lsr	x0, x0, x2
 	lsl	x3, x1, x3
 	asr	x2, x1, x2
 	orr	x0, x0, x3
 	mov	x1, x2
-3:
+1:
 	ret
-4:
+2:
 	neg	w0, w3
 	asr	x2, x1, #63
 	asr	x0, x1, x0

From 82975c46da8275a2a212cd94049bbef9bb961da2 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Tue, 2 Jan 2018 11:25:26 +0000
Subject: [PATCH 032/104] perf: Export perf_event_update_userpage

Export perf_event_update_userpage() so that PMU driver using them,
can be built as modules.

Acked-by: Peter Zilstra <peterz@infradead.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 kernel/events/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4df5b695bf0d..64ec2c9d2c44 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4904,6 +4904,7 @@ void perf_event_update_userpage(struct perf_event *event)
 unlock:
 	rcu_read_unlock();
 }
+EXPORT_SYMBOL_GPL(perf_event_update_userpage);
 
 static int perf_mmap_fault(struct vm_fault *vmf)
 {

From a0e71cd9b1a0195f96500b2268759873ff8ff819 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Tue, 2 Jan 2018 11:25:27 +0000
Subject: [PATCH 033/104] of: Add helper for mapping device node to logical CPU
 number

Add a helper to map a device node to a logical CPU number to avoid
duplication. Currently this is open coded in different places (e.g
gic-v3, coresight). The helper tries to map device node to a "possible"
logical CPU id, which may not be online yet. It is the responsibility
of the user to make sure that the CPU is online. The helper uses
of_cpu_device_node_get() to retrieve the device node for a given CPU
(which uses per_cpu data if available else falls back to slower
of_get_cpu_node()).

Cc: devicetree@vger.kernel.org
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/of/base.c  | 26 ++++++++++++++++++++++++++
 include/linux/of.h |  7 +++++++
 2 files changed, 33 insertions(+)

diff --git a/drivers/of/base.c b/drivers/of/base.c
index 26618ba8f92a..a9d6fe86585b 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -315,6 +315,32 @@ struct device_node *of_get_cpu_node(int cpu, unsigned int *thread)
 }
 EXPORT_SYMBOL(of_get_cpu_node);
 
+/**
+ * of_cpu_node_to_id: Get the logical CPU number for a given device_node
+ *
+ * @cpu_node: Pointer to the device_node for CPU.
+ *
+ * Returns the logical CPU number of the given CPU device_node.
+ * Returns -ENODEV if the CPU is not found.
+ */
+int of_cpu_node_to_id(struct device_node *cpu_node)
+{
+	int cpu;
+	bool found = false;
+	struct device_node *np;
+
+	for_each_possible_cpu(cpu) {
+		np = of_cpu_device_node_get(cpu);
+		found = (cpu_node == np);
+		of_node_put(np);
+		if (found)
+			return cpu;
+	}
+
+	return -ENODEV;
+}
+EXPORT_SYMBOL(of_cpu_node_to_id);
+
 /**
  * __of_device_is_compatible() - Check if the node matches given constraints
  * @device: pointer to node
diff --git a/include/linux/of.h b/include/linux/of.h
index d3dea1d1e3a9..173102dafb07 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -544,6 +544,8 @@ const char *of_prop_next_string(struct property *prop, const char *cur);
 
 bool of_console_check(struct device_node *dn, char *name, int index);
 
+extern int of_cpu_node_to_id(struct device_node *np);
+
 #else /* CONFIG_OF */
 
 static inline void of_core_init(void)
@@ -916,6 +918,11 @@ static inline void of_property_clear_flag(struct property *p, unsigned long flag
 {
 }
 
+static inline int of_cpu_node_to_id(struct device_node *np)
+{
+	return -ENODEV;
+}
+
 #define of_match_ptr(_ptr)	NULL
 #define of_match_node(_matches, _node)	NULL
 #endif /* CONFIG_OF */

From 29198e3844caaaacb0e21eff634c5f1cfd7f89be Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Tue, 2 Jan 2018 11:25:28 +0000
Subject: [PATCH 034/104] coresight: of: Use of_cpu_node_to_id helper

Reuse the new generic helper, of_cpu_node_to_id() to map a
given CPU phandle to a logical CPU number.

Acked-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Tested-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/hwtracing/coresight/of_coresight.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/drivers/hwtracing/coresight/of_coresight.c b/drivers/hwtracing/coresight/of_coresight.c
index a18794128bf8..7c375443ede6 100644
--- a/drivers/hwtracing/coresight/of_coresight.c
+++ b/drivers/hwtracing/coresight/of_coresight.c
@@ -104,26 +104,17 @@ static int of_coresight_alloc_memory(struct device *dev,
 int of_coresight_get_cpu(const struct device_node *node)
 {
 	int cpu;
-	bool found;
-	struct device_node *dn, *np;
+	struct device_node *dn;
 
 	dn = of_parse_phandle(node, "cpu", 0);
-
 	/* Affinity defaults to CPU0 */
 	if (!dn)
 		return 0;
-
-	for_each_possible_cpu(cpu) {
-		np = of_cpu_device_node_get(cpu);
-		found = (dn == np);
-		of_node_put(np);
-		if (found)
-			break;
-	}
+	cpu = of_cpu_node_to_id(dn);
 	of_node_put(dn);
 
 	/* Affinity to CPU0 if no cpu nodes are found */
-	return found ? cpu : 0;
+	return (cpu < 0) ? 0 : cpu;
 }
 EXPORT_SYMBOL_GPL(of_coresight_get_cpu);
 

From c08ec7da75f393c0dbaba9455f4f6b2a5e709355 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Tue, 2 Jan 2018 11:25:29 +0000
Subject: [PATCH 035/104] irqchip: gic-v3: Use of_cpu_node_to_id helper

Use the new generic helper of_cpu_node_to_id() instead
of using our own version to map a device node to logical CPU
number.

Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/irqchip/irq-gic-v3.c | 29 ++---------------------------
 1 file changed, 2 insertions(+), 27 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index b56c3e23f0af..9a7a15049903 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -1070,31 +1070,6 @@ static int __init gic_validate_dist_version(void __iomem *dist_base)
 	return 0;
 }
 
-static int get_cpu_number(struct device_node *dn)
-{
-	const __be32 *cell;
-	u64 hwid;
-	int cpu;
-
-	cell = of_get_property(dn, "reg", NULL);
-	if (!cell)
-		return -1;
-
-	hwid = of_read_number(cell, of_n_addr_cells(dn));
-
-	/*
-	 * Non affinity bits must be set to 0 in the DT
-	 */
-	if (hwid & ~MPIDR_HWID_BITMASK)
-		return -1;
-
-	for_each_possible_cpu(cpu)
-		if (cpu_logical_map(cpu) == hwid)
-			return cpu;
-
-	return -1;
-}
-
 /* Create all possible partitions at boot time */
 static void __init gic_populate_ppi_partitions(struct device_node *gic_node)
 {
@@ -1145,8 +1120,8 @@ static void __init gic_populate_ppi_partitions(struct device_node *gic_node)
 			if (WARN_ON(!cpu_node))
 				continue;
 
-			cpu = get_cpu_number(cpu_node);
-			if (WARN_ON(cpu == -1))
+			cpu = of_cpu_node_to_id(cpu_node);
+			if (WARN_ON(cpu < 0))
 				continue;
 
 			pr_cont("%pOF[%d] ", cpu_node, cpu);

From 52cac1103af3283c7e4386b796da9cc92c3320e0 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Tue, 2 Jan 2018 11:25:30 +0000
Subject: [PATCH 036/104] arm64: Use of_cpu_node_to_id helper for CPU topology
 parsing

Make use of the new generic helper to convert an of_node of a CPU
to the logical CPU id in parsing the topology.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Will Deacon <will.deacon@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/kernel/topology.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 8d48b233e6ce..21868530018e 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -37,18 +37,14 @@ static int __init get_cpu_for_node(struct device_node *node)
 	if (!cpu_node)
 		return -1;
 
-	for_each_possible_cpu(cpu) {
-		if (of_get_cpu_node(cpu, NULL) == cpu_node) {
-			topology_parse_cpu_capacity(cpu_node, cpu);
-			of_node_put(cpu_node);
-			return cpu;
-		}
-	}
-
-	pr_crit("Unable to find CPU node for %pOF\n", cpu_node);
+	cpu = of_cpu_node_to_id(cpu_node);
+	if (cpu >= 0)
+		topology_parse_cpu_capacity(cpu_node, cpu);
+	else
+		pr_crit("Unable to find CPU node for %pOF\n", cpu_node);
 
 	of_node_put(cpu_node);
-	return -1;
+	return cpu;
 }
 
 static int __init parse_core(struct device_node *core, int cluster_id,

From 66582787314ed0bd1dcac3f7a2b98ff71f3fb653 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Tue, 2 Jan 2018 11:25:31 +0000
Subject: [PATCH 037/104] arm_pmu: Use of_cpu_node_to_id helper

Use the new generic helper, of_cpu_node_to_id(), to map a
a phandle to the logical CPU number while parsing the
PMU irq affinity.

Cc: Will Deacon <will.deacon@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/perf/arm_pmu_platform.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/drivers/perf/arm_pmu_platform.c b/drivers/perf/arm_pmu_platform.c
index 91b224eced18..46501cc79fd7 100644
--- a/drivers/perf/arm_pmu_platform.c
+++ b/drivers/perf/arm_pmu_platform.c
@@ -82,19 +82,10 @@ static int pmu_parse_irq_affinity(struct device_node *node, int i)
 		return -EINVAL;
 	}
 
-	/* Now look up the logical CPU number */
-	for_each_possible_cpu(cpu) {
-		struct device_node *cpu_dn;
-
-		cpu_dn = of_cpu_device_node_get(cpu);
-		of_node_put(cpu_dn);
-
-		if (dn == cpu_dn)
-			break;
-	}
-
-	if (cpu >= nr_cpu_ids) {
+	cpu = of_cpu_node_to_id(dn);
+	if (cpu < 0) {
 		pr_warn("failed to find logical CPU for %s\n", dn->name);
+		cpu = nr_cpu_ids;
 	}
 
 	of_node_put(dn);

From 9249dee611d6624bc9044fdf3877ace67d6143ab Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Tue, 2 Jan 2018 11:25:32 +0000
Subject: [PATCH 038/104] dt-bindings: Document devicetree binding for ARM DSU
 PMU

This patch documents the devicetree bindings for ARM DSU PMU.

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: devicetree@vger.kernel.org
Cc: frowand.list@gmail.com
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 .../devicetree/bindings/arm/arm-dsu-pmu.txt   | 27 +++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/arm/arm-dsu-pmu.txt

diff --git a/Documentation/devicetree/bindings/arm/arm-dsu-pmu.txt b/Documentation/devicetree/bindings/arm/arm-dsu-pmu.txt
new file mode 100644
index 000000000000..6efabba530f1
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm-dsu-pmu.txt
@@ -0,0 +1,27 @@
+* ARM DynamIQ Shared Unit (DSU) Performance Monitor Unit (PMU)
+
+ARM DyanmIQ Shared Unit (DSU) integrates one or more CPU cores
+with a shared L3 memory system, control logic and external interfaces to
+form a multicore cluster. The PMU enables to gather various statistics on
+the operations of the DSU. The PMU provides independent 32bit counters that
+can count any of the supported events, along with a 64bit cycle counter.
+The PMU is accessed via CPU system registers and has no MMIO component.
+
+** DSU PMU required properties:
+
+- compatible	: should be one of :
+
+		"arm,dsu-pmu"
+
+- interrupts	: Exactly 1 SPI must be listed.
+
+- cpus		: List of phandles for the CPUs connected to this DSU instance.
+
+
+** Example:
+
+dsu-pmu-0 {
+	compatible = "arm,dsu-pmu";
+	interrupts = <GIC_SPI 02 IRQ_TYPE_LEVEL_HIGH>;
+	cpus = <&cpu_0>, <&cpu_1>;
+};

From 7520fa99246dade7ab6dde1573a146beed632abd Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Tue, 2 Jan 2018 11:25:33 +0000
Subject: [PATCH 039/104] perf: ARM DynamIQ Shared Unit PMU support

Add support for the Cluster PMU part of the ARM DynamIQ Shared Unit (DSU).
The DSU integrates one or more cores with an L3 memory system, control
logic, and external interfaces to form a multicore cluster. The PMU
allows counting the various events related to L3, SCU etc, along with
providing a cycle counter.

The PMU can be accessed via system registers, which are common
to the cores in the same cluster. The PMU registers follow the
semantics of the ARMv8 PMU, mostly, with the exception that
the counters record the cluster wide events.

This driver is mostly based on the ARMv8 and CCI PMU drivers.
The driver only supports ARM64 at the moment. It can be extended
to support ARM32 by providing register accessors like we do in
arch/arm64/include/arm_dsu_pmu.h.

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 Documentation/perf/arm_dsu_pmu.txt   |  28 +
 arch/arm64/include/asm/arm_dsu_pmu.h | 129 ++++
 drivers/perf/Kconfig                 |   9 +
 drivers/perf/Makefile                |   1 +
 drivers/perf/arm_dsu_pmu.c           | 843 +++++++++++++++++++++++++++
 5 files changed, 1010 insertions(+)
 create mode 100644 Documentation/perf/arm_dsu_pmu.txt
 create mode 100644 arch/arm64/include/asm/arm_dsu_pmu.h
 create mode 100644 drivers/perf/arm_dsu_pmu.c

diff --git a/Documentation/perf/arm_dsu_pmu.txt b/Documentation/perf/arm_dsu_pmu.txt
new file mode 100644
index 000000000000..d611e15f5add
--- /dev/null
+++ b/Documentation/perf/arm_dsu_pmu.txt
@@ -0,0 +1,28 @@
+ARM DynamIQ Shared Unit (DSU) PMU
+==================================
+
+ARM DynamIQ Shared Unit integrates one or more cores with an L3 memory system,
+control logic and external interfaces to form a multicore cluster. The PMU
+allows counting the various events related to the L3 cache, Snoop Control Unit
+etc, using 32bit independent counters. It also provides a 64bit cycle counter.
+
+The PMU can only be accessed via CPU system registers and are common to the
+cores connected to the same DSU. Like most of the other uncore PMUs, DSU
+PMU doesn't support process specific events and cannot be used in sampling mode.
+
+The DSU provides a bitmap for a subset of implemented events via hardware
+registers. There is no way for the driver to determine if the other events
+are available or not. Hence the driver exposes only those events advertised
+by the DSU, in "events" directory under :
+
+  /sys/bus/event_sources/devices/arm_dsu_<N>/
+
+The user should refer to the TRM of the product to figure out the supported events
+and use the raw event code for the unlisted events.
+
+The driver also exposes the CPUs connected to the DSU instance in "associated_cpus".
+
+
+e.g usage :
+
+	perf stat -a -e arm_dsu_0/cycles/
diff --git a/arch/arm64/include/asm/arm_dsu_pmu.h b/arch/arm64/include/asm/arm_dsu_pmu.h
new file mode 100644
index 000000000000..82e5cc3356bf
--- /dev/null
+++ b/arch/arm64/include/asm/arm_dsu_pmu.h
@@ -0,0 +1,129 @@
+/*
+ * ARM DynamIQ Shared Unit (DSU) PMU Low level register access routines.
+ *
+ * Copyright (C) ARM Limited, 2017.
+ *
+ * Author: Suzuki K Poulose <suzuki.poulose@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2, as published by the Free Software Foundation.
+ */
+
+#include <linux/bitops.h>
+#include <linux/build_bug.h>
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <asm/barrier.h>
+#include <asm/sysreg.h>
+
+
+#define CLUSTERPMCR_EL1			sys_reg(3, 0, 15, 5, 0)
+#define CLUSTERPMCNTENSET_EL1		sys_reg(3, 0, 15, 5, 1)
+#define CLUSTERPMCNTENCLR_EL1		sys_reg(3, 0, 15, 5, 2)
+#define CLUSTERPMOVSSET_EL1		sys_reg(3, 0, 15, 5, 3)
+#define CLUSTERPMOVSCLR_EL1		sys_reg(3, 0, 15, 5, 4)
+#define CLUSTERPMSELR_EL1		sys_reg(3, 0, 15, 5, 5)
+#define CLUSTERPMINTENSET_EL1		sys_reg(3, 0, 15, 5, 6)
+#define CLUSTERPMINTENCLR_EL1		sys_reg(3, 0, 15, 5, 7)
+#define CLUSTERPMCCNTR_EL1		sys_reg(3, 0, 15, 6, 0)
+#define CLUSTERPMXEVTYPER_EL1		sys_reg(3, 0, 15, 6, 1)
+#define CLUSTERPMXEVCNTR_EL1		sys_reg(3, 0, 15, 6, 2)
+#define CLUSTERPMMDCR_EL1		sys_reg(3, 0, 15, 6, 3)
+#define CLUSTERPMCEID0_EL1		sys_reg(3, 0, 15, 6, 4)
+#define CLUSTERPMCEID1_EL1		sys_reg(3, 0, 15, 6, 5)
+
+static inline u32 __dsu_pmu_read_pmcr(void)
+{
+	return read_sysreg_s(CLUSTERPMCR_EL1);
+}
+
+static inline void __dsu_pmu_write_pmcr(u32 val)
+{
+	write_sysreg_s(val, CLUSTERPMCR_EL1);
+	isb();
+}
+
+static inline u32 __dsu_pmu_get_reset_overflow(void)
+{
+	u32 val = read_sysreg_s(CLUSTERPMOVSCLR_EL1);
+	/* Clear the bit */
+	write_sysreg_s(val, CLUSTERPMOVSCLR_EL1);
+	isb();
+	return val;
+}
+
+static inline void __dsu_pmu_select_counter(int counter)
+{
+	write_sysreg_s(counter, CLUSTERPMSELR_EL1);
+	isb();
+}
+
+static inline u64 __dsu_pmu_read_counter(int counter)
+{
+	__dsu_pmu_select_counter(counter);
+	return read_sysreg_s(CLUSTERPMXEVCNTR_EL1);
+}
+
+static inline void __dsu_pmu_write_counter(int counter, u64 val)
+{
+	__dsu_pmu_select_counter(counter);
+	write_sysreg_s(val, CLUSTERPMXEVCNTR_EL1);
+	isb();
+}
+
+static inline void __dsu_pmu_set_event(int counter, u32 event)
+{
+	__dsu_pmu_select_counter(counter);
+	write_sysreg_s(event, CLUSTERPMXEVTYPER_EL1);
+	isb();
+}
+
+static inline u64 __dsu_pmu_read_pmccntr(void)
+{
+	return read_sysreg_s(CLUSTERPMCCNTR_EL1);
+}
+
+static inline void __dsu_pmu_write_pmccntr(u64 val)
+{
+	write_sysreg_s(val, CLUSTERPMCCNTR_EL1);
+	isb();
+}
+
+static inline void __dsu_pmu_disable_counter(int counter)
+{
+	write_sysreg_s(BIT(counter), CLUSTERPMCNTENCLR_EL1);
+	isb();
+}
+
+static inline void __dsu_pmu_enable_counter(int counter)
+{
+	write_sysreg_s(BIT(counter), CLUSTERPMCNTENSET_EL1);
+	isb();
+}
+
+static inline void __dsu_pmu_counter_interrupt_enable(int counter)
+{
+	write_sysreg_s(BIT(counter), CLUSTERPMINTENSET_EL1);
+	isb();
+}
+
+static inline void __dsu_pmu_counter_interrupt_disable(int counter)
+{
+	write_sysreg_s(BIT(counter), CLUSTERPMINTENCLR_EL1);
+	isb();
+}
+
+
+static inline u32 __dsu_pmu_read_pmceid(int n)
+{
+	switch (n) {
+	case 0:
+		return read_sysreg_s(CLUSTERPMCEID0_EL1);
+	case 1:
+		return read_sysreg_s(CLUSTERPMCEID1_EL1);
+	default:
+		BUILD_BUG();
+		return 0;
+	}
+}
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index b8f44b068fc6..da5724cd89cf 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -17,6 +17,15 @@ config ARM_PMU_ACPI
 	depends on ARM_PMU && ACPI
 	def_bool y
 
+config ARM_DSU_PMU
+	tristate "ARM DynamIQ Shared Unit (DSU) PMU"
+	depends on ARM64
+	  help
+	  Provides support for performance monitor unit in ARM DynamIQ Shared
+	  Unit (DSU). The DSU integrates one or more cores with an L3 memory
+	  system, control logic. The PMU allows counting various events related
+	  to DSU.
+
 config HISI_PMU
        bool "HiSilicon SoC PMU"
        depends on ARM64 && ACPI
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 710a0135bd61..c2f27419bdf0 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_ARM_DSU_PMU) += arm_dsu_pmu.o
 obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o
 obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o
 obj-$(CONFIG_HISI_PMU) += hisilicon/
diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c
new file mode 100644
index 000000000000..37c0526c93d5
--- /dev/null
+++ b/drivers/perf/arm_dsu_pmu.c
@@ -0,0 +1,843 @@
+/*
+ * ARM DynamIQ Shared Unit (DSU) PMU driver
+ *
+ * Copyright (C) ARM Limited, 2017.
+ *
+ * Based on ARM CCI-PMU, ARMv8 PMU-v3 drivers.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#define PMUNAME		"arm_dsu"
+#define DRVNAME		PMUNAME "_pmu"
+#define pr_fmt(fmt)	DRVNAME ": " fmt
+
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+
+#include <asm/arm_dsu_pmu.h>
+#include <asm/local64.h>
+
+/* PMU event codes */
+#define DSU_PMU_EVT_CYCLES		0x11
+#define DSU_PMU_EVT_CHAIN		0x1e
+
+#define DSU_PMU_MAX_COMMON_EVENTS	0x40
+
+#define DSU_PMU_MAX_HW_CNTRS		32
+#define DSU_PMU_HW_COUNTER_MASK		(DSU_PMU_MAX_HW_CNTRS - 1)
+
+#define CLUSTERPMCR_E			BIT(0)
+#define CLUSTERPMCR_P			BIT(1)
+#define CLUSTERPMCR_C			BIT(2)
+#define CLUSTERPMCR_N_SHIFT		11
+#define CLUSTERPMCR_N_MASK		0x1f
+#define CLUSTERPMCR_IDCODE_SHIFT	16
+#define CLUSTERPMCR_IDCODE_MASK		0xff
+#define CLUSTERPMCR_IMP_SHIFT		24
+#define CLUSTERPMCR_IMP_MASK		0xff
+#define CLUSTERPMCR_RES_MASK		0x7e8
+#define CLUSTERPMCR_RES_VAL		0x40
+
+#define DSU_ACTIVE_CPU_MASK		0x0
+#define DSU_ASSOCIATED_CPU_MASK		0x1
+
+/*
+ * We use the index of the counters as they appear in the counter
+ * bit maps in the PMU registers (e.g CLUSTERPMSELR).
+ * i.e,
+ *	counter 0	- Bit 0
+ *	counter 1	- Bit 1
+ *	...
+ *	Cycle counter	- Bit 31
+ */
+#define DSU_PMU_IDX_CYCLE_COUNTER	31
+
+/* All event counters are 32bit, with a 64bit Cycle counter */
+#define DSU_PMU_COUNTER_WIDTH(idx)	\
+	(((idx) == DSU_PMU_IDX_CYCLE_COUNTER) ? 64 : 32)
+
+#define DSU_PMU_COUNTER_MASK(idx)	\
+	GENMASK_ULL((DSU_PMU_COUNTER_WIDTH((idx)) - 1), 0)
+
+#define DSU_EXT_ATTR(_name, _func, _config)		\
+	(&((struct dev_ext_attribute[]) {				\
+		{							\
+			.attr = __ATTR(_name, 0444, _func, NULL),	\
+			.var = (void *)_config				\
+		}							\
+	})[0].attr.attr)
+
+#define DSU_EVENT_ATTR(_name, _config)		\
+	DSU_EXT_ATTR(_name, dsu_pmu_sysfs_event_show, (unsigned long)_config)
+
+#define DSU_FORMAT_ATTR(_name, _config)		\
+	DSU_EXT_ATTR(_name, dsu_pmu_sysfs_format_show, (char *)_config)
+
+#define DSU_CPUMASK_ATTR(_name, _config)	\
+	DSU_EXT_ATTR(_name, dsu_pmu_cpumask_show, (unsigned long)_config)
+
+struct dsu_hw_events {
+	DECLARE_BITMAP(used_mask, DSU_PMU_MAX_HW_CNTRS);
+	struct perf_event	*events[DSU_PMU_MAX_HW_CNTRS];
+};
+
+/*
+ * struct dsu_pmu	- DSU PMU descriptor
+ *
+ * @pmu_lock		: Protects accesses to DSU PMU register from normal vs
+ *			  interrupt handler contexts.
+ * @hw_events		: Holds the event counter state.
+ * @associated_cpus	: CPUs attached to the DSU.
+ * @active_cpu		: CPU to which the PMU is bound for accesses.
+ * @cpuhp_node		: Node for CPU hotplug notifier link.
+ * @num_counters	: Number of event counters implemented by the PMU,
+ *			  excluding the cycle counter.
+ * @irq			: Interrupt line for counter overflow.
+ * @cpmceid_bitmap	: Bitmap for the availability of architected common
+ *			  events (event_code < 0x40).
+ */
+struct dsu_pmu {
+	struct pmu			pmu;
+	struct device			*dev;
+	raw_spinlock_t			pmu_lock;
+	struct dsu_hw_events		hw_events;
+	cpumask_t			associated_cpus;
+	cpumask_t			active_cpu;
+	struct hlist_node		cpuhp_node;
+	u8				num_counters;
+	int				irq;
+	DECLARE_BITMAP(cpmceid_bitmap, DSU_PMU_MAX_COMMON_EVENTS);
+};
+
+static unsigned long dsu_pmu_cpuhp_state;
+
+static inline struct dsu_pmu *to_dsu_pmu(struct pmu *pmu)
+{
+	return container_of(pmu, struct dsu_pmu, pmu);
+}
+
+static ssize_t dsu_pmu_sysfs_event_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct dev_ext_attribute *eattr = container_of(attr,
+					struct dev_ext_attribute, attr);
+	return snprintf(buf, PAGE_SIZE, "event=0x%lx\n",
+					 (unsigned long)eattr->var);
+}
+
+static ssize_t dsu_pmu_sysfs_format_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct dev_ext_attribute *eattr = container_of(attr,
+					struct dev_ext_attribute, attr);
+	return snprintf(buf, PAGE_SIZE, "%s\n", (char *)eattr->var);
+}
+
+static ssize_t dsu_pmu_cpumask_show(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct dsu_pmu *dsu_pmu = to_dsu_pmu(pmu);
+	struct dev_ext_attribute *eattr = container_of(attr,
+					struct dev_ext_attribute, attr);
+	unsigned long mask_id = (unsigned long)eattr->var;
+	const cpumask_t *cpumask;
+
+	switch (mask_id) {
+	case DSU_ACTIVE_CPU_MASK:
+		cpumask = &dsu_pmu->active_cpu;
+		break;
+	case DSU_ASSOCIATED_CPU_MASK:
+		cpumask = &dsu_pmu->associated_cpus;
+		break;
+	default:
+		return 0;
+	}
+	return cpumap_print_to_pagebuf(true, buf, cpumask);
+}
+
+static struct attribute *dsu_pmu_format_attrs[] = {
+	DSU_FORMAT_ATTR(event, "config:0-31"),
+	NULL,
+};
+
+static const struct attribute_group dsu_pmu_format_attr_group = {
+	.name = "format",
+	.attrs = dsu_pmu_format_attrs,
+};
+
+static struct attribute *dsu_pmu_event_attrs[] = {
+	DSU_EVENT_ATTR(cycles, 0x11),
+	DSU_EVENT_ATTR(bus_access, 0x19),
+	DSU_EVENT_ATTR(memory_error, 0x1a),
+	DSU_EVENT_ATTR(bus_cycles, 0x1d),
+	DSU_EVENT_ATTR(l3d_cache_allocate, 0x29),
+	DSU_EVENT_ATTR(l3d_cache_refill, 0x2a),
+	DSU_EVENT_ATTR(l3d_cache, 0x2b),
+	DSU_EVENT_ATTR(l3d_cache_wb, 0x2c),
+	NULL,
+};
+
+static umode_t
+dsu_pmu_event_attr_is_visible(struct kobject *kobj, struct attribute *attr,
+				int unused)
+{
+	struct pmu *pmu = dev_get_drvdata(kobj_to_dev(kobj));
+	struct dsu_pmu *dsu_pmu = to_dsu_pmu(pmu);
+	struct dev_ext_attribute *eattr = container_of(attr,
+					struct dev_ext_attribute, attr.attr);
+	unsigned long evt = (unsigned long)eattr->var;
+
+	return test_bit(evt, dsu_pmu->cpmceid_bitmap) ? attr->mode : 0;
+}
+
+static const struct attribute_group dsu_pmu_events_attr_group = {
+	.name = "events",
+	.attrs = dsu_pmu_event_attrs,
+	.is_visible = dsu_pmu_event_attr_is_visible,
+};
+
+static struct attribute *dsu_pmu_cpumask_attrs[] = {
+	DSU_CPUMASK_ATTR(cpumask, DSU_ACTIVE_CPU_MASK),
+	DSU_CPUMASK_ATTR(associated_cpus, DSU_ASSOCIATED_CPU_MASK),
+	NULL,
+};
+
+static const struct attribute_group dsu_pmu_cpumask_attr_group = {
+	.attrs = dsu_pmu_cpumask_attrs,
+};
+
+static const struct attribute_group *dsu_pmu_attr_groups[] = {
+	&dsu_pmu_cpumask_attr_group,
+	&dsu_pmu_events_attr_group,
+	&dsu_pmu_format_attr_group,
+	NULL,
+};
+
+static int dsu_pmu_get_online_cpu_any_but(struct dsu_pmu *dsu_pmu, int cpu)
+{
+	struct cpumask online_supported;
+
+	cpumask_and(&online_supported,
+			 &dsu_pmu->associated_cpus, cpu_online_mask);
+	return cpumask_any_but(&online_supported, cpu);
+}
+
+static inline bool dsu_pmu_counter_valid(struct dsu_pmu *dsu_pmu, u32 idx)
+{
+	return (idx < dsu_pmu->num_counters) ||
+	       (idx == DSU_PMU_IDX_CYCLE_COUNTER);
+}
+
+static inline u64 dsu_pmu_read_counter(struct perf_event *event)
+{
+	u64 val;
+	unsigned long flags;
+	struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu);
+	int idx = event->hw.idx;
+
+	if (WARN_ON(!cpumask_test_cpu(smp_processor_id(),
+				 &dsu_pmu->associated_cpus)))
+		return 0;
+
+	if (!dsu_pmu_counter_valid(dsu_pmu, idx)) {
+		dev_err(event->pmu->dev,
+			"Trying reading invalid counter %d\n", idx);
+		return 0;
+	}
+
+	raw_spin_lock_irqsave(&dsu_pmu->pmu_lock, flags);
+	if (idx == DSU_PMU_IDX_CYCLE_COUNTER)
+		val = __dsu_pmu_read_pmccntr();
+	else
+		val = __dsu_pmu_read_counter(idx);
+	raw_spin_unlock_irqrestore(&dsu_pmu->pmu_lock, flags);
+
+	return val;
+}
+
+static void dsu_pmu_write_counter(struct perf_event *event, u64 val)
+{
+	unsigned long flags;
+	struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu);
+	int idx = event->hw.idx;
+
+	if (WARN_ON(!cpumask_test_cpu(smp_processor_id(),
+			 &dsu_pmu->associated_cpus)))
+		return;
+
+	if (!dsu_pmu_counter_valid(dsu_pmu, idx)) {
+		dev_err(event->pmu->dev,
+			"writing to invalid counter %d\n", idx);
+		return;
+	}
+
+	raw_spin_lock_irqsave(&dsu_pmu->pmu_lock, flags);
+	if (idx == DSU_PMU_IDX_CYCLE_COUNTER)
+		__dsu_pmu_write_pmccntr(val);
+	else
+		__dsu_pmu_write_counter(idx, val);
+	raw_spin_unlock_irqrestore(&dsu_pmu->pmu_lock, flags);
+}
+
+static int dsu_pmu_get_event_idx(struct dsu_hw_events *hw_events,
+				 struct perf_event *event)
+{
+	int idx;
+	unsigned long evtype = event->attr.config;
+	struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu);
+	unsigned long *used_mask = hw_events->used_mask;
+
+	if (evtype == DSU_PMU_EVT_CYCLES) {
+		if (test_and_set_bit(DSU_PMU_IDX_CYCLE_COUNTER, used_mask))
+			return -EAGAIN;
+		return DSU_PMU_IDX_CYCLE_COUNTER;
+	}
+
+	idx = find_first_zero_bit(used_mask, dsu_pmu->num_counters);
+	if (idx >= dsu_pmu->num_counters)
+		return -EAGAIN;
+	set_bit(idx, hw_events->used_mask);
+	return idx;
+}
+
+static void dsu_pmu_enable_counter(struct dsu_pmu *dsu_pmu, int idx)
+{
+	__dsu_pmu_counter_interrupt_enable(idx);
+	__dsu_pmu_enable_counter(idx);
+}
+
+static void dsu_pmu_disable_counter(struct dsu_pmu *dsu_pmu, int idx)
+{
+	__dsu_pmu_disable_counter(idx);
+	__dsu_pmu_counter_interrupt_disable(idx);
+}
+
+static inline void dsu_pmu_set_event(struct dsu_pmu *dsu_pmu,
+					struct perf_event *event)
+{
+	int idx = event->hw.idx;
+	unsigned long flags;
+
+	if (!dsu_pmu_counter_valid(dsu_pmu, idx)) {
+		dev_err(event->pmu->dev,
+			"Trying to set invalid counter %d\n", idx);
+		return;
+	}
+
+	raw_spin_lock_irqsave(&dsu_pmu->pmu_lock, flags);
+	__dsu_pmu_set_event(idx, event->hw.config_base);
+	raw_spin_unlock_irqrestore(&dsu_pmu->pmu_lock, flags);
+}
+
+static void dsu_pmu_event_update(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 delta, prev_count, new_count;
+
+	do {
+		/* We may also be called from the irq handler */
+		prev_count = local64_read(&hwc->prev_count);
+		new_count = dsu_pmu_read_counter(event);
+	} while (local64_cmpxchg(&hwc->prev_count, prev_count, new_count) !=
+			prev_count);
+	delta = (new_count - prev_count) & DSU_PMU_COUNTER_MASK(hwc->idx);
+	local64_add(delta, &event->count);
+}
+
+static void dsu_pmu_read(struct perf_event *event)
+{
+	dsu_pmu_event_update(event);
+}
+
+static inline u32 dsu_pmu_get_reset_overflow(void)
+{
+	return __dsu_pmu_get_reset_overflow();
+}
+
+/**
+ * dsu_pmu_set_event_period: Set the period for the counter.
+ *
+ * All DSU PMU event counters, except the cycle counter are 32bit
+ * counters. To handle cases of extreme interrupt latency, we program
+ * the counter with half of the max count for the counters.
+ */
+static void dsu_pmu_set_event_period(struct perf_event *event)
+{
+	int idx = event->hw.idx;
+	u64 val = DSU_PMU_COUNTER_MASK(idx) >> 1;
+
+	local64_set(&event->hw.prev_count, val);
+	dsu_pmu_write_counter(event, val);
+}
+
+static irqreturn_t dsu_pmu_handle_irq(int irq_num, void *dev)
+{
+	int i;
+	bool handled = false;
+	struct dsu_pmu *dsu_pmu = dev;
+	struct dsu_hw_events *hw_events = &dsu_pmu->hw_events;
+	unsigned long overflow;
+
+	overflow = dsu_pmu_get_reset_overflow();
+	if (!overflow)
+		return IRQ_NONE;
+
+	for_each_set_bit(i, &overflow, DSU_PMU_MAX_HW_CNTRS) {
+		struct perf_event *event = hw_events->events[i];
+
+		if (!event)
+			continue;
+		dsu_pmu_event_update(event);
+		dsu_pmu_set_event_period(event);
+		handled = true;
+	}
+
+	return IRQ_RETVAL(handled);
+}
+
+static void dsu_pmu_start(struct perf_event *event, int pmu_flags)
+{
+	struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu);
+
+	/* We always reprogram the counter */
+	if (pmu_flags & PERF_EF_RELOAD)
+		WARN_ON(!(event->hw.state & PERF_HES_UPTODATE));
+	dsu_pmu_set_event_period(event);
+	if (event->hw.idx != DSU_PMU_IDX_CYCLE_COUNTER)
+		dsu_pmu_set_event(dsu_pmu, event);
+	event->hw.state = 0;
+	dsu_pmu_enable_counter(dsu_pmu, event->hw.idx);
+}
+
+static void dsu_pmu_stop(struct perf_event *event, int pmu_flags)
+{
+	struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu);
+
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+	dsu_pmu_disable_counter(dsu_pmu, event->hw.idx);
+	dsu_pmu_event_update(event);
+	event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+}
+
+static int dsu_pmu_add(struct perf_event *event, int flags)
+{
+	struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu);
+	struct dsu_hw_events *hw_events = &dsu_pmu->hw_events;
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+
+	if (WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(),
+					   &dsu_pmu->associated_cpus)))
+		return -ENOENT;
+
+	idx = dsu_pmu_get_event_idx(hw_events, event);
+	if (idx < 0)
+		return idx;
+
+	hwc->idx = idx;
+	hw_events->events[idx] = event;
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+	if (flags & PERF_EF_START)
+		dsu_pmu_start(event, PERF_EF_RELOAD);
+
+	perf_event_update_userpage(event);
+	return 0;
+}
+
+static void dsu_pmu_del(struct perf_event *event, int flags)
+{
+	struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu);
+	struct dsu_hw_events *hw_events = &dsu_pmu->hw_events;
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	dsu_pmu_stop(event, PERF_EF_UPDATE);
+	hw_events->events[idx] = NULL;
+	clear_bit(idx, hw_events->used_mask);
+	perf_event_update_userpage(event);
+}
+
+static void dsu_pmu_enable(struct pmu *pmu)
+{
+	u32 pmcr;
+	unsigned long flags;
+	struct dsu_pmu *dsu_pmu = to_dsu_pmu(pmu);
+
+	/* If no counters are added, skip enabling the PMU */
+	if (bitmap_empty(dsu_pmu->hw_events.used_mask, DSU_PMU_MAX_HW_CNTRS))
+		return;
+
+	raw_spin_lock_irqsave(&dsu_pmu->pmu_lock, flags);
+	pmcr = __dsu_pmu_read_pmcr();
+	pmcr |= CLUSTERPMCR_E;
+	__dsu_pmu_write_pmcr(pmcr);
+	raw_spin_unlock_irqrestore(&dsu_pmu->pmu_lock, flags);
+}
+
+static void dsu_pmu_disable(struct pmu *pmu)
+{
+	u32 pmcr;
+	unsigned long flags;
+	struct dsu_pmu *dsu_pmu = to_dsu_pmu(pmu);
+
+	raw_spin_lock_irqsave(&dsu_pmu->pmu_lock, flags);
+	pmcr = __dsu_pmu_read_pmcr();
+	pmcr &= ~CLUSTERPMCR_E;
+	__dsu_pmu_write_pmcr(pmcr);
+	raw_spin_unlock_irqrestore(&dsu_pmu->pmu_lock, flags);
+}
+
+static bool dsu_pmu_validate_event(struct pmu *pmu,
+				  struct dsu_hw_events *hw_events,
+				  struct perf_event *event)
+{
+	if (is_software_event(event))
+		return true;
+	/* Reject groups spanning multiple HW PMUs. */
+	if (event->pmu != pmu)
+		return false;
+	return dsu_pmu_get_event_idx(hw_events, event) >= 0;
+}
+
+/*
+ * Make sure the group of events can be scheduled at once
+ * on the PMU.
+ */
+static bool dsu_pmu_validate_group(struct perf_event *event)
+{
+	struct perf_event *sibling, *leader = event->group_leader;
+	struct dsu_hw_events fake_hw;
+
+	if (event->group_leader == event)
+		return true;
+
+	memset(fake_hw.used_mask, 0, sizeof(fake_hw.used_mask));
+	if (!dsu_pmu_validate_event(event->pmu, &fake_hw, leader))
+		return false;
+	list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+		if (!dsu_pmu_validate_event(event->pmu, &fake_hw, sibling))
+			return false;
+	}
+	return dsu_pmu_validate_event(event->pmu, &fake_hw, event);
+}
+
+static int dsu_pmu_event_init(struct perf_event *event)
+{
+	struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu);
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	/* We don't support sampling */
+	if (is_sampling_event(event)) {
+		dev_dbg(dsu_pmu->pmu.dev, "Can't support sampling events\n");
+		return -EOPNOTSUPP;
+	}
+
+	/* We cannot support task bound events */
+	if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK) {
+		dev_dbg(dsu_pmu->pmu.dev, "Can't support per-task counters\n");
+		return -EINVAL;
+	}
+
+	if (has_branch_stack(event) ||
+	    event->attr.exclude_user ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv ||
+	    event->attr.exclude_idle ||
+	    event->attr.exclude_host ||
+	    event->attr.exclude_guest) {
+		dev_dbg(dsu_pmu->pmu.dev, "Can't support filtering\n");
+		return -EINVAL;
+	}
+
+	if (!cpumask_test_cpu(event->cpu, &dsu_pmu->associated_cpus)) {
+		dev_dbg(dsu_pmu->pmu.dev,
+			 "Requested cpu is not associated with the DSU\n");
+		return -EINVAL;
+	}
+	/*
+	 * Choose the current active CPU to read the events. We don't want
+	 * to migrate the event contexts, irq handling etc to the requested
+	 * CPU. As long as the requested CPU is within the same DSU, we
+	 * are fine.
+	 */
+	event->cpu = cpumask_first(&dsu_pmu->active_cpu);
+	if (event->cpu >= nr_cpu_ids)
+		return -EINVAL;
+	if (!dsu_pmu_validate_group(event))
+		return -EINVAL;
+
+	event->hw.config_base = event->attr.config;
+	return 0;
+}
+
+static struct dsu_pmu *dsu_pmu_alloc(struct platform_device *pdev)
+{
+	struct dsu_pmu *dsu_pmu;
+
+	dsu_pmu = devm_kzalloc(&pdev->dev, sizeof(*dsu_pmu), GFP_KERNEL);
+	if (!dsu_pmu)
+		return ERR_PTR(-ENOMEM);
+
+	raw_spin_lock_init(&dsu_pmu->pmu_lock);
+	/*
+	 * Initialise the number of counters to -1, until we probe
+	 * the real number on a connected CPU.
+	 */
+	dsu_pmu->num_counters = -1;
+	return dsu_pmu;
+}
+
+/**
+ * dsu_pmu_dt_get_cpus: Get the list of CPUs in the cluster.
+ */
+static int dsu_pmu_dt_get_cpus(struct device_node *dev, cpumask_t *mask)
+{
+	int i = 0, n, cpu;
+	struct device_node *cpu_node;
+
+	n = of_count_phandle_with_args(dev, "cpus", NULL);
+	if (n <= 0)
+		return -ENODEV;
+	for (; i < n; i++) {
+		cpu_node = of_parse_phandle(dev, "cpus", i);
+		if (!cpu_node)
+			break;
+		cpu = of_cpu_node_to_id(cpu_node);
+		of_node_put(cpu_node);
+		/*
+		 * We have to ignore the failures here and continue scanning
+		 * the list to handle cases where the nr_cpus could be capped
+		 * in the running kernel.
+		 */
+		if (cpu < 0)
+			continue;
+		cpumask_set_cpu(cpu, mask);
+	}
+	return 0;
+}
+
+/*
+ * dsu_pmu_probe_pmu: Probe the PMU details on a CPU in the cluster.
+ */
+static void dsu_pmu_probe_pmu(struct dsu_pmu *dsu_pmu)
+{
+	u64 num_counters;
+	u32 cpmceid[2];
+
+	num_counters = (__dsu_pmu_read_pmcr() >> CLUSTERPMCR_N_SHIFT) &
+						CLUSTERPMCR_N_MASK;
+	/* We can only support up to 31 independent counters */
+	if (WARN_ON(num_counters > 31))
+		num_counters = 31;
+	dsu_pmu->num_counters = num_counters;
+	if (!dsu_pmu->num_counters)
+		return;
+	cpmceid[0] = __dsu_pmu_read_pmceid(0);
+	cpmceid[1] = __dsu_pmu_read_pmceid(1);
+	bitmap_from_u32array(dsu_pmu->cpmceid_bitmap,
+				DSU_PMU_MAX_COMMON_EVENTS,
+				cpmceid,
+				ARRAY_SIZE(cpmceid));
+}
+
+static void dsu_pmu_set_active_cpu(int cpu, struct dsu_pmu *dsu_pmu)
+{
+	cpumask_set_cpu(cpu, &dsu_pmu->active_cpu);
+	if (irq_set_affinity_hint(dsu_pmu->irq, &dsu_pmu->active_cpu))
+		pr_warn("Failed to set irq affinity to %d\n", cpu);
+}
+
+/*
+ * dsu_pmu_init_pmu: Initialise the DSU PMU configurations if
+ * we haven't done it already.
+ */
+static void dsu_pmu_init_pmu(struct dsu_pmu *dsu_pmu)
+{
+	if (dsu_pmu->num_counters == -1)
+		dsu_pmu_probe_pmu(dsu_pmu);
+	/* Reset the interrupt overflow mask */
+	dsu_pmu_get_reset_overflow();
+}
+
+static int dsu_pmu_device_probe(struct platform_device *pdev)
+{
+	int irq, rc;
+	struct dsu_pmu *dsu_pmu;
+	char *name;
+	static atomic_t pmu_idx = ATOMIC_INIT(-1);
+
+	dsu_pmu = dsu_pmu_alloc(pdev);
+	if (IS_ERR(dsu_pmu))
+		return PTR_ERR(dsu_pmu);
+
+	rc = dsu_pmu_dt_get_cpus(pdev->dev.of_node, &dsu_pmu->associated_cpus);
+	if (rc) {
+		dev_warn(&pdev->dev, "Failed to parse the CPUs\n");
+		return rc;
+	}
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
+		dev_warn(&pdev->dev, "Failed to find IRQ\n");
+		return -EINVAL;
+	}
+
+	name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "%s_%d",
+				PMUNAME, atomic_inc_return(&pmu_idx));
+	if (!name)
+		return -ENOMEM;
+	rc = devm_request_irq(&pdev->dev, irq, dsu_pmu_handle_irq,
+			      IRQF_NOBALANCING, name, dsu_pmu);
+	if (rc) {
+		dev_warn(&pdev->dev, "Failed to request IRQ %d\n", irq);
+		return rc;
+	}
+
+	dsu_pmu->irq = irq;
+	platform_set_drvdata(pdev, dsu_pmu);
+	rc = cpuhp_state_add_instance(dsu_pmu_cpuhp_state,
+						&dsu_pmu->cpuhp_node);
+	if (rc)
+		return rc;
+
+	dsu_pmu->pmu = (struct pmu) {
+		.task_ctx_nr	= perf_invalid_context,
+		.module		= THIS_MODULE,
+		.pmu_enable	= dsu_pmu_enable,
+		.pmu_disable	= dsu_pmu_disable,
+		.event_init	= dsu_pmu_event_init,
+		.add		= dsu_pmu_add,
+		.del		= dsu_pmu_del,
+		.start		= dsu_pmu_start,
+		.stop		= dsu_pmu_stop,
+		.read		= dsu_pmu_read,
+
+		.attr_groups	= dsu_pmu_attr_groups,
+	};
+
+	rc = perf_pmu_register(&dsu_pmu->pmu, name, -1);
+	if (rc) {
+		cpuhp_state_remove_instance(dsu_pmu_cpuhp_state,
+						 &dsu_pmu->cpuhp_node);
+		irq_set_affinity_hint(dsu_pmu->irq, NULL);
+	}
+
+	return rc;
+}
+
+static int dsu_pmu_device_remove(struct platform_device *pdev)
+{
+	struct dsu_pmu *dsu_pmu = platform_get_drvdata(pdev);
+
+	perf_pmu_unregister(&dsu_pmu->pmu);
+	cpuhp_state_remove_instance(dsu_pmu_cpuhp_state, &dsu_pmu->cpuhp_node);
+	irq_set_affinity_hint(dsu_pmu->irq, NULL);
+
+	return 0;
+}
+
+static const struct of_device_id dsu_pmu_of_match[] = {
+	{ .compatible = "arm,dsu-pmu", },
+	{},
+};
+
+static struct platform_driver dsu_pmu_driver = {
+	.driver = {
+		.name	= DRVNAME,
+		.of_match_table = of_match_ptr(dsu_pmu_of_match),
+	},
+	.probe = dsu_pmu_device_probe,
+	.remove = dsu_pmu_device_remove,
+};
+
+static int dsu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct dsu_pmu *dsu_pmu = hlist_entry_safe(node, struct dsu_pmu,
+						   cpuhp_node);
+
+	if (!cpumask_test_cpu(cpu, &dsu_pmu->associated_cpus))
+		return 0;
+
+	/* If the PMU is already managed, there is nothing to do */
+	if (!cpumask_empty(&dsu_pmu->active_cpu))
+		return 0;
+
+	dsu_pmu_init_pmu(dsu_pmu);
+	dsu_pmu_set_active_cpu(cpu, dsu_pmu);
+
+	return 0;
+}
+
+static int dsu_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node)
+{
+	int dst;
+	struct dsu_pmu *dsu_pmu = hlist_entry_safe(node, struct dsu_pmu,
+						   cpuhp_node);
+
+	if (!cpumask_test_and_clear_cpu(cpu, &dsu_pmu->active_cpu))
+		return 0;
+
+	dst = dsu_pmu_get_online_cpu_any_but(dsu_pmu, cpu);
+	/* If there are no active CPUs in the DSU, leave IRQ disabled */
+	if (dst >= nr_cpu_ids) {
+		irq_set_affinity_hint(dsu_pmu->irq, NULL);
+		return 0;
+	}
+
+	perf_pmu_migrate_context(&dsu_pmu->pmu, cpu, dst);
+	dsu_pmu_set_active_cpu(dst, dsu_pmu);
+
+	return 0;
+}
+
+static int __init dsu_pmu_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+					DRVNAME,
+					dsu_pmu_cpu_online,
+					dsu_pmu_cpu_teardown);
+	if (ret < 0)
+		return ret;
+	dsu_pmu_cpuhp_state = ret;
+	return platform_driver_register(&dsu_pmu_driver);
+}
+
+static void __exit dsu_pmu_exit(void)
+{
+	platform_driver_unregister(&dsu_pmu_driver);
+	cpuhp_remove_multi_state(dsu_pmu_cpuhp_state);
+}
+
+module_init(dsu_pmu_init);
+module_exit(dsu_pmu_exit);
+
+MODULE_DEVICE_TABLE(of, dsu_pmu_of_match);
+MODULE_DESCRIPTION("Perf driver for ARM DynamIQ Shared Unit");
+MODULE_AUTHOR("Suzuki K Poulose <suzuki.poulose@arm.com>");
+MODULE_LICENSE("GPL v2");

From a8ffaaa060b8d4da6138e0958cb0f45b73e1cb78 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Wed, 27 Dec 2017 15:12:56 +0000
Subject: [PATCH 040/104] arm64: asid: Do not replace active_asids if already 0

Under some uncommon timing conditions, a generation check and
xchg(active_asids, A1) in check_and_switch_context() on P1 can race with
an ASID roll-over on P2. If P2 has not seen the update to
active_asids[P1], it can re-allocate A1 to a new task T2 on P2. P1 ends
up waiting on the spinlock since the xchg() returned 0 while P2 can go
through a second ASID roll-over with (T2,A1,G2) active on P2. This
roll-over copies active_asids[P1] == A1,G1 into reserved_asids[P1] and
active_asids[P2] == A1,G2 into reserved_asids[P2]. A subsequent
scheduling of T1 on P1 and T2 on P2 would match reserved_asids and get
their generation bumped to G3:

P1					P2
--                                      --
TTBR0.BADDR = T0
TTBR0.ASID = A0
asid_generation = G1
check_and_switch_context(T1,A1,G1)
  generation match
					check_and_switch_context(T2,A0,G0)
 				          new_context()
					    ASID roll-over
					    asid_generation = G2
					    flush_context()
					      active_asids[P1] = 0
					      asid_map[A1] = 0
					      reserved_asids[P1] = A0,G0
  xchg(active_asids, A1)
    active_asids[P1] = A1,G1
    xchg returns 0
  spin_lock_irqsave()
					    allocated ASID (T2,A1,G2)
					    asid_map[A1] = 1
					  active_asids[P2] = A1,G2
					...
					check_and_switch_context(T3,A0,G0)
					  new_context()
					    ASID roll-over
					    asid_generation = G3
					    flush_context()
					      active_asids[P1] = 0
					      asid_map[A1] = 1
					      reserved_asids[P1] = A1,G1
					      reserved_asids[P2] = A1,G2
					    allocated ASID (T3,A2,G3)
					    asid_map[A2] = 1
					  active_asids[P2] = A2,G3
  new_context()
    check_update_reserved_asid(A1,G1)
      matches reserved_asid[P1]
      reserved_asid[P1] = A1,G3
  updated T1 ASID to (T1,A1,G3)
					check_and_switch_context(T2,A1,G2)
					  new_context()
					    check_and_switch_context(A1,G2)
					      matches reserved_asids[P2]
					      reserved_asids[P2] = A1,G3
					  updated T2 ASID to (T2,A1,G3)

At this point, we have two tasks, T1 and T2 both using ASID A1 with the
latest generation G3. Any of them is allowed to be scheduled on the
other CPU leading to two different tasks with the same ASID on the same
CPU.

This patch changes the xchg to cmpxchg so that the active_asids is only
updated if non-zero to avoid a race with an ASID roll-over on a
different CPU.

The ASID allocation algorithm has been formally verified using the TLA+
model checker (see
https://git.kernel.org/pub/scm/linux/kernel/git/cmarinas/kernel-tla.git/tree/asidalloc.tla
for the spec).

Reviewed-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/mm/context.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 1cb3bc92ae5c..1fe71b9fcf35 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -194,26 +194,29 @@ set_asid:
 void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 {
 	unsigned long flags;
-	u64 asid;
+	u64 asid, old_active_asid;
 
 	asid = atomic64_read(&mm->context.id);
 
 	/*
 	 * The memory ordering here is subtle.
-	 * If our ASID matches the current generation, then we update
-	 * our active_asids entry with a relaxed xchg. Racing with a
-	 * concurrent rollover means that either:
+	 * If our active_asids is non-zero and the ASID matches the current
+	 * generation, then we update the active_asids entry with a relaxed
+	 * cmpxchg. Racing with a concurrent rollover means that either:
 	 *
-	 * - We get a zero back from the xchg and end up waiting on the
+	 * - We get a zero back from the cmpxchg and end up waiting on the
 	 *   lock. Taking the lock synchronises with the rollover and so
 	 *   we are forced to see the updated generation.
 	 *
-	 * - We get a valid ASID back from the xchg, which means the
+	 * - We get a valid ASID back from the cmpxchg, which means the
 	 *   relaxed xchg in flush_context will treat us as reserved
 	 *   because atomic RmWs are totally ordered for a given location.
 	 */
-	if (!((asid ^ atomic64_read(&asid_generation)) >> asid_bits)
-	    && atomic64_xchg_relaxed(&per_cpu(active_asids, cpu), asid))
+	old_active_asid = atomic64_read(&per_cpu(active_asids, cpu));
+	if (old_active_asid &&
+	    !((asid ^ atomic64_read(&asid_generation)) >> asid_bits) &&
+	    atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu),
+				     old_active_asid, asid))
 		goto switch_mm_fastpath;
 
 	raw_spin_lock_irqsave(&cpu_asid_lock, flags);

From 3b3b681097fae73b7f5dcdd42db6cfdf32943d4c Mon Sep 17 00:00:00 2001
From: Dongjiu Geng <gengdongjiu@huawei.com>
Date: Wed, 13 Dec 2017 18:13:56 +0800
Subject: [PATCH 041/104] arm64: v8.4: Support for new floating point
 multiplication instructions

ARM v8.4 extensions add new neon instructions for performing a
multiplication of each FP16 element of one vector with the corresponding
FP16 element of a second vector, and to add or subtract this without an
intermediate rounding to the corresponding FP32 element in a third vector.

This patch detects this feature and let the userspace know about it via a
HWCAP bit and MRS emulation.

Cc: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Dongjiu Geng <gengdongjiu@huawei.com>
Reviewed-by: Dave Martin <Dave.Martin@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/arm64/cpu-feature-registers.txt | 4 +++-
 Documentation/arm64/elf_hwcaps.txt            | 4 ++++
 arch/arm64/include/asm/sysreg.h               | 1 +
 arch/arm64/include/uapi/asm/hwcap.h           | 1 +
 arch/arm64/kernel/cpufeature.c                | 2 ++
 arch/arm64/kernel/cpuinfo.c                   | 1 +
 6 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/Documentation/arm64/cpu-feature-registers.txt b/Documentation/arm64/cpu-feature-registers.txt
index bd9b3faab2c4..a70090b28b07 100644
--- a/Documentation/arm64/cpu-feature-registers.txt
+++ b/Documentation/arm64/cpu-feature-registers.txt
@@ -110,7 +110,9 @@ infrastructure:
      x--------------------------------------------------x
      | Name                         |  bits   | visible |
      |--------------------------------------------------|
-     | RES0                         | [63-48] |    n    |
+     | RES0                         | [63-52] |    n    |
+     |--------------------------------------------------|
+     | FHM                          | [51-48] |    y    |
      |--------------------------------------------------|
      | DP                           | [47-44] |    y    |
      |--------------------------------------------------|
diff --git a/Documentation/arm64/elf_hwcaps.txt b/Documentation/arm64/elf_hwcaps.txt
index 89edba12a9e0..57324ee55ecc 100644
--- a/Documentation/arm64/elf_hwcaps.txt
+++ b/Documentation/arm64/elf_hwcaps.txt
@@ -158,3 +158,7 @@ HWCAP_SHA512
 HWCAP_SVE
 
     Functionality implied by ID_AA64PFR0_EL1.SVE == 0b0001.
+
+HWCAP_ASIMDFHM
+
+   Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0001.
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index ec144f480b39..ab637886a6f9 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -419,6 +419,7 @@
 #define SCTLR_EL1_CP15BEN	(1 << 5)
 
 /* id_aa64isar0 */
+#define ID_AA64ISAR0_FHM_SHIFT		48
 #define ID_AA64ISAR0_DP_SHIFT		44
 #define ID_AA64ISAR0_SM4_SHIFT		40
 #define ID_AA64ISAR0_SM3_SHIFT		36
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index cda76fa8b9b2..f018c3deea3b 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -43,5 +43,6 @@
 #define HWCAP_ASIMDDP		(1 << 20)
 #define HWCAP_SHA512		(1 << 21)
 #define HWCAP_SVE		(1 << 22)
+#define HWCAP_ASIMDFHM		(1 << 23)
 
 #endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9f0545dfe497..1f024049eb30 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -123,6 +123,7 @@ cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused)
  * sync with the documentation of the CPU feature register ABI.
  */
 static const struct arm64_ftr_bits ftr_id_aa64isar0[] = {
+	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_FHM_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_DP_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SM4_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SM3_SHIFT, 4, 0),
@@ -1032,6 +1033,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
 	HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM3),
 	HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM4),
 	HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDDP),
+	HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDFHM),
 	HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_FP),
 	HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP),
 	HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD),
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 1e2554543506..7f94623df8a5 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -76,6 +76,7 @@ static const char *const hwcap_str[] = {
 	"asimddp",
 	"sha512",
 	"sve",
+	"asimdfhm",
 	NULL
 };
 

From be04a6d1126b02c6a28741155b899d648739fc5b Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 14 Nov 2017 16:15:59 +0000
Subject: [PATCH 042/104] arm64: use RET instruction for exiting the trampoline

Speculation attacks against the entry trampoline can potentially resteer
the speculative instruction stream through the indirect branch and into
arbitrary gadgets within the kernel.

This patch defends against these attacks by forcing a misprediction
through the return stack: a dummy BL instruction loads an entry into
the stack, so that the predicted program flow of the subsequent RET
instruction is to a branch-to-self instruction which is finally resolved
as a branch to the kernel vectors with speculation suppressed.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/entry.S | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 031392ee5f47..6ceed4877daf 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -1029,6 +1029,14 @@ alternative_else_nop_endif
 	.if	\regsize == 64
 	msr	tpidrro_el0, x30	// Restored in kernel_ventry
 	.endif
+	/*
+	 * Defend against branch aliasing attacks by pushing a dummy
+	 * entry onto the return stack and using a RET instruction to
+	 * enter the full-fat kernel vectors.
+	 */
+	bl	2f
+	b	.
+2:
 	tramp_map_kernel	x30
 #ifdef CONFIG_RANDOMIZE_BASE
 	adr	x30, tramp_vectors + PAGE_SIZE
@@ -1041,7 +1049,7 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003
 	msr	vbar_el1, x30
 	add	x30, x30, #(1b - tramp_vectors)
 	isb
-	br	x30
+	ret
 	.endm
 
 	.macro tramp_exit, regsize = 64

From 0617052ddde355ee663b2f048e67dd381e5ebd6a Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 14 Nov 2017 16:19:39 +0000
Subject: [PATCH 043/104] arm64: Kconfig: Reword UNMAP_KERNEL_AT_EL0 kconfig
 entry

Although CONFIG_UNMAP_KERNEL_AT_EL0 does make KASLR more robust, it's
actually more useful as a mitigation against speculation attacks that
can leak arbitrary kernel data to userspace through speculation.

Reword the Kconfig help message to reflect this, and make the option
depend on EXPERT so that it is on by default for the majority of users.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/Kconfig | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9b1330806f39..cb7a70e686cb 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -863,15 +863,14 @@ config FORCE_MAX_ZONEORDER
 	  4M allocations matching the default size used by generic code.
 
 config UNMAP_KERNEL_AT_EL0
-	bool "Unmap kernel when running in userspace (aka \"KAISER\")"
+	bool "Unmap kernel when running in userspace (aka \"KAISER\")" if EXPERT
 	default y
 	help
-	  Some attacks against KASLR make use of the timing difference between
-	  a permission fault which could arise from a page table entry that is
-	  present in the TLB, and a translation fault which always requires a
-	  page table walk. This option defends against these attacks by unmapping
-	  the kernel whilst running in userspace, therefore forcing translation
-	  faults for all of kernel space.
+	  Speculation attacks against some high-performance processors can
+	  be used to bypass MMU permission checks and leak kernel data to
+	  userspace. This can be defended against by unmapping the kernel
+	  when running in userspace, mapping it back in on exception entry
+	  via a trampoline page in the vector table.
 
 	  If unsure, say Y.
 

From 179a56f6f9fbda28f6ca07db1fc3dfad6bc7343c Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Mon, 27 Nov 2017 18:29:30 +0000
Subject: [PATCH 044/104] arm64: Take into account ID_AA64PFR0_EL1.CSV3

For non-KASLR kernels where the KPTI behaviour has not been overridden
on the command line we can use ID_AA64PFR0_EL1.CSV3 to determine whether
or not we should unmap the kernel whilst running at EL0.

Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/sysreg.h | 1 +
 arch/arm64/kernel/cpufeature.c  | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index ab637886a6f9..262ae18f0e05 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -438,6 +438,7 @@
 #define ID_AA64ISAR1_DPB_SHIFT		0
 
 /* id_aa64pfr0 */
+#define ID_AA64PFR0_CSV3_SHIFT		60
 #define ID_AA64PFR0_SVE_SHIFT		32
 #define ID_AA64PFR0_GIC_SHIFT		24
 #define ID_AA64PFR0_ASIMD_SHIFT		20
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 1f024049eb30..0f3c96172f0f 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -146,6 +146,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {
 };
 
 static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0),
 	S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI),
@@ -852,6 +853,8 @@ static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */
 static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry,
 				int __unused)
 {
+	u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+
 	/* Forced on command line? */
 	if (__kpti_forced) {
 		pr_info_once("kernel page table isolation forced %s by command line option\n",
@@ -863,7 +866,9 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry,
 	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
 		return true;
 
-	return false;
+	/* Defer to CPU feature registers */
+	return !cpuid_feature_extract_unsigned_field(pfr0,
+						     ID_AA64PFR0_CSV3_SHIFT);
 }
 
 static int __init parse_kpti(char *str)
@@ -968,6 +973,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 	},
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 	{
+		.desc = "Kernel page table isolation (KPTI)",
 		.capability = ARM64_UNMAP_KERNEL_AT_EL0,
 		.def_scope = SCOPE_SYSTEM,
 		.matches = unmap_kernel_at_el0,

From 0a0d111d40fd1dc588cc590fab6b55d86ddc71d3 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 2 Jan 2018 21:37:25 +0000
Subject: [PATCH 045/104] arm64: cpufeature: Pass capability structure to
 ->enable callback

In order to invoke the CPU capability ->matches callback from the ->enable
callback for applying local-CPU workarounds, we need a handle on the
capability structure.

This patch passes a pointer to the capability structure to the ->enable
callback.

Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/cpufeature.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 0f3c96172f0f..991922b45d3d 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1154,7 +1154,7 @@ void __init enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps)
 			 * uses an IPI, giving us a PSTATE that disappears when
 			 * we return.
 			 */
-			stop_machine(caps->enable, NULL, cpu_online_mask);
+			stop_machine(caps->enable, (void *)caps, cpu_online_mask);
 		}
 	}
 }
@@ -1197,7 +1197,7 @@ verify_local_cpu_features(const struct arm64_cpu_capabilities *caps)
 			cpu_die_early();
 		}
 		if (caps->enable)
-			caps->enable(NULL);
+			caps->enable((void *)caps);
 	}
 }
 

From d68e3ba5303f7e1099f51fdcd155f5263da8569b Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 2 Jan 2018 21:45:41 +0000
Subject: [PATCH 046/104] drivers/firmware: Expose psci_get_version through
 psci_ops structure

Entry into recent versions of ARM Trusted Firmware will invalidate the CPU
branch predictor state in order to protect against aliasing attacks.

This patch exposes the PSCI "VERSION" function via psci_ops, so that it
can be invoked outside of the PSCI driver where necessary.

Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/firmware/psci.c | 2 ++
 include/linux/psci.h    | 1 +
 2 files changed, 3 insertions(+)

diff --git a/drivers/firmware/psci.c b/drivers/firmware/psci.c
index d687ca3d5049..8b25d31e8401 100644
--- a/drivers/firmware/psci.c
+++ b/drivers/firmware/psci.c
@@ -496,6 +496,8 @@ static void __init psci_init_migrate(void)
 static void __init psci_0_2_set_functions(void)
 {
 	pr_info("Using standard PSCI v0.2 function IDs\n");
+	psci_ops.get_version = psci_get_version;
+
 	psci_function_id[PSCI_FN_CPU_SUSPEND] =
 					PSCI_FN_NATIVE(0_2, CPU_SUSPEND);
 	psci_ops.cpu_suspend = psci_cpu_suspend;
diff --git a/include/linux/psci.h b/include/linux/psci.h
index bdea1cb5e1db..6306ab10af18 100644
--- a/include/linux/psci.h
+++ b/include/linux/psci.h
@@ -26,6 +26,7 @@ int psci_cpu_init_idle(unsigned int cpu);
 int psci_cpu_suspend_enter(unsigned long index);
 
 struct psci_operations {
+	u32 (*get_version)(void);
 	int (*cpu_suspend)(u32 state, unsigned long entry_point);
 	int (*cpu_off)(u32 state);
 	int (*cpu_on)(unsigned long cpuid, unsigned long entry_point);

From 95e3de3590e3f2358bb13f013911bc1bfa5d3f53 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 2 Jan 2018 18:19:39 +0000
Subject: [PATCH 047/104] arm64: Move post_ttbr_update_workaround to C code

We will soon need to invoke a CPU-specific function pointer after changing
page tables, so move post_ttbr_update_workaround out into C code to make
this possible.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/assembler.h | 13 -------------
 arch/arm64/kernel/entry.S          |  2 +-
 arch/arm64/mm/context.c            |  9 +++++++++
 arch/arm64/mm/proc.S               |  3 +--
 4 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 215a49213507..a6f90b648655 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -492,19 +492,6 @@ alternative_endif
 	mrs	\rd, sp_el0
 	.endm
 
-/*
- * Errata workaround post TTBRx_EL1 update.
- */
-	.macro	post_ttbr_update_workaround
-#ifdef CONFIG_CAVIUM_ERRATUM_27456
-alternative_if ARM64_WORKAROUND_CAVIUM_27456
-	ic	iallu
-	dsb	nsh
-	isb
-alternative_else_nop_endif
-#endif
-	.endm
-
 /*
  * Arrange a physical address in a TTBR register, taking care of 52-bit
  * addresses.
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 6ceed4877daf..80b539845da6 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -277,7 +277,7 @@ alternative_else_nop_endif
 	 * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache
 	 * corruption).
 	 */
-	post_ttbr_update_workaround
+	bl	post_ttbr_update_workaround
 	.endif
 1:
 	.if	\el != 0
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 1fe71b9fcf35..511bd1e79b69 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -242,6 +242,15 @@ switch_mm_fastpath:
 		cpu_switch_mm(mm->pgd, mm);
 }
 
+/* Errata workaround post TTBRx_EL1 update. */
+asmlinkage void post_ttbr_update_workaround(void)
+{
+	asm(ALTERNATIVE("nop; nop; nop",
+			"ic iallu; dsb nsh; isb",
+			ARM64_WORKAROUND_CAVIUM_27456,
+			CONFIG_CAVIUM_ERRATUM_27456));
+}
+
 static int asids_init(void)
 {
 	asid_bits = get_cpu_asid_bits();
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index bc334588f234..bc86f7ef8620 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -146,8 +146,7 @@ ENTRY(cpu_do_switch_mm)
 	phys_to_ttbr x0, x2
 	msr	ttbr0_el1, x2			// now update TTBR0
 	isb
-	post_ttbr_update_workaround
-	ret
+	b	post_ttbr_update_workaround	// Back to C code...
 ENDPROC(cpu_do_switch_mm)
 
 	.pushsection ".idmap.text", "ax"

From 0f15adbb2861ce6f75ccfc5a92b19eae0ef327d0 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Wed, 3 Jan 2018 11:17:58 +0000
Subject: [PATCH 048/104] arm64: Add skeleton to harden the branch predictor
 against aliasing attacks

Aliasing attacks against CPU branch predictors can allow an attacker to
redirect speculative control flow on some CPUs and potentially divulge
information from one context to another.

This patch adds initial skeleton code behind a new Kconfig option to
enable implementation-specific mitigations against these attacks for
CPUs that are affected.

Co-developed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/Kconfig               | 17 ++++++++
 arch/arm64/include/asm/cpucaps.h |  3 +-
 arch/arm64/include/asm/mmu.h     | 37 ++++++++++++++++
 arch/arm64/include/asm/sysreg.h  |  1 +
 arch/arm64/kernel/Makefile       |  4 ++
 arch/arm64/kernel/bpi.S          | 55 ++++++++++++++++++++++++
 arch/arm64/kernel/cpu_errata.c   | 74 ++++++++++++++++++++++++++++++++
 arch/arm64/kernel/cpufeature.c   |  1 +
 arch/arm64/kernel/entry.S        |  7 ++-
 arch/arm64/mm/context.c          |  2 +
 arch/arm64/mm/fault.c            | 17 ++++++++
 11 files changed, 215 insertions(+), 3 deletions(-)
 create mode 100644 arch/arm64/kernel/bpi.S

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index cb7a70e686cb..664fadc2aa2e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -874,6 +874,23 @@ config UNMAP_KERNEL_AT_EL0
 
 	  If unsure, say Y.
 
+config HARDEN_BRANCH_PREDICTOR
+	bool "Harden the branch predictor against aliasing attacks" if EXPERT
+	default y
+	help
+	  Speculation attacks against some high-performance processors rely on
+	  being able to manipulate the branch predictor for a victim context by
+	  executing aliasing branches in the attacker context.  Such attacks
+	  can be partially mitigated against by clearing internal branch
+	  predictor state and limiting the prediction logic in some situations.
+
+	  This config option will take CPU-specific actions to harden the
+	  branch predictor against aliasing attacks and may rely on specific
+	  instruction sequences or control bits being set by the system
+	  firmware.
+
+	  If unsure, say Y.
+
 menuconfig ARMV8_DEPRECATED
 	bool "Emulate deprecated/obsolete ARMv8 instructions"
 	depends on COMPAT
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index b4537ffd1018..51616e77fe6b 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -42,7 +42,8 @@
 #define ARM64_HAS_DCPOP				21
 #define ARM64_SVE				22
 #define ARM64_UNMAP_KERNEL_AT_EL0		23
+#define ARM64_HARDEN_BRANCH_PREDICTOR		24
 
-#define ARM64_NCAPS				24
+#define ARM64_NCAPS				25
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 6f7bdb89817f..6dd83d75b82a 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -41,6 +41,43 @@ static inline bool arm64_kernel_unmapped_at_el0(void)
 	       cpus_have_const_cap(ARM64_UNMAP_KERNEL_AT_EL0);
 }
 
+typedef void (*bp_hardening_cb_t)(void);
+
+struct bp_hardening_data {
+	int			hyp_vectors_slot;
+	bp_hardening_cb_t	fn;
+};
+
+#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
+extern char __bp_harden_hyp_vecs_start[], __bp_harden_hyp_vecs_end[];
+
+DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
+
+static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
+{
+	return this_cpu_ptr(&bp_hardening_data);
+}
+
+static inline void arm64_apply_bp_hardening(void)
+{
+	struct bp_hardening_data *d;
+
+	if (!cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR))
+		return;
+
+	d = arm64_get_bp_hardening_data();
+	if (d->fn)
+		d->fn();
+}
+#else
+static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
+{
+	return NULL;
+}
+
+static inline void arm64_apply_bp_hardening(void)	{ }
+#endif	/* CONFIG_HARDEN_BRANCH_PREDICTOR */
+
 extern void paging_init(void);
 extern void bootmem_init(void);
 extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 262ae18f0e05..54e99af043c6 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -439,6 +439,7 @@
 
 /* id_aa64pfr0 */
 #define ID_AA64PFR0_CSV3_SHIFT		60
+#define ID_AA64PFR0_CSV2_SHIFT		56
 #define ID_AA64PFR0_SVE_SHIFT		32
 #define ID_AA64PFR0_GIC_SHIFT		24
 #define ID_AA64PFR0_ASIMD_SHIFT		20
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 067baace74a0..0c760db04858 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -53,6 +53,10 @@ arm64-obj-$(CONFIG_ARM64_RELOC_TEST)	+= arm64-reloc-test.o
 arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
 arm64-obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
 
+ifeq ($(CONFIG_KVM),y)
+arm64-obj-$(CONFIG_HARDEN_BRANCH_PREDICTOR)	+= bpi.o
+endif
+
 obj-y					+= $(arm64-obj-y) vdso/ probes/
 obj-m					+= $(arm64-obj-m)
 head-y					:= head.o
diff --git a/arch/arm64/kernel/bpi.S b/arch/arm64/kernel/bpi.S
new file mode 100644
index 000000000000..06a931eb2673
--- /dev/null
+++ b/arch/arm64/kernel/bpi.S
@@ -0,0 +1,55 @@
+/*
+ * Contains CPU specific branch predictor invalidation sequences
+ *
+ * Copyright (C) 2018 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/linkage.h>
+
+.macro ventry target
+	.rept 31
+	nop
+	.endr
+	b	\target
+.endm
+
+.macro vectors target
+	ventry \target + 0x000
+	ventry \target + 0x080
+	ventry \target + 0x100
+	ventry \target + 0x180
+
+	ventry \target + 0x200
+	ventry \target + 0x280
+	ventry \target + 0x300
+	ventry \target + 0x380
+
+	ventry \target + 0x400
+	ventry \target + 0x480
+	ventry \target + 0x500
+	ventry \target + 0x580
+
+	ventry \target + 0x600
+	ventry \target + 0x680
+	ventry \target + 0x700
+	ventry \target + 0x780
+.endm
+
+	.align	11
+ENTRY(__bp_harden_hyp_vecs_start)
+	.rept 4
+	vectors __kvm_hyp_vector
+	.endr
+ENTRY(__bp_harden_hyp_vecs_end)
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 0e27f86ee709..16ea5c6f314e 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -46,6 +46,80 @@ static int cpu_enable_trap_ctr_access(void *__unused)
 	return 0;
 }
 
+#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
+#include <asm/mmu_context.h>
+#include <asm/cacheflush.h>
+
+DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
+
+#ifdef CONFIG_KVM
+static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
+				const char *hyp_vecs_end)
+{
+	void *dst = lm_alias(__bp_harden_hyp_vecs_start + slot * SZ_2K);
+	int i;
+
+	for (i = 0; i < SZ_2K; i += 0x80)
+		memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start);
+
+	flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
+}
+
+static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
+				      const char *hyp_vecs_start,
+				      const char *hyp_vecs_end)
+{
+	static int last_slot = -1;
+	static DEFINE_SPINLOCK(bp_lock);
+	int cpu, slot = -1;
+
+	spin_lock(&bp_lock);
+	for_each_possible_cpu(cpu) {
+		if (per_cpu(bp_hardening_data.fn, cpu) == fn) {
+			slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu);
+			break;
+		}
+	}
+
+	if (slot == -1) {
+		last_slot++;
+		BUG_ON(((__bp_harden_hyp_vecs_end - __bp_harden_hyp_vecs_start)
+			/ SZ_2K) <= last_slot);
+		slot = last_slot;
+		__copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end);
+	}
+
+	__this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot);
+	__this_cpu_write(bp_hardening_data.fn, fn);
+	spin_unlock(&bp_lock);
+}
+#else
+static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
+				      const char *hyp_vecs_start,
+				      const char *hyp_vecs_end)
+{
+	__this_cpu_write(bp_hardening_data.fn, fn);
+}
+#endif	/* CONFIG_KVM */
+
+static void  install_bp_hardening_cb(const struct arm64_cpu_capabilities *entry,
+				     bp_hardening_cb_t fn,
+				     const char *hyp_vecs_start,
+				     const char *hyp_vecs_end)
+{
+	u64 pfr0;
+
+	if (!entry->matches(entry, SCOPE_LOCAL_CPU))
+		return;
+
+	pfr0 = read_cpuid(ID_AA64PFR0_EL1);
+	if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT))
+		return;
+
+	__install_bp_hardening_cb(fn, hyp_vecs_start, hyp_vecs_end);
+}
+#endif	/* CONFIG_HARDEN_BRANCH_PREDICTOR */
+
 #define MIDR_RANGE(model, min, max) \
 	.def_scope = SCOPE_LOCAL_CPU, \
 	.matches = is_affected_midr_range, \
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 991922b45d3d..da6722db50b0 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -147,6 +147,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {
 
 static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0),
 	S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI),
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 80b539845da6..07a7d4db8ec4 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -721,12 +721,15 @@ el0_ia:
 	 * Instruction abort handling
 	 */
 	mrs	x26, far_el1
-	enable_daif
+	enable_da_f
+#ifdef CONFIG_TRACE_IRQFLAGS
+	bl	trace_hardirqs_off
+#endif
 	ct_user_exit
 	mov	x0, x26
 	mov	x1, x25
 	mov	x2, sp
-	bl	do_mem_abort
+	bl	do_el0_ia_bp_hardening
 	b	ret_to_user
 el0_fpsimd_acc:
 	/*
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 511bd1e79b69..ff99a880a730 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -249,6 +249,8 @@ asmlinkage void post_ttbr_update_workaround(void)
 			"ic iallu; dsb nsh; isb",
 			ARM64_WORKAROUND_CAVIUM_27456,
 			CONFIG_CAVIUM_ERRATUM_27456));
+
+	arm64_apply_bp_hardening();
 }
 
 static int asids_init(void)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 22168cd0dde7..0e671ddf4855 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -708,6 +708,23 @@ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
 	arm64_notify_die("", regs, &info, esr);
 }
 
+asmlinkage void __exception do_el0_ia_bp_hardening(unsigned long addr,
+						   unsigned int esr,
+						   struct pt_regs *regs)
+{
+	/*
+	 * We've taken an instruction abort from userspace and not yet
+	 * re-enabled IRQs. If the address is a kernel address, apply
+	 * BP hardening prior to enabling IRQs and pre-emption.
+	 */
+	if (addr > TASK_SIZE)
+		arm64_apply_bp_hardening();
+
+	local_irq_enable();
+	do_mem_abort(addr, esr, regs);
+}
+
+
 asmlinkage void __exception do_sp_pc_abort(unsigned long addr,
 					   unsigned int esr,
 					   struct pt_regs *regs)

From 6840bdd73d07216ab4bc46f5a8768c37ea519038 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Wed, 3 Jan 2018 16:38:35 +0000
Subject: [PATCH 049/104] arm64: KVM: Use per-CPU vector when BP hardening is
 enabled

Now that we have per-CPU vectors, let's plug then in the KVM/arm64 code.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm/include/asm/kvm_mmu.h   | 10 +++++++++
 arch/arm64/include/asm/kvm_mmu.h | 38 ++++++++++++++++++++++++++++++++
 arch/arm64/kvm/hyp/switch.c      |  2 +-
 virt/kvm/arm/arm.c               |  8 ++++++-
 4 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 8c5643e2eea4..a2d176a308bd 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -226,6 +226,16 @@ static inline unsigned int kvm_get_vmid_bits(void)
 	return 8;
 }
 
+static inline void *kvm_get_hyp_vector(void)
+{
+	return kvm_ksym_ref(__kvm_hyp_vector);
+}
+
+static inline int kvm_map_vectors(void)
+{
+	return 0;
+}
+
 #define kvm_phys_to_vttbr(addr)		(addr)
 
 #endif	/* !__ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index b33bdb5eeb3d..72e279dbae5f 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -322,6 +322,44 @@ static inline unsigned int kvm_get_vmid_bits(void)
 	return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8;
 }
 
+#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
+#include <asm/mmu.h>
+
+static inline void *kvm_get_hyp_vector(void)
+{
+	struct bp_hardening_data *data = arm64_get_bp_hardening_data();
+	void *vect = kvm_ksym_ref(__kvm_hyp_vector);
+
+	if (data->fn) {
+		vect = __bp_harden_hyp_vecs_start +
+		       data->hyp_vectors_slot * SZ_2K;
+
+		if (!has_vhe())
+			vect = lm_alias(vect);
+	}
+
+	return vect;
+}
+
+static inline int kvm_map_vectors(void)
+{
+	return create_hyp_mappings(kvm_ksym_ref(__bp_harden_hyp_vecs_start),
+				   kvm_ksym_ref(__bp_harden_hyp_vecs_end),
+				   PAGE_HYP_EXEC);
+}
+
+#else
+static inline void *kvm_get_hyp_vector(void)
+{
+	return kvm_ksym_ref(__kvm_hyp_vector);
+}
+
+static inline int kvm_map_vectors(void)
+{
+	return 0;
+}
+#endif
+
 #define kvm_phys_to_vttbr(addr)		phys_to_ttbr(addr)
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index f7c651f3a8c0..8d4f3c9d6dc4 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -52,7 +52,7 @@ static void __hyp_text __activate_traps_vhe(void)
 	val &= ~(CPACR_EL1_FPEN | CPACR_EL1_ZEN);
 	write_sysreg(val, cpacr_el1);
 
-	write_sysreg(__kvm_hyp_vector, vbar_el1);
+	write_sysreg(kvm_get_hyp_vector(), vbar_el1);
 }
 
 static void __hyp_text __activate_traps_nvhe(void)
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index c8d49879307f..2df6a5c42f77 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -1158,7 +1158,7 @@ static void cpu_init_hyp_mode(void *dummy)
 	pgd_ptr = kvm_mmu_get_httbr();
 	stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
 	hyp_stack_ptr = stack_page + PAGE_SIZE;
-	vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector);
+	vector_ptr = (unsigned long)kvm_get_hyp_vector();
 
 	__cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
 	__cpu_init_stage2();
@@ -1403,6 +1403,12 @@ static int init_hyp_mode(void)
 		goto out_err;
 	}
 
+	err = kvm_map_vectors();
+	if (err) {
+		kvm_err("Cannot map vectors\n");
+		goto out_err;
+	}
+
 	/*
 	 * Map the Hyp stack pages
 	 */

From 90348689d500410ca7a55624c667f956771dce7f Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Wed, 3 Jan 2018 16:38:37 +0000
Subject: [PATCH 050/104] arm64: KVM: Make PSCI_VERSION a fast path

For those CPUs that require PSCI to perform a BP invalidation,
going all the way to the PSCI code for not much is a waste of
precious cycles. Let's terminate that call as early as possible.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kvm/hyp/switch.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 8d4f3c9d6dc4..4d273f6d0e69 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -17,6 +17,7 @@
 
 #include <linux/types.h>
 #include <linux/jump_label.h>
+#include <uapi/linux/psci.h>
 
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
@@ -341,6 +342,18 @@ again:
 	if (exit_code == ARM_EXCEPTION_TRAP && !__populate_fault_info(vcpu))
 		goto again;
 
+	if (exit_code == ARM_EXCEPTION_TRAP &&
+	    (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_HVC64 ||
+	     kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_HVC32) &&
+	    vcpu_get_reg(vcpu, 0) == PSCI_0_2_FN_PSCI_VERSION) {
+		u64 val = PSCI_RET_NOT_SUPPORTED;
+		if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features))
+			val = 2;
+
+		vcpu_set_reg(vcpu, 0, val);
+		goto again;
+	}
+
 	if (static_branch_unlikely(&vgic_v2_cpuif_trap) &&
 	    exit_code == ARM_EXCEPTION_TRAP) {
 		bool valid;

From a65d219fe5dc7887fd5ca04c2ac3e9a34feb8dfc Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Wed, 3 Jan 2018 11:19:34 +0000
Subject: [PATCH 051/104] arm64: cputype: Add missing MIDR values for
 Cortex-A72 and Cortex-A75

Hook up MIDR values for the Cortex-A72 and Cortex-A75 CPUs, since they
will soon need MIDR matches for hardening the branch predictor.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cputype.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 235e77d98261..84385b94e70b 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -79,8 +79,10 @@
 #define ARM_CPU_PART_AEM_V8		0xD0F
 #define ARM_CPU_PART_FOUNDATION		0xD00
 #define ARM_CPU_PART_CORTEX_A57		0xD07
+#define ARM_CPU_PART_CORTEX_A72		0xD08
 #define ARM_CPU_PART_CORTEX_A53		0xD03
 #define ARM_CPU_PART_CORTEX_A73		0xD09
+#define ARM_CPU_PART_CORTEX_A75		0xD0A
 
 #define APM_CPU_PART_POTENZA		0x000
 
@@ -94,7 +96,9 @@
 
 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
 #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
+#define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72)
 #define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73)
+#define MIDR_CORTEX_A75 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A75)
 #define MIDR_THUNDERX	MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
 #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
 #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)

From aa6acde65e03186b5add8151e1ffe36c3c62639b Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Wed, 3 Jan 2018 12:46:21 +0000
Subject: [PATCH 052/104] arm64: Implement branch predictor hardening for
 affected Cortex-A CPUs

Cortex-A57, A72, A73 and A75 are susceptible to branch predictor aliasing
and can theoretically be attacked by malicious code.

This patch implements a PSCI-based mitigation for these CPUs when available.
The call into firmware will invalidate the branch predictor state, preventing
any malicious entries from affecting other victim contexts.

Co-developed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/bpi.S        | 24 +++++++++++++++++++
 arch/arm64/kernel/cpu_errata.c | 42 ++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/arch/arm64/kernel/bpi.S b/arch/arm64/kernel/bpi.S
index 06a931eb2673..dec95bd82e31 100644
--- a/arch/arm64/kernel/bpi.S
+++ b/arch/arm64/kernel/bpi.S
@@ -53,3 +53,27 @@ ENTRY(__bp_harden_hyp_vecs_start)
 	vectors __kvm_hyp_vector
 	.endr
 ENTRY(__bp_harden_hyp_vecs_end)
+ENTRY(__psci_hyp_bp_inval_start)
+	sub	sp, sp, #(8 * 18)
+	stp	x16, x17, [sp, #(16 * 0)]
+	stp	x14, x15, [sp, #(16 * 1)]
+	stp	x12, x13, [sp, #(16 * 2)]
+	stp	x10, x11, [sp, #(16 * 3)]
+	stp	x8, x9, [sp, #(16 * 4)]
+	stp	x6, x7, [sp, #(16 * 5)]
+	stp	x4, x5, [sp, #(16 * 6)]
+	stp	x2, x3, [sp, #(16 * 7)]
+	stp	x0, x1, [sp, #(16 * 8)]
+	mov	x0, #0x84000000
+	smc	#0
+	ldp	x16, x17, [sp, #(16 * 0)]
+	ldp	x14, x15, [sp, #(16 * 1)]
+	ldp	x12, x13, [sp, #(16 * 2)]
+	ldp	x10, x11, [sp, #(16 * 3)]
+	ldp	x8, x9, [sp, #(16 * 4)]
+	ldp	x6, x7, [sp, #(16 * 5)]
+	ldp	x4, x5, [sp, #(16 * 6)]
+	ldp	x2, x3, [sp, #(16 * 7)]
+	ldp	x0, x1, [sp, #(16 * 8)]
+	add	sp, sp, #(8 * 18)
+ENTRY(__psci_hyp_bp_inval_end)
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 16ea5c6f314e..cb0fb3796bb8 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -53,6 +53,8 @@ static int cpu_enable_trap_ctr_access(void *__unused)
 DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
 
 #ifdef CONFIG_KVM
+extern char __psci_hyp_bp_inval_start[], __psci_hyp_bp_inval_end[];
+
 static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
 				const char *hyp_vecs_end)
 {
@@ -94,6 +96,9 @@ static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
 	spin_unlock(&bp_lock);
 }
 #else
+#define __psci_hyp_bp_inval_start	NULL
+#define __psci_hyp_bp_inval_end		NULL
+
 static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
 				      const char *hyp_vecs_start,
 				      const char *hyp_vecs_end)
@@ -118,6 +123,21 @@ static void  install_bp_hardening_cb(const struct arm64_cpu_capabilities *entry,
 
 	__install_bp_hardening_cb(fn, hyp_vecs_start, hyp_vecs_end);
 }
+
+#include <linux/psci.h>
+
+static int enable_psci_bp_hardening(void *data)
+{
+	const struct arm64_cpu_capabilities *entry = data;
+
+	if (psci_ops.get_version)
+		install_bp_hardening_cb(entry,
+				       (bp_hardening_cb_t)psci_ops.get_version,
+				       __psci_hyp_bp_inval_start,
+				       __psci_hyp_bp_inval_end);
+
+	return 0;
+}
 #endif	/* CONFIG_HARDEN_BRANCH_PREDICTOR */
 
 #define MIDR_RANGE(model, min, max) \
@@ -260,6 +280,28 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 		.capability = ARM64_WORKAROUND_858921,
 		MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
 	},
+#endif
+#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
+	{
+		.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+		MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
+		.enable = enable_psci_bp_hardening,
+	},
+	{
+		.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+		MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
+		.enable = enable_psci_bp_hardening,
+	},
+	{
+		.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+		MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
+		.enable = enable_psci_bp_hardening,
+	},
+	{
+		.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+		MIDR_ALL_VERSIONS(MIDR_CORTEX_A75),
+		.enable = enable_psci_bp_hardening,
+	},
 #endif
 	{
 	}

From ec82b567a74fbdffdf418d4bb381d55f6a9096af Mon Sep 17 00:00:00 2001
From: Shanker Donthineni <shankerd@codeaurora.org>
Date: Fri, 5 Jan 2018 14:28:59 -0600
Subject: [PATCH 053/104] arm64: Implement branch predictor hardening for
 Falkor

Falkor is susceptible to branch predictor aliasing and can
theoretically be attacked by malicious code. This patch
implements a mitigation for these attacks, preventing any
malicious entries from affecting other victim contexts.

Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org>
[will: fix label name when !CONFIG_KVM and remove references to MIDR_FALKOR]
Signed-off-by: Will Deacon <will.deacon@arm.com>

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpucaps.h |  3 ++-
 arch/arm64/include/asm/kvm_asm.h |  2 ++
 arch/arm64/kernel/bpi.S          |  8 +++++++
 arch/arm64/kernel/cpu_errata.c   | 40 ++++++++++++++++++++++++++++++--
 arch/arm64/kvm/hyp/entry.S       | 12 ++++++++++
 arch/arm64/kvm/hyp/switch.c      |  8 +++++++
 6 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 51616e77fe6b..7049b4802587 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -43,7 +43,8 @@
 #define ARM64_SVE				22
 #define ARM64_UNMAP_KERNEL_AT_EL0		23
 #define ARM64_HARDEN_BRANCH_PREDICTOR		24
+#define ARM64_HARDEN_BP_POST_GUEST_EXIT		25
 
-#define ARM64_NCAPS				25
+#define ARM64_NCAPS				26
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index ab4d0a926043..24961b732e65 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -68,6 +68,8 @@ extern u32 __kvm_get_mdcr_el2(void);
 
 extern u32 __init_stage2_translation(void);
 
+extern void __qcom_hyp_sanitize_btac_predictors(void);
+
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm64/kernel/bpi.S b/arch/arm64/kernel/bpi.S
index dec95bd82e31..76225c2611ea 100644
--- a/arch/arm64/kernel/bpi.S
+++ b/arch/arm64/kernel/bpi.S
@@ -77,3 +77,11 @@ ENTRY(__psci_hyp_bp_inval_start)
 	ldp	x0, x1, [sp, #(16 * 8)]
 	add	sp, sp, #(8 * 18)
 ENTRY(__psci_hyp_bp_inval_end)
+
+ENTRY(__qcom_hyp_sanitize_link_stack_start)
+	stp     x29, x30, [sp, #-16]!
+	.rept	16
+	bl	. + 4
+	.endr
+	ldp	x29, x30, [sp], #16
+ENTRY(__qcom_hyp_sanitize_link_stack_end)
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index cb0fb3796bb8..70e5f1826fd9 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -54,6 +54,8 @@ DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
 
 #ifdef CONFIG_KVM
 extern char __psci_hyp_bp_inval_start[], __psci_hyp_bp_inval_end[];
+extern char __qcom_hyp_sanitize_link_stack_start[];
+extern char __qcom_hyp_sanitize_link_stack_end[];
 
 static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
 				const char *hyp_vecs_end)
@@ -96,8 +98,10 @@ static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
 	spin_unlock(&bp_lock);
 }
 #else
-#define __psci_hyp_bp_inval_start	NULL
-#define __psci_hyp_bp_inval_end		NULL
+#define __psci_hyp_bp_inval_start		NULL
+#define __psci_hyp_bp_inval_end			NULL
+#define __qcom_hyp_sanitize_link_stack_start	NULL
+#define __qcom_hyp_sanitize_link_stack_end	NULL
 
 static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
 				      const char *hyp_vecs_start,
@@ -138,6 +142,29 @@ static int enable_psci_bp_hardening(void *data)
 
 	return 0;
 }
+
+static void qcom_link_stack_sanitization(void)
+{
+	u64 tmp;
+
+	asm volatile("mov	%0, x30		\n"
+		     ".rept	16		\n"
+		     "bl	. + 4		\n"
+		     ".endr			\n"
+		     "mov	x30, %0		\n"
+		     : "=&r" (tmp));
+}
+
+static int qcom_enable_link_stack_sanitization(void *data)
+{
+	const struct arm64_cpu_capabilities *entry = data;
+
+	install_bp_hardening_cb(entry, qcom_link_stack_sanitization,
+				__qcom_hyp_sanitize_link_stack_start,
+				__qcom_hyp_sanitize_link_stack_end);
+
+	return 0;
+}
 #endif	/* CONFIG_HARDEN_BRANCH_PREDICTOR */
 
 #define MIDR_RANGE(model, min, max) \
@@ -302,6 +329,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 		MIDR_ALL_VERSIONS(MIDR_CORTEX_A75),
 		.enable = enable_psci_bp_hardening,
 	},
+	{
+		.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+		MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR_V1),
+		.enable = qcom_enable_link_stack_sanitization,
+	},
+	{
+		.capability = ARM64_HARDEN_BP_POST_GUEST_EXIT,
+		MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR_V1),
+	},
 #endif
 	{
 	}
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 12ee62d6d410..9c45c6af1f58 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -196,3 +196,15 @@ alternative_endif
 
 	eret
 ENDPROC(__fpsimd_guest_restore)
+
+ENTRY(__qcom_hyp_sanitize_btac_predictors)
+	/**
+	 * Call SMC64 with Silicon provider serviceID 23<<8 (0xc2001700)
+	 * 0xC2000000-0xC200FFFF: assigned to SiP Service Calls
+	 * b15-b0: contains SiP functionID
+	 */
+	movz    x0, #0x1700
+	movk    x0, #0xc200, lsl #16
+	smc     #0
+	ret
+ENDPROC(__qcom_hyp_sanitize_btac_predictors)
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 4d273f6d0e69..170e1917f83c 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -406,6 +406,14 @@ again:
 		/* 0 falls through to be handled out of EL2 */
 	}
 
+	if (cpus_have_const_cap(ARM64_HARDEN_BP_POST_GUEST_EXIT)) {
+		u32 midr = read_cpuid_id();
+
+		/* Apply BTAC predictors mitigation to all Falkor chips */
+		if ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1)
+			__qcom_hyp_sanitize_btac_predictors();
+	}
+
 	fp_enabled = __fpsimd_enabled();
 
 	__sysreg_save_guest_state(guest_ctxt);

From 0d90718871fe80f019b7295ec9d2b23121e396fb Mon Sep 17 00:00:00 2001
From: Jayachandran C <jnair@caviumnetworks.com>
Date: Sun, 7 Jan 2018 22:53:35 -0800
Subject: [PATCH 054/104] arm64: cputype: Add MIDR values for Cavium ThunderX2
 CPUs

Add the older Broadcom ID as well as the new Cavium ID for ThunderX2
CPUs.

Signed-off-by: Jayachandran C <jnair@caviumnetworks.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cputype.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 84385b94e70b..cce5735a677c 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -89,6 +89,7 @@
 #define CAVIUM_CPU_PART_THUNDERX	0x0A1
 #define CAVIUM_CPU_PART_THUNDERX_81XX	0x0A2
 #define CAVIUM_CPU_PART_THUNDERX_83XX	0x0A3
+#define CAVIUM_CPU_PART_THUNDERX2	0x0AF
 
 #define BRCM_CPU_PART_VULCAN		0x516
 
@@ -102,6 +103,8 @@
 #define MIDR_THUNDERX	MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
 #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
 #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)
+#define MIDR_CAVIUM_THUNDERX2 MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX2)
+#define MIDR_BRCM_VULCAN MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_VULCAN)
 #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1)
 
 #ifndef __ASSEMBLY__

From 32b03d1059667a39e089c45ee38ec9c16332430f Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:03 +0000
Subject: [PATCH 055/104] KVM: arm64: Store vcpu on the stack during
 __guest_enter()

KVM uses tpidr_el2 as its private vcpu register, which makes sense for
non-vhe world switch as only KVM can access this register. This means
vhe Linux has to use tpidr_el1, which KVM has to save/restore as part
of the host context.

If the SDEI handler code runs behind KVMs back, it mustn't access any
per-cpu variables. To allow this on systems with vhe we need to make
the host use tpidr_el2, saving KVM from save/restoring it.

__guest_enter() stores the host_ctxt on the stack, do the same with
the vcpu.

Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Christoffer Dall <cdall@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kvm/hyp/entry.S     | 10 +++++++---
 arch/arm64/kvm/hyp/hyp-entry.S |  6 +++---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 9c45c6af1f58..fe4678f20a85 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -62,8 +62,8 @@ ENTRY(__guest_enter)
 	// Store the host regs
 	save_callee_saved_regs x1
 
-	// Store the host_ctxt for use at exit time
-	str	x1, [sp, #-16]!
+	// Store host_ctxt and vcpu for use at exit time
+	stp	x1, x0, [sp, #-16]!
 
 	add	x18, x0, #VCPU_CONTEXT
 
@@ -159,6 +159,10 @@ abort_guest_exit_end:
 ENDPROC(__guest_exit)
 
 ENTRY(__fpsimd_guest_restore)
+	// x0: esr
+	// x1: vcpu
+	// x2-x29,lr: vcpu regs
+	// vcpu x0-x1 on the stack
 	stp	x2, x3, [sp, #-16]!
 	stp	x4, lr, [sp, #-16]!
 
@@ -173,7 +177,7 @@ alternative_else
 alternative_endif
 	isb
 
-	mrs	x3, tpidr_el2
+	mov	x3, x1
 
 	ldr	x0, [x3, #VCPU_HOST_CONTEXT]
 	kern_hyp_va x0
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 5170ce1021da..fce7cc507e0a 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -104,6 +104,7 @@ el1_trap:
 	/*
 	 * x0: ESR_EC
 	 */
+	ldr	x1, [sp, #16 + 8]	// vcpu stored by __guest_enter
 
 	/*
 	 * We trap the first access to the FP/SIMD to save the host context
@@ -116,19 +117,18 @@ alternative_if_not ARM64_HAS_NO_FPSIMD
 	b.eq	__fpsimd_guest_restore
 alternative_else_nop_endif
 
-	mrs	x1, tpidr_el2
 	mov	x0, #ARM_EXCEPTION_TRAP
 	b	__guest_exit
 
 el1_irq:
 	stp     x0, x1, [sp, #-16]!
-	mrs	x1, tpidr_el2
+	ldr	x1, [sp, #16 + 8]
 	mov	x0, #ARM_EXCEPTION_IRQ
 	b	__guest_exit
 
 el1_error:
 	stp     x0, x1, [sp, #-16]!
-	mrs	x1, tpidr_el2
+	ldr	x1, [sp, #16 + 8]
 	mov	x0, #ARM_EXCEPTION_EL1_SERROR
 	b	__guest_exit
 

From 36989e7fd386a9a5822c48691473863f8fbb404d Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:04 +0000
Subject: [PATCH 056/104] KVM: arm/arm64: Convert kvm_host_cpu_state to a
 static per-cpu allocation

kvm_host_cpu_state is a per-cpu allocation made from kvm_arch_init()
used to store the host EL1 registers when KVM switches to a guest.

Make it easier for ASM to generate pointers into this per-cpu memory
by making it a static allocation.

Signed-off-by: James Morse <james.morse@arm.com>
Acked-by: Christoffer Dall <cdall@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 virt/kvm/arm/arm.c | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 2df6a5c42f77..2fc6009a766c 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -53,8 +53,8 @@
 __asm__(".arch_extension	virt");
 #endif
 
+DEFINE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state);
 static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
-static kvm_cpu_context_t __percpu *kvm_host_cpu_state;
 
 /* Per-CPU variable containing the currently running vcpu. */
 static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu);
@@ -354,7 +354,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	vcpu->cpu = cpu;
-	vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
+	vcpu->arch.host_cpu_context = this_cpu_ptr(&kvm_host_cpu_state);
 
 	kvm_arm_set_running_vcpu(vcpu);
 	kvm_vgic_load(vcpu);
@@ -1272,19 +1272,8 @@ static inline void hyp_cpu_pm_exit(void)
 }
 #endif
 
-static void teardown_common_resources(void)
-{
-	free_percpu(kvm_host_cpu_state);
-}
-
 static int init_common_resources(void)
 {
-	kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
-	if (!kvm_host_cpu_state) {
-		kvm_err("Cannot allocate host CPU state\n");
-		return -ENOMEM;
-	}
-
 	/* set size of VMID supported by CPU */
 	kvm_vmid_bits = kvm_get_vmid_bits();
 	kvm_info("%d-bit VMID\n", kvm_vmid_bits);
@@ -1426,7 +1415,7 @@ static int init_hyp_mode(void)
 	for_each_possible_cpu(cpu) {
 		kvm_cpu_context_t *cpu_ctxt;
 
-		cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu);
+		cpu_ctxt = per_cpu_ptr(&kvm_host_cpu_state, cpu);
 		err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP);
 
 		if (err) {
@@ -1550,7 +1539,6 @@ out_hyp:
 	if (!in_hyp_mode)
 		teardown_hyp_mode();
 out_err:
-	teardown_common_resources();
 	return err;
 }
 

From c97e166e54b662717d20ec2e36761758d2b6a7c2 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:05 +0000
Subject: [PATCH 057/104] KVM: arm64: Change hyp_panic()s dependency on
 tpidr_el2

Make tpidr_el2 a cpu-offset for per-cpu variables in the same way the
host uses tpidr_el1. This lets tpidr_el{1,2} have the same value, and
on VHE they can be the same register.

KVM calls hyp_panic() when anything unexpected happens. This may occur
while a guest owns the EL1 registers. KVM stashes the vcpu pointer in
tpidr_el2, which it uses to find the host context in order to restore
the host EL1 registers before parachuting into the host's panic().

The host context is a struct kvm_cpu_context allocated in the per-cpu
area, and mapped to hyp. Given the per-cpu offset for this CPU, this is
easy to find. Change hyp_panic() to take a pointer to the
struct kvm_cpu_context. Wrap these calls with an asm function that
retrieves the struct kvm_cpu_context from the host's per-cpu area.

Copy the per-cpu offset from the hosts tpidr_el1 into tpidr_el2 during
kvm init. (Later patches will make this unnecessary for VHE hosts)

We print out the vcpu pointer as part of the panic message. Add a back
reference to the 'running vcpu' in the host cpu context to preserve this.

Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Christoffer Dall <cdall@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/kvm_host.h |  2 ++
 arch/arm64/kvm/hyp/hyp-entry.S    | 12 ++++++++++++
 arch/arm64/kvm/hyp/s2-setup.c     |  3 +++
 arch/arm64/kvm/hyp/switch.c       | 25 +++++++++++++------------
 4 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index ea6cb5b24258..7ee72b402907 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -192,6 +192,8 @@ struct kvm_cpu_context {
 		u64 sys_regs[NR_SYS_REGS];
 		u32 copro[NR_COPRO_REGS];
 	};
+
+	struct kvm_vcpu *__hyp_running_vcpu;
 };
 
 typedef struct kvm_cpu_context kvm_cpu_context_t;
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index fce7cc507e0a..e4f37b9dd47c 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -163,6 +163,18 @@ ENTRY(__hyp_do_panic)
 	eret
 ENDPROC(__hyp_do_panic)
 
+ENTRY(__hyp_panic)
+	/*
+	 * '=kvm_host_cpu_state' is a host VA from the constant pool, it may
+	 * not be accessible by this address from EL2, hyp_panic() converts
+	 * it with kern_hyp_va() before use.
+	 */
+	ldr	x0, =kvm_host_cpu_state
+	mrs	x1, tpidr_el2
+	add	x0, x0, x1
+	b	hyp_panic
+ENDPROC(__hyp_panic)
+
 .macro invalid_vector	label, target = __hyp_panic
 	.align	2
 \label:
diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c
index 603e1ee83e89..74306c33a6de 100644
--- a/arch/arm64/kvm/hyp/s2-setup.c
+++ b/arch/arm64/kvm/hyp/s2-setup.c
@@ -86,5 +86,8 @@ u32 __hyp_text __init_stage2_translation(void)
 
 	write_sysreg(val, vtcr_el2);
 
+	/* copy tpidr_el1 into tpidr_el2 for use by HYP */
+	write_sysreg(read_sysreg(tpidr_el1), tpidr_el2);
+
 	return parange;
 }
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 170e1917f83c..324f4202cdd5 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -306,9 +306,9 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 	u64 exit_code;
 
 	vcpu = kern_hyp_va(vcpu);
-	write_sysreg(vcpu, tpidr_el2);
 
 	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+	host_ctxt->__hyp_running_vcpu = vcpu;
 	guest_ctxt = &vcpu->arch.ctxt;
 
 	__sysreg_save_host_state(host_ctxt);
@@ -443,7 +443,8 @@ again:
 
 static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
 
-static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par)
+static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par,
+					     struct kvm_vcpu *vcpu)
 {
 	unsigned long str_va;
 
@@ -457,35 +458,35 @@ static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par)
 	__hyp_do_panic(str_va,
 		       spsr,  elr,
 		       read_sysreg(esr_el2),   read_sysreg_el2(far),
-		       read_sysreg(hpfar_el2), par,
-		       (void *)read_sysreg(tpidr_el2));
+		       read_sysreg(hpfar_el2), par, vcpu);
 }
 
-static void __hyp_text __hyp_call_panic_vhe(u64 spsr, u64 elr, u64 par)
+static void __hyp_text __hyp_call_panic_vhe(u64 spsr, u64 elr, u64 par,
+					    struct kvm_vcpu *vcpu)
 {
 	panic(__hyp_panic_string,
 	      spsr,  elr,
 	      read_sysreg_el2(esr),   read_sysreg_el2(far),
-	      read_sysreg(hpfar_el2), par,
-	      (void *)read_sysreg(tpidr_el2));
+	      read_sysreg(hpfar_el2), par, vcpu);
 }
 
 static hyp_alternate_select(__hyp_call_panic,
 			    __hyp_call_panic_nvhe, __hyp_call_panic_vhe,
 			    ARM64_HAS_VIRT_HOST_EXTN);
 
-void __hyp_text __noreturn __hyp_panic(void)
+void __hyp_text __noreturn hyp_panic(struct kvm_cpu_context *__host_ctxt)
 {
+	struct kvm_vcpu *vcpu = NULL;
+
 	u64 spsr = read_sysreg_el2(spsr);
 	u64 elr = read_sysreg_el2(elr);
 	u64 par = read_sysreg(par_el1);
 
 	if (read_sysreg(vttbr_el2)) {
-		struct kvm_vcpu *vcpu;
 		struct kvm_cpu_context *host_ctxt;
 
-		vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2);
-		host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+		host_ctxt = kern_hyp_va(__host_ctxt);
+		vcpu = host_ctxt->__hyp_running_vcpu;
 		__timer_disable_traps(vcpu);
 		__deactivate_traps(vcpu);
 		__deactivate_vm(vcpu);
@@ -493,7 +494,7 @@ void __hyp_text __noreturn __hyp_panic(void)
 	}
 
 	/* Call panic for real */
-	__hyp_call_panic()(spsr, elr, par);
+	__hyp_call_panic()(spsr, elr, par, vcpu);
 
 	unreachable();
 }

From 6d99b68933fbcf51f84fcbba49246ce1209ec193 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:06 +0000
Subject: [PATCH 058/104] arm64: alternatives: use tpidr_el2 on VHE hosts

Now that KVM uses tpidr_el2 in the same way as Linux's cpu_offset in
tpidr_el1, merge the two. This saves KVM from save/restoring tpidr_el1
on VHE hosts, and allows future code to blindly access per-cpu variables
without triggering world-switch.

Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Christoffer Dall <cdall@linaro.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/alternative.h |  2 ++
 arch/arm64/include/asm/assembler.h   |  8 ++++++++
 arch/arm64/include/asm/percpu.h      | 11 +++++++++--
 arch/arm64/kernel/alternative.c      |  9 +++++----
 arch/arm64/kernel/cpufeature.c       | 17 +++++++++++++++++
 arch/arm64/mm/proc.S                 |  8 ++++++++
 6 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index 4a85c6952a22..669028172fd6 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -12,6 +12,8 @@
 #include <linux/stddef.h>
 #include <linux/stringify.h>
 
+extern int alternatives_applied;
+
 struct alt_instr {
 	s32 orig_offset;	/* offset to original instruction */
 	s32 alt_offset;		/* offset to replacement instruction */
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index a6f90b648655..5dc4856f3bb9 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -254,7 +254,11 @@ lr	.req	x30		// link register
 #else
 	adr_l	\dst, \sym
 #endif
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
 	mrs	\tmp, tpidr_el1
+alternative_else
+	mrs	\tmp, tpidr_el2
+alternative_endif
 	add	\dst, \dst, \tmp
 	.endm
 
@@ -265,7 +269,11 @@ lr	.req	x30		// link register
 	 */
 	.macro ldr_this_cpu dst, sym, tmp
 	adr_l	\dst, \sym
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
 	mrs	\tmp, tpidr_el1
+alternative_else
+	mrs	\tmp, tpidr_el2
+alternative_endif
 	ldr	\dst, [\dst, \tmp]
 	.endm
 
diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h
index 3bd498e4de4c..43393208229e 100644
--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -16,11 +16,15 @@
 #ifndef __ASM_PERCPU_H
 #define __ASM_PERCPU_H
 
+#include <asm/alternative.h>
 #include <asm/stack_pointer.h>
 
 static inline void set_my_cpu_offset(unsigned long off)
 {
-	asm volatile("msr tpidr_el1, %0" :: "r" (off) : "memory");
+	asm volatile(ALTERNATIVE("msr tpidr_el1, %0",
+				 "msr tpidr_el2, %0",
+				 ARM64_HAS_VIRT_HOST_EXTN)
+			:: "r" (off) : "memory");
 }
 
 static inline unsigned long __my_cpu_offset(void)
@@ -31,7 +35,10 @@ static inline unsigned long __my_cpu_offset(void)
 	 * We want to allow caching the value, so avoid using volatile and
 	 * instead use a fake stack read to hazard against barrier().
 	 */
-	asm("mrs %0, tpidr_el1" : "=r" (off) :
+	asm(ALTERNATIVE("mrs %0, tpidr_el1",
+			"mrs %0, tpidr_el2",
+			ARM64_HAS_VIRT_HOST_EXTN)
+		: "=r" (off) :
 		"Q" (*(const unsigned long *)current_stack_pointer));
 
 	return off;
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index 6dd0a3a3e5c9..414288a558c8 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -32,6 +32,8 @@
 #define ALT_ORIG_PTR(a)		__ALT_PTR(a, orig_offset)
 #define ALT_REPL_PTR(a)		__ALT_PTR(a, alt_offset)
 
+int alternatives_applied;
+
 struct alt_region {
 	struct alt_instr *begin;
 	struct alt_instr *end;
@@ -143,7 +145,6 @@ static void __apply_alternatives(void *alt_region, bool use_linear_alias)
  */
 static int __apply_alternatives_multi_stop(void *unused)
 {
-	static int patched = 0;
 	struct alt_region region = {
 		.begin	= (struct alt_instr *)__alt_instructions,
 		.end	= (struct alt_instr *)__alt_instructions_end,
@@ -151,14 +152,14 @@ static int __apply_alternatives_multi_stop(void *unused)
 
 	/* We always have a CPU 0 at this point (__init) */
 	if (smp_processor_id()) {
-		while (!READ_ONCE(patched))
+		while (!READ_ONCE(alternatives_applied))
 			cpu_relax();
 		isb();
 	} else {
-		BUG_ON(patched);
+		BUG_ON(alternatives_applied);
 		__apply_alternatives(&region, true);
 		/* Barriers provided by the cache flushing */
-		WRITE_ONCE(patched, 1);
+		WRITE_ONCE(alternatives_applied, 1);
 	}
 
 	return 0;
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index da6722db50b0..9ef84d0def9a 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -886,6 +886,22 @@ static int __init parse_kpti(char *str)
 __setup("kpti=", parse_kpti);
 #endif	/* CONFIG_UNMAP_KERNEL_AT_EL0 */
 
+static int cpu_copy_el2regs(void *__unused)
+{
+	/*
+	 * Copy register values that aren't redirected by hardware.
+	 *
+	 * Before code patching, we only set tpidr_el1, all CPUs need to copy
+	 * this value to tpidr_el2 before we patch the code. Once we've done
+	 * that, freshly-onlined CPUs will set tpidr_el2, so we don't need to
+	 * do anything here.
+	 */
+	if (!alternatives_applied)
+		write_sysreg(read_sysreg(tpidr_el1), tpidr_el2);
+
+	return 0;
+}
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
 	{
 		.desc = "GIC system register CPU interface",
@@ -955,6 +971,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.capability = ARM64_HAS_VIRT_HOST_EXTN,
 		.def_scope = SCOPE_SYSTEM,
 		.matches = runs_at_el2,
+		.enable = cpu_copy_el2regs,
 	},
 	{
 		.desc = "32-bit EL0 Support",
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index bc86f7ef8620..5a59eea49395 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -70,7 +70,11 @@ ENTRY(cpu_do_suspend)
 	mrs	x8, mdscr_el1
 	mrs	x9, oslsr_el1
 	mrs	x10, sctlr_el1
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
 	mrs	x11, tpidr_el1
+alternative_else
+	mrs	x11, tpidr_el2
+alternative_endif
 	mrs	x12, sp_el0
 	stp	x2, x3, [x0]
 	stp	x4, xzr, [x0, #16]
@@ -116,7 +120,11 @@ ENTRY(cpu_do_resume)
 	msr	mdscr_el1, x10
 
 	msr	sctlr_el1, x12
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
 	msr	tpidr_el1, x13
+alternative_else
+	msr	tpidr_el2, x13
+alternative_endif
 	msr	sp_el0, x14
 	/*
 	 * Restore oslsr_el1 by writing oslar_el1

From 1f742679c33bc083722cb0b442a95d458c491b56 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:07 +0000
Subject: [PATCH 059/104] KVM: arm64: Stop save/restoring host tpidr_el1 on VHE

Now that a VHE host uses tpidr_el2 for the cpu offset we no longer
need KVM to save/restore tpidr_el1. Move this from the 'common' code
into the non-vhe code. While we're at it, on VHE we don't need to
save the ELR or SPSR as kernel_entry in entry.S will have pushed these
onto the kernel stack, and will restore them from there. Move these
to the non-vhe code as we need them to get back to the host.

Finally remove the always-copy-tpidr we hid in the stage2 setup
code, cpufeature's enable callback will do this for VHE, we only
need KVM to do it for non-vhe. Add the copy into kvm-init instead.

Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Christoffer Dall <cdall@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kvm/hyp-init.S      |  4 ++++
 arch/arm64/kvm/hyp/s2-setup.c  |  3 ---
 arch/arm64/kvm/hyp/sysreg-sr.c | 16 ++++++++--------
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index 33c40b3eea01..8a00de187e56 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -120,6 +120,10 @@ CPU_BE(	orr	x4, x4, #SCTLR_ELx_EE)
 	kern_hyp_va	x2
 	msr	vbar_el2, x2
 
+	/* copy tpidr_el1 into tpidr_el2 for use by HYP */
+	mrs	x1, tpidr_el1
+	msr	tpidr_el2, x1
+
 	/* Hello, World! */
 	eret
 ENDPROC(__kvm_hyp_init)
diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c
index 74306c33a6de..603e1ee83e89 100644
--- a/arch/arm64/kvm/hyp/s2-setup.c
+++ b/arch/arm64/kvm/hyp/s2-setup.c
@@ -86,8 +86,5 @@ u32 __hyp_text __init_stage2_translation(void)
 
 	write_sysreg(val, vtcr_el2);
 
-	/* copy tpidr_el1 into tpidr_el2 for use by HYP */
-	write_sysreg(read_sysreg(tpidr_el1), tpidr_el2);
-
 	return parange;
 }
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index 934137647837..c54cc2afb92b 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -27,8 +27,8 @@ static void __hyp_text __sysreg_do_nothing(struct kvm_cpu_context *ctxt) { }
 /*
  * Non-VHE: Both host and guest must save everything.
  *
- * VHE: Host must save tpidr*_el[01], actlr_el1, mdscr_el1, sp0, pc,
- * pstate, and guest must save everything.
+ * VHE: Host must save tpidr*_el0, actlr_el1, mdscr_el1, sp_el0,
+ * and guest must save everything.
  */
 
 static void __hyp_text __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
@@ -36,11 +36,8 @@ static void __hyp_text __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
 	ctxt->sys_regs[ACTLR_EL1]	= read_sysreg(actlr_el1);
 	ctxt->sys_regs[TPIDR_EL0]	= read_sysreg(tpidr_el0);
 	ctxt->sys_regs[TPIDRRO_EL0]	= read_sysreg(tpidrro_el0);
-	ctxt->sys_regs[TPIDR_EL1]	= read_sysreg(tpidr_el1);
 	ctxt->sys_regs[MDSCR_EL1]	= read_sysreg(mdscr_el1);
 	ctxt->gp_regs.regs.sp		= read_sysreg(sp_el0);
-	ctxt->gp_regs.regs.pc		= read_sysreg_el2(elr);
-	ctxt->gp_regs.regs.pstate	= read_sysreg_el2(spsr);
 }
 
 static void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt)
@@ -62,10 +59,13 @@ static void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt)
 	ctxt->sys_regs[AMAIR_EL1]	= read_sysreg_el1(amair);
 	ctxt->sys_regs[CNTKCTL_EL1]	= read_sysreg_el1(cntkctl);
 	ctxt->sys_regs[PAR_EL1]		= read_sysreg(par_el1);
+	ctxt->sys_regs[TPIDR_EL1]	= read_sysreg(tpidr_el1);
 
 	ctxt->gp_regs.sp_el1		= read_sysreg(sp_el1);
 	ctxt->gp_regs.elr_el1		= read_sysreg_el1(elr);
 	ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(spsr);
+	ctxt->gp_regs.regs.pc		= read_sysreg_el2(elr);
+	ctxt->gp_regs.regs.pstate	= read_sysreg_el2(spsr);
 }
 
 static hyp_alternate_select(__sysreg_call_save_host_state,
@@ -89,11 +89,8 @@ static void __hyp_text __sysreg_restore_common_state(struct kvm_cpu_context *ctx
 	write_sysreg(ctxt->sys_regs[ACTLR_EL1],	  actlr_el1);
 	write_sysreg(ctxt->sys_regs[TPIDR_EL0],	  tpidr_el0);
 	write_sysreg(ctxt->sys_regs[TPIDRRO_EL0], tpidrro_el0);
-	write_sysreg(ctxt->sys_regs[TPIDR_EL1],	  tpidr_el1);
 	write_sysreg(ctxt->sys_regs[MDSCR_EL1],	  mdscr_el1);
 	write_sysreg(ctxt->gp_regs.regs.sp,	  sp_el0);
-	write_sysreg_el2(ctxt->gp_regs.regs.pc,	  elr);
-	write_sysreg_el2(ctxt->gp_regs.regs.pstate, spsr);
 }
 
 static void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt)
@@ -115,10 +112,13 @@ static void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt)
 	write_sysreg_el1(ctxt->sys_regs[AMAIR_EL1],	amair);
 	write_sysreg_el1(ctxt->sys_regs[CNTKCTL_EL1], 	cntkctl);
 	write_sysreg(ctxt->sys_regs[PAR_EL1],		par_el1);
+	write_sysreg(ctxt->sys_regs[TPIDR_EL1],		tpidr_el1);
 
 	write_sysreg(ctxt->gp_regs.sp_el1,		sp_el1);
 	write_sysreg_el1(ctxt->gp_regs.elr_el1,		elr);
 	write_sysreg_el1(ctxt->gp_regs.spsr[KVM_SPSR_EL1],spsr);
+	write_sysreg_el2(ctxt->gp_regs.regs.pc,		elr);
+	write_sysreg_el2(ctxt->gp_regs.regs.pstate,	spsr);
 }
 
 static hyp_alternate_select(__sysreg_call_restore_host_state,

From 86f04f640058143388f56c048f86e66ea5204ae2 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:08 +0000
Subject: [PATCH 060/104] Docs: dt: add devicetree binding for describing arm64
 SDEI firmware

The Software Delegated Exception Interface (SDEI) is an ARM standard
for registering callbacks from the platform firmware into the OS.
This is typically used to implement RAS notifications, or from an
IRQ that has been promoted to a firmware-assisted NMI.

Add a new devicetree binding to describe the SDE firmware interface.

Signed-off-by: James Morse <james.morse@arm.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 .../devicetree/bindings/arm/firmware/sdei.txt | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/arm/firmware/sdei.txt

diff --git a/Documentation/devicetree/bindings/arm/firmware/sdei.txt b/Documentation/devicetree/bindings/arm/firmware/sdei.txt
new file mode 100644
index 000000000000..ee3f0ff49889
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/firmware/sdei.txt
@@ -0,0 +1,42 @@
+* Software Delegated Exception Interface (SDEI)
+
+Firmware implementing the SDEI functions described in ARM document number
+ARM DEN 0054A ("Software Delegated Exception Interface") can be used by
+Linux to receive notification of events such as those generated by
+firmware-first error handling, or from an IRQ that has been promoted to
+a firmware-assisted NMI.
+
+The interface provides a number of API functions for registering callbacks
+and enabling/disabling events. Functions are invoked by trapping to the
+privilege level of the SDEI firmware (specified as part of the binding
+below) and passing arguments in a manner specified by the "SMC Calling
+Convention (ARM DEN 0028B):
+
+	 r0		=> 32-bit Function ID / return value
+	{r1 - r3}	=> Parameters
+
+Note that the immediate field of the trapping instruction must be set
+to #0.
+
+The SDEI_EVENT_REGISTER function registers a callback in the kernel
+text to handle the specified event number.
+
+The sdei node should be a child node of '/firmware' and have required
+properties:
+
+ - compatible    : should contain:
+	* "arm,sdei-1.0" : For implementations complying to SDEI version 1.x.
+
+ - method        : The method of calling the SDEI firmware. Permitted
+                   values are:
+	* "smc" : SMC #0, with the register assignments specified in this
+	          binding.
+	* "hvc" : HVC #0, with the register assignments specified in this
+	          binding.
+Example:
+	firmware {
+		sdei {
+			compatible	= "arm,sdei-1.0";
+			method		= "smc";
+		};
+	};

From ad6eb31ef90355993eb55ff77e0e855ae7d91e4c Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:09 +0000
Subject: [PATCH 061/104] firmware: arm_sdei: Add driver for Software Delegated
 Exceptions

The Software Delegated Exception Interface (SDEI) is an ARM standard
for registering callbacks from the platform firmware into the OS.
This is typically used to implement firmware notifications (such as
firmware-first RAS) or promote an IRQ that has been promoted to a
firmware-assisted NMI.

Add the code for detecting the SDEI version and the framework for
registering and unregistering events. Subsequent patches will add the
arch-specific backend code and the necessary power management hooks.

Only shared events are supported, power management, private events and
discovery for ACPI systems will be added by later patches.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 MAINTAINERS                   |   9 +
 arch/arm64/include/asm/sdei.h |   8 +
 drivers/firmware/Kconfig      |   8 +
 drivers/firmware/Makefile     |   1 +
 drivers/firmware/arm_sdei.c   | 619 ++++++++++++++++++++++++++++++++++
 include/linux/arm_sdei.h      |  79 +++++
 include/uapi/linux/arm_sdei.h |  73 ++++
 7 files changed, 797 insertions(+)
 create mode 100644 arch/arm64/include/asm/sdei.h
 create mode 100644 drivers/firmware/arm_sdei.c
 create mode 100644 include/linux/arm_sdei.h
 create mode 100644 include/uapi/linux/arm_sdei.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 82ad0eabce4f..c06885c4c0f1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12618,6 +12618,15 @@ L:	linux-media@vger.kernel.org
 S:	Supported
 F:	drivers/media/pci/solo6x10/
 
+SOFTWARE DELEGATED EXCEPTION INTERFACE (SDEI)
+M:	James Morse <james.morse@arm.com>
+L:	linux-arm-kernel@lists.infradead.org
+S:	Maintained
+F:	Documentation/devicetree/bindings/arm/firmware/sdei.txt
+F:	drivers/firmware/arm_sdei.c
+F:	include/linux/sdei.h
+F:	include/uapi/linux/sdei.h
+
 SOFTWARE RAID (Multiple Disks) SUPPORT
 M:	Shaohua Li <shli@kernel.org>
 L:	linux-raid@vger.kernel.org
diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h
new file mode 100644
index 000000000000..59f26b6e673d
--- /dev/null
+++ b/arch/arm64/include/asm/sdei.h
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2017 Arm Ltd.
+#ifndef __ASM_SDEI_H
+#define __ASM_SDEI_H
+
+/* Later patches add the arch specific bits */
+
+#endif /* __ASM_SDEI_H */
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index fa87a055905e..e77f77caa0f3 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -48,6 +48,14 @@ config ARM_SCPI_POWER_DOMAIN
 	  This enables support for the SCPI power domains which can be
 	  enabled or disabled via the SCP firmware
 
+config ARM_SDE_INTERFACE
+	bool "ARM Software Delegated Exception Interface (SDEI)"
+	depends on ARM64
+	help
+	  The Software Delegated Exception Interface (SDEI) is an ARM
+	  standard for registering callbacks from the platform firmware
+	  into the OS. This is typically used to implement RAS notifications.
+
 config EDD
 	tristate "BIOS Enhanced Disk Drive calls determine boot disk"
 	depends on X86
diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile
index feaa890197f3..b248238ddc6a 100644
--- a/drivers/firmware/Makefile
+++ b/drivers/firmware/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_ARM_PSCI_FW)	+= psci.o
 obj-$(CONFIG_ARM_PSCI_CHECKER)	+= psci_checker.o
 obj-$(CONFIG_ARM_SCPI_PROTOCOL)	+= arm_scpi.o
 obj-$(CONFIG_ARM_SCPI_POWER_DOMAIN) += scpi_pm_domain.o
+obj-$(CONFIG_ARM_SDE_INTERFACE)	+= arm_sdei.o
 obj-$(CONFIG_DMI)		+= dmi_scan.o
 obj-$(CONFIG_DMI_SYSFS)		+= dmi-sysfs.o
 obj-$(CONFIG_EDD)		+= edd.o
diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c
new file mode 100644
index 000000000000..8da173cc7e43
--- /dev/null
+++ b/drivers/firmware/arm_sdei.c
@@ -0,0 +1,619 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2017 Arm Ltd.
+#define pr_fmt(fmt) "sdei: " fmt
+
+#include <linux/acpi.h>
+#include <linux/arm_sdei.h>
+#include <linux/arm-smccc.h>
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/kvm_host.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/percpu.h>
+#include <linux/platform_device.h>
+#include <linux/ptrace.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+
+/*
+ * The call to use to reach the firmware.
+ */
+static asmlinkage void (*sdei_firmware_call)(unsigned long function_id,
+		      unsigned long arg0, unsigned long arg1,
+		      unsigned long arg2, unsigned long arg3,
+		      unsigned long arg4, struct arm_smccc_res *res);
+
+/* entry point from firmware to arch asm code */
+static unsigned long sdei_entry_point;
+
+struct sdei_event {
+	struct list_head	list;
+	u32			event_num;
+	u8			type;
+	u8			priority;
+
+	/* This pointer is handed to firmware as the event argument. */
+	struct sdei_registered_event *registered;
+};
+
+/* Take the mutex for any API call or modification. Take the mutex first. */
+static DEFINE_MUTEX(sdei_events_lock);
+
+/* and then hold this when modifying the list */
+static DEFINE_SPINLOCK(sdei_list_lock);
+static LIST_HEAD(sdei_list);
+
+static int sdei_to_linux_errno(unsigned long sdei_err)
+{
+	switch (sdei_err) {
+	case SDEI_NOT_SUPPORTED:
+		return -EOPNOTSUPP;
+	case SDEI_INVALID_PARAMETERS:
+		return -EINVAL;
+	case SDEI_DENIED:
+		return -EPERM;
+	case SDEI_PENDING:
+		return -EINPROGRESS;
+	case SDEI_OUT_OF_RESOURCE:
+		return -ENOMEM;
+	}
+
+	/* Not an error value ... */
+	return sdei_err;
+}
+
+/*
+ * If x0 is any of these values, then the call failed, use sdei_to_linux_errno()
+ * to translate.
+ */
+static int sdei_is_err(struct arm_smccc_res *res)
+{
+	switch (res->a0) {
+	case SDEI_NOT_SUPPORTED:
+	case SDEI_INVALID_PARAMETERS:
+	case SDEI_DENIED:
+	case SDEI_PENDING:
+	case SDEI_OUT_OF_RESOURCE:
+		return true;
+	}
+
+	return false;
+}
+
+static int invoke_sdei_fn(unsigned long function_id, unsigned long arg0,
+			  unsigned long arg1, unsigned long arg2,
+			  unsigned long arg3, unsigned long arg4,
+			  u64 *result)
+{
+	int err = 0;
+	struct arm_smccc_res res;
+
+	if (sdei_firmware_call) {
+		sdei_firmware_call(function_id, arg0, arg1, arg2, arg3, arg4,
+				   &res);
+		if (sdei_is_err(&res))
+			err = sdei_to_linux_errno(res.a0);
+	} else {
+		/*
+		 * !sdei_firmware_call means we failed to probe or called
+		 * sdei_mark_interface_broken(). -EIO is not an error returned
+		 * by sdei_to_linux_errno() and is used to suppress messages
+		 * from this driver.
+		 */
+		err = -EIO;
+		res.a0 = SDEI_NOT_SUPPORTED;
+	}
+
+	if (result)
+		*result = res.a0;
+
+	return err;
+}
+
+static struct sdei_event *sdei_event_find(u32 event_num)
+{
+	struct sdei_event *e, *found = NULL;
+
+	lockdep_assert_held(&sdei_events_lock);
+
+	spin_lock(&sdei_list_lock);
+	list_for_each_entry(e, &sdei_list, list) {
+		if (e->event_num == event_num) {
+			found = e;
+			break;
+		}
+	}
+	spin_unlock(&sdei_list_lock);
+
+	return found;
+}
+
+int sdei_api_event_context(u32 query, u64 *result)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_CONTEXT, query, 0, 0, 0, 0,
+			      result);
+}
+NOKPROBE_SYMBOL(sdei_api_event_context);
+
+static int sdei_api_event_get_info(u32 event, u32 info, u64 *result)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_GET_INFO, event, info, 0,
+			      0, 0, result);
+}
+
+static struct sdei_event *sdei_event_create(u32 event_num,
+					    sdei_event_callback *cb,
+					    void *cb_arg)
+{
+	int err;
+	u64 result;
+	struct sdei_event *event;
+	struct sdei_registered_event *reg;
+
+	lockdep_assert_held(&sdei_events_lock);
+
+	event = kzalloc(sizeof(*event), GFP_KERNEL);
+	if (!event)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&event->list);
+	event->event_num = event_num;
+
+	err = sdei_api_event_get_info(event_num, SDEI_EVENT_INFO_EV_PRIORITY,
+				      &result);
+	if (err) {
+		kfree(event);
+		return ERR_PTR(err);
+	}
+	event->priority = result;
+
+	err = sdei_api_event_get_info(event_num, SDEI_EVENT_INFO_EV_TYPE,
+				      &result);
+	if (err) {
+		kfree(event);
+		return ERR_PTR(err);
+	}
+	event->type = result;
+
+	if (event->type == SDEI_EVENT_TYPE_SHARED) {
+		reg = kzalloc(sizeof(*reg), GFP_KERNEL);
+		if (!reg) {
+			kfree(event);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		reg->event_num = event_num;
+		reg->priority = event->priority;
+
+		reg->callback = cb;
+		reg->callback_arg = cb_arg;
+		event->registered = reg;
+	}
+
+	if (sdei_event_find(event_num)) {
+		kfree(event->registered);
+		kfree(event);
+		event = ERR_PTR(-EBUSY);
+	} else {
+		spin_lock(&sdei_list_lock);
+		list_add(&event->list, &sdei_list);
+		spin_unlock(&sdei_list_lock);
+	}
+
+	return event;
+}
+
+static void sdei_event_destroy(struct sdei_event *event)
+{
+	lockdep_assert_held(&sdei_events_lock);
+
+	spin_lock(&sdei_list_lock);
+	list_del(&event->list);
+	spin_unlock(&sdei_list_lock);
+
+	if (event->type == SDEI_EVENT_TYPE_SHARED)
+		kfree(event->registered);
+
+	kfree(event);
+}
+
+static int sdei_api_get_version(u64 *version)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_VERSION, 0, 0, 0, 0, 0, version);
+}
+
+int sdei_mask_local_cpu(void)
+{
+	int err;
+
+	WARN_ON_ONCE(preemptible());
+
+	err = invoke_sdei_fn(SDEI_1_0_FN_SDEI_PE_MASK, 0, 0, 0, 0, 0, NULL);
+	if (err && err != -EIO) {
+		pr_warn_once("failed to mask CPU[%u]: %d\n",
+			      smp_processor_id(), err);
+		return err;
+	}
+
+	return 0;
+}
+
+static void _ipi_mask_cpu(void *ignored)
+{
+	sdei_mask_local_cpu();
+}
+
+int sdei_unmask_local_cpu(void)
+{
+	int err;
+
+	WARN_ON_ONCE(preemptible());
+
+	err = invoke_sdei_fn(SDEI_1_0_FN_SDEI_PE_UNMASK, 0, 0, 0, 0, 0, NULL);
+	if (err && err != -EIO) {
+		pr_warn_once("failed to unmask CPU[%u]: %d\n",
+			     smp_processor_id(), err);
+		return err;
+	}
+
+	return 0;
+}
+
+static void _ipi_unmask_cpu(void *ignored)
+{
+	sdei_unmask_local_cpu();
+}
+
+static void _ipi_private_reset(void *ignored)
+{
+	int err;
+
+	err = invoke_sdei_fn(SDEI_1_0_FN_SDEI_PRIVATE_RESET, 0, 0, 0, 0, 0,
+			     NULL);
+	if (err && err != -EIO)
+		pr_warn_once("failed to reset CPU[%u]: %d\n",
+			     smp_processor_id(), err);
+}
+
+static int sdei_api_shared_reset(void)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_SHARED_RESET, 0, 0, 0, 0, 0,
+			      NULL);
+}
+
+static void sdei_mark_interface_broken(void)
+{
+	pr_err("disabling SDEI firmware interface\n");
+	on_each_cpu(&_ipi_mask_cpu, NULL, true);
+	sdei_firmware_call = NULL;
+}
+
+static int sdei_platform_reset(void)
+{
+	int err;
+
+	on_each_cpu(&_ipi_private_reset, NULL, true);
+	err = sdei_api_shared_reset();
+	if (err) {
+		pr_err("Failed to reset platform: %d\n", err);
+		sdei_mark_interface_broken();
+	}
+
+	return err;
+}
+
+static int sdei_api_event_enable(u32 event_num)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_ENABLE, event_num, 0, 0, 0,
+			      0, NULL);
+}
+
+int sdei_event_enable(u32 event_num)
+{
+	int err = -EINVAL;
+	struct sdei_event *event;
+
+	mutex_lock(&sdei_events_lock);
+	event = sdei_event_find(event_num);
+	if (!event) {
+		mutex_unlock(&sdei_events_lock);
+		return -ENOENT;
+	}
+
+	if (event->type == SDEI_EVENT_TYPE_SHARED)
+		err = sdei_api_event_enable(event->event_num);
+	mutex_unlock(&sdei_events_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(sdei_event_enable);
+
+static int sdei_api_event_disable(u32 event_num)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_DISABLE, event_num, 0, 0,
+			      0, 0, NULL);
+}
+
+int sdei_event_disable(u32 event_num)
+{
+	int err = -EINVAL;
+	struct sdei_event *event;
+
+	mutex_lock(&sdei_events_lock);
+	event = sdei_event_find(event_num);
+	if (!event) {
+		mutex_unlock(&sdei_events_lock);
+		return -ENOENT;
+	}
+
+	if (event->type == SDEI_EVENT_TYPE_SHARED)
+		err = sdei_api_event_disable(event->event_num);
+	mutex_unlock(&sdei_events_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(sdei_event_disable);
+
+static int sdei_api_event_unregister(u32 event_num)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_UNREGISTER, event_num, 0,
+			      0, 0, 0, NULL);
+}
+
+static int _sdei_event_unregister(struct sdei_event *event)
+{
+	lockdep_assert_held(&sdei_events_lock);
+
+	if (event->type == SDEI_EVENT_TYPE_SHARED)
+		return sdei_api_event_unregister(event->event_num);
+
+	return -EINVAL;
+}
+
+int sdei_event_unregister(u32 event_num)
+{
+	int err;
+	struct sdei_event *event;
+
+	WARN_ON(in_nmi());
+
+	mutex_lock(&sdei_events_lock);
+	event = sdei_event_find(event_num);
+	do {
+		if (!event) {
+			pr_warn("Event %u not registered\n", event_num);
+			err = -ENOENT;
+			break;
+		}
+
+		err = _sdei_event_unregister(event);
+		if (err)
+			break;
+
+		sdei_event_destroy(event);
+	} while (0);
+	mutex_unlock(&sdei_events_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(sdei_event_unregister);
+
+static int sdei_api_event_register(u32 event_num, unsigned long entry_point,
+				   void *arg, u64 flags, u64 affinity)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_REGISTER, event_num,
+			      (unsigned long)entry_point, (unsigned long)arg,
+			      flags, affinity, NULL);
+}
+
+static int _sdei_event_register(struct sdei_event *event)
+{
+	lockdep_assert_held(&sdei_events_lock);
+
+	if (event->type == SDEI_EVENT_TYPE_SHARED)
+		return sdei_api_event_register(event->event_num,
+					       sdei_entry_point,
+					       event->registered,
+					       SDEI_EVENT_REGISTER_RM_ANY, 0);
+
+	return -EINVAL;
+}
+
+int sdei_event_register(u32 event_num, sdei_event_callback *cb, void *arg)
+{
+	int err;
+	struct sdei_event *event;
+
+	WARN_ON(in_nmi());
+
+	mutex_lock(&sdei_events_lock);
+	do {
+		if (sdei_event_find(event_num)) {
+			pr_warn("Event %u already registered\n", event_num);
+			err = -EBUSY;
+			break;
+		}
+
+		event = sdei_event_create(event_num, cb, arg);
+		if (IS_ERR(event)) {
+			err = PTR_ERR(event);
+			pr_warn("Failed to create event %u: %d\n", event_num,
+				err);
+			break;
+		}
+
+		err = _sdei_event_register(event);
+		if (err) {
+			sdei_event_destroy(event);
+			pr_warn("Failed to register event %u: %d\n", event_num,
+				err);
+		}
+	} while (0);
+	mutex_unlock(&sdei_events_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(sdei_event_register);
+
+static void sdei_smccc_smc(unsigned long function_id,
+			   unsigned long arg0, unsigned long arg1,
+			   unsigned long arg2, unsigned long arg3,
+			   unsigned long arg4, struct arm_smccc_res *res)
+{
+	arm_smccc_smc(function_id, arg0, arg1, arg2, arg3, arg4, 0, 0, res);
+}
+
+static void sdei_smccc_hvc(unsigned long function_id,
+			   unsigned long arg0, unsigned long arg1,
+			   unsigned long arg2, unsigned long arg3,
+			   unsigned long arg4, struct arm_smccc_res *res)
+{
+	arm_smccc_hvc(function_id, arg0, arg1, arg2, arg3, arg4, 0, 0, res);
+}
+
+static int sdei_get_conduit(struct platform_device *pdev)
+{
+	const char *method;
+	struct device_node *np = pdev->dev.of_node;
+
+	sdei_firmware_call = NULL;
+	if (np) {
+		if (of_property_read_string(np, "method", &method)) {
+			pr_warn("missing \"method\" property\n");
+			return CONDUIT_INVALID;
+		}
+
+		if (!strcmp("hvc", method)) {
+			sdei_firmware_call = &sdei_smccc_hvc;
+			return CONDUIT_HVC;
+		} else if (!strcmp("smc", method)) {
+			sdei_firmware_call = &sdei_smccc_smc;
+			return CONDUIT_SMC;
+		}
+
+		pr_warn("invalid \"method\" property: %s\n", method);
+	}
+
+	return CONDUIT_INVALID;
+}
+
+static int sdei_probe(struct platform_device *pdev)
+{
+	int err;
+	u64 ver = 0;
+	int conduit;
+
+	conduit = sdei_get_conduit(pdev);
+	if (!sdei_firmware_call)
+		return 0;
+
+	err = sdei_api_get_version(&ver);
+	if (err == -EOPNOTSUPP)
+		pr_err("advertised but not implemented in platform firmware\n");
+	if (err) {
+		pr_err("Failed to get SDEI version: %d\n", err);
+		sdei_mark_interface_broken();
+		return err;
+	}
+
+	pr_info("SDEIv%d.%d (0x%x) detected in firmware.\n",
+		(int)SDEI_VERSION_MAJOR(ver), (int)SDEI_VERSION_MINOR(ver),
+		(int)SDEI_VERSION_VENDOR(ver));
+
+	if (SDEI_VERSION_MAJOR(ver) != 1) {
+		pr_warn("Conflicting SDEI version detected.\n");
+		sdei_mark_interface_broken();
+		return -EINVAL;
+	}
+
+	err = sdei_platform_reset();
+	if (err)
+		return err;
+
+	sdei_entry_point = sdei_arch_get_entry_point(conduit);
+	if (!sdei_entry_point) {
+		/* Not supported due to hardware or boot configuration */
+		sdei_mark_interface_broken();
+		return 0;
+	}
+
+	on_each_cpu(&_ipi_unmask_cpu, NULL, false);
+
+	return 0;
+}
+
+static const struct of_device_id sdei_of_match[] = {
+	{ .compatible = "arm,sdei-1.0" },
+	{}
+};
+
+static struct platform_driver sdei_driver = {
+	.driver		= {
+		.name			= "sdei",
+		.of_match_table		= sdei_of_match,
+	},
+	.probe		= sdei_probe,
+};
+
+static bool __init sdei_present_dt(void)
+{
+	struct platform_device *pdev;
+	struct device_node *np, *fw_np;
+
+	fw_np = of_find_node_by_name(NULL, "firmware");
+	if (!fw_np)
+		return false;
+
+	np = of_find_matching_node(fw_np, sdei_of_match);
+	of_node_put(fw_np);
+	if (!np)
+		return false;
+
+	pdev = of_platform_device_create(np, sdei_driver.driver.name, NULL);
+	of_node_put(np);
+	if (IS_ERR(pdev))
+		return false;
+
+	return true;
+}
+
+static int __init sdei_init(void)
+{
+	if (sdei_present_dt())
+		platform_driver_register(&sdei_driver);
+
+	return 0;
+}
+
+subsys_initcall_sync(sdei_init);
+
+int sdei_event_handler(struct pt_regs *regs,
+		       struct sdei_registered_event *arg)
+{
+	int err;
+	mm_segment_t orig_addr_limit;
+	u32 event_num = arg->event_num;
+
+	orig_addr_limit = get_fs();
+	set_fs(USER_DS);
+
+	err = arg->callback(event_num, regs, arg->callback_arg);
+	if (err)
+		pr_err_ratelimited("event %u on CPU %u failed with error: %d\n",
+				   event_num, smp_processor_id(), err);
+
+	set_fs(orig_addr_limit);
+
+	return err;
+}
+NOKPROBE_SYMBOL(sdei_event_handler);
diff --git a/include/linux/arm_sdei.h b/include/linux/arm_sdei.h
new file mode 100644
index 000000000000..942afbd544b7
--- /dev/null
+++ b/include/linux/arm_sdei.h
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2017 Arm Ltd.
+#ifndef __LINUX_ARM_SDEI_H
+#define __LINUX_ARM_SDEI_H
+
+#include <uapi/linux/arm_sdei.h>
+
+enum sdei_conduit_types {
+	CONDUIT_INVALID = 0,
+	CONDUIT_SMC,
+	CONDUIT_HVC,
+};
+
+#include <asm/sdei.h>
+
+/* Arch code should override this to set the entry point from firmware... */
+#ifndef sdei_arch_get_entry_point
+#define sdei_arch_get_entry_point(conduit)	(0)
+#endif
+
+/*
+ * When an event occurs sdei_event_handler() will call a user-provided callback
+ * like this in NMI context on the CPU that received the event.
+ */
+typedef int (sdei_event_callback)(u32 event, struct pt_regs *regs, void *arg);
+
+/*
+ * Register your callback to claim an event. The event must be described
+ * by firmware.
+ */
+int sdei_event_register(u32 event_num, sdei_event_callback *cb, void *arg);
+
+/*
+ * Calls to sdei_event_unregister() may return EINPROGRESS. Keep calling
+ * it until it succeeds.
+ */
+int sdei_event_unregister(u32 event_num);
+
+int sdei_event_enable(u32 event_num);
+int sdei_event_disable(u32 event_num);
+
+#ifdef CONFIG_ARM_SDE_INTERFACE
+/* For use by arch code when CPU hotplug notifiers are not appropriate. */
+int sdei_mask_local_cpu(void);
+int sdei_unmask_local_cpu(void);
+#else
+static inline int sdei_mask_local_cpu(void) { return 0; }
+static inline int sdei_unmask_local_cpu(void) { return 0; }
+#endif /* CONFIG_ARM_SDE_INTERFACE */
+
+
+/*
+ * This struct represents an event that has been registered. The driver
+ * maintains a list of all events, and which ones are registered. (Private
+ * events have one entry in the list, but are registered on each CPU).
+ * A pointer to this struct is passed to firmware, and back to the event
+ * handler. The event handler can then use this to invoke the registered
+ * callback, without having to walk the list.
+ *
+ * For CPU private events, this structure is per-cpu.
+ */
+struct sdei_registered_event {
+	/* For use by arch code: */
+	struct pt_regs          interrupted_regs;
+
+	sdei_event_callback	*callback;
+	void			*callback_arg;
+	u32			 event_num;
+	u8			 priority;
+};
+
+/* The arch code entry point should then call this when an event arrives. */
+int notrace sdei_event_handler(struct pt_regs *regs,
+			       struct sdei_registered_event *arg);
+
+/* arch code may use this to retrieve the extra registers. */
+int sdei_api_event_context(u32 query, u64 *result);
+
+#endif /* __LINUX_ARM_SDEI_H */
diff --git a/include/uapi/linux/arm_sdei.h b/include/uapi/linux/arm_sdei.h
new file mode 100644
index 000000000000..af0630ba5437
--- /dev/null
+++ b/include/uapi/linux/arm_sdei.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (C) 2017 Arm Ltd. */
+#ifndef _UAPI_LINUX_ARM_SDEI_H
+#define _UAPI_LINUX_ARM_SDEI_H
+
+#define SDEI_1_0_FN_BASE			0xC4000020
+#define SDEI_1_0_MASK				0xFFFFFFE0
+#define SDEI_1_0_FN(n)				(SDEI_1_0_FN_BASE + (n))
+
+#define SDEI_1_0_FN_SDEI_VERSION			SDEI_1_0_FN(0x00)
+#define SDEI_1_0_FN_SDEI_EVENT_REGISTER			SDEI_1_0_FN(0x01)
+#define SDEI_1_0_FN_SDEI_EVENT_ENABLE			SDEI_1_0_FN(0x02)
+#define SDEI_1_0_FN_SDEI_EVENT_DISABLE			SDEI_1_0_FN(0x03)
+#define SDEI_1_0_FN_SDEI_EVENT_CONTEXT			SDEI_1_0_FN(0x04)
+#define SDEI_1_0_FN_SDEI_EVENT_COMPLETE			SDEI_1_0_FN(0x05)
+#define SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME	SDEI_1_0_FN(0x06)
+#define SDEI_1_0_FN_SDEI_EVENT_UNREGISTER		SDEI_1_0_FN(0x07)
+#define SDEI_1_0_FN_SDEI_EVENT_STATUS			SDEI_1_0_FN(0x08)
+#define SDEI_1_0_FN_SDEI_EVENT_GET_INFO			SDEI_1_0_FN(0x09)
+#define SDEI_1_0_FN_SDEI_EVENT_ROUTING_SET		SDEI_1_0_FN(0x0A)
+#define SDEI_1_0_FN_SDEI_PE_MASK			SDEI_1_0_FN(0x0B)
+#define SDEI_1_0_FN_SDEI_PE_UNMASK			SDEI_1_0_FN(0x0C)
+#define SDEI_1_0_FN_SDEI_INTERRUPT_BIND			SDEI_1_0_FN(0x0D)
+#define SDEI_1_0_FN_SDEI_INTERRUPT_RELEASE		SDEI_1_0_FN(0x0E)
+#define SDEI_1_0_FN_SDEI_PRIVATE_RESET			SDEI_1_0_FN(0x11)
+#define SDEI_1_0_FN_SDEI_SHARED_RESET			SDEI_1_0_FN(0x12)
+
+#define SDEI_VERSION_MAJOR_SHIFT			48
+#define SDEI_VERSION_MAJOR_MASK				0x7fff
+#define SDEI_VERSION_MINOR_SHIFT			32
+#define SDEI_VERSION_MINOR_MASK				0xffff
+#define SDEI_VERSION_VENDOR_SHIFT			0
+#define SDEI_VERSION_VENDOR_MASK			0xffffffff
+
+#define SDEI_VERSION_MAJOR(x)	(x>>SDEI_VERSION_MAJOR_SHIFT & SDEI_VERSION_MAJOR_MASK)
+#define SDEI_VERSION_MINOR(x)	(x>>SDEI_VERSION_MINOR_SHIFT & SDEI_VERSION_MINOR_MASK)
+#define SDEI_VERSION_VENDOR(x)	(x>>SDEI_VERSION_VENDOR_SHIFT & SDEI_VERSION_VENDOR_MASK)
+
+/* SDEI return values */
+#define SDEI_SUCCESS		0
+#define SDEI_NOT_SUPPORTED	-1
+#define SDEI_INVALID_PARAMETERS	-2
+#define SDEI_DENIED		-3
+#define SDEI_PENDING		-5
+#define SDEI_OUT_OF_RESOURCE	-10
+
+/* EVENT_REGISTER flags */
+#define SDEI_EVENT_REGISTER_RM_ANY	0
+#define SDEI_EVENT_REGISTER_RM_PE	1
+
+/* EVENT_STATUS return value bits */
+#define SDEI_EVENT_STATUS_RUNNING	2
+#define SDEI_EVENT_STATUS_ENABLED	1
+#define SDEI_EVENT_STATUS_REGISTERED	0
+
+/* EVENT_COMPLETE status values */
+#define SDEI_EV_HANDLED	0
+#define SDEI_EV_FAILED	1
+
+/* GET_INFO values */
+#define SDEI_EVENT_INFO_EV_TYPE			0
+#define SDEI_EVENT_INFO_EV_SIGNALED		1
+#define SDEI_EVENT_INFO_EV_PRIORITY		2
+#define SDEI_EVENT_INFO_EV_ROUTING_MODE		3
+#define SDEI_EVENT_INFO_EV_ROUTING_AFF		4
+
+/* and their results */
+#define SDEI_EVENT_TYPE_PRIVATE			0
+#define SDEI_EVENT_TYPE_SHARED			1
+#define SDEI_EVENT_PRIORITY_NORMAL		0
+#define SDEI_EVENT_PRIORITY_CRITICAL		1
+
+#endif /* _UAPI_LINUX_ARM_SDEI_H */

From ed8b20d457d72e9e2a30533b436fdb4ea1c70b38 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:10 +0000
Subject: [PATCH 062/104] arm64: Add vmap_stack header file

Today the arm64 arch code allocates an extra IRQ stack per-cpu. If we
also have SDEI and VMAP stacks we need two extra per-cpu VMAP stacks.

Move the VMAP stack allocation out to a helper in a new header file.
This avoids missing THREADINFO_GFP, or getting the all-important alignment
wrong.

Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/vmap_stack.h | 28 ++++++++++++++++++++++++++++
 arch/arm64/kernel/irq.c             | 13 ++-----------
 2 files changed, 30 insertions(+), 11 deletions(-)
 create mode 100644 arch/arm64/include/asm/vmap_stack.h

diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h
new file mode 100644
index 000000000000..0b5ec6e08c10
--- /dev/null
+++ b/arch/arm64/include/asm/vmap_stack.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2017 Arm Ltd.
+#ifndef __ASM_VMAP_STACK_H
+#define __ASM_VMAP_STACK_H
+
+#include <linux/bug.h>
+#include <linux/gfp.h>
+#include <linux/kconfig.h>
+#include <linux/vmalloc.h>
+#include <asm/memory.h>
+#include <asm/pgtable.h>
+#include <asm/thread_info.h>
+
+/*
+ * To ensure that VMAP'd stack overflow detection works correctly, all VMAP'd
+ * stacks need to have the same alignment.
+ */
+static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node)
+{
+	BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK));
+
+	return __vmalloc_node_range(stack_size, THREAD_ALIGN,
+				    VMALLOC_START, VMALLOC_END,
+				    THREADINFO_GFP, PAGE_KERNEL, 0, node,
+				    __builtin_return_address(0));
+}
+
+#endif /* __ASM_VMAP_STACK_H */
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
index 713561e5bcab..60e5fc661f74 100644
--- a/arch/arm64/kernel/irq.c
+++ b/arch/arm64/kernel/irq.c
@@ -29,6 +29,7 @@
 #include <linux/irqchip.h>
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
+#include <asm/vmap_stack.h>
 
 unsigned long irq_err_count;
 
@@ -58,17 +59,7 @@ static void init_irq_stacks(void)
 	unsigned long *p;
 
 	for_each_possible_cpu(cpu) {
-		/*
-		* To ensure that VMAP'd stack overflow detection works
-		* correctly, the IRQ stacks need to have the same
-		* alignment as other stacks.
-		*/
-		p = __vmalloc_node_range(IRQ_STACK_SIZE, THREAD_ALIGN,
-					 VMALLOC_START, VMALLOC_END,
-					 THREADINFO_GFP, PAGE_KERNEL,
-					 0, cpu_to_node(cpu),
-					 __builtin_return_address(0));
-
+		p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, cpu_to_node(cpu));
 		per_cpu(irq_stack_ptr, cpu) = p;
 	}
 }

From e1281f56f114f3a945bf9ec30698bd3caa59d322 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:11 +0000
Subject: [PATCH 063/104] arm64: uaccess: Add PAN helper

Add __uaccess_{en,dis}able_hw_pan() helpers to set/clear the PSTATE.PAN
bit.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/uaccess.h | 12 ++++++++++++
 arch/arm64/kernel/suspend.c      |  4 ++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 6eadf55ebaf0..3821fab01d7d 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -168,6 +168,18 @@ static inline bool uaccess_ttbr0_enable(void)
 }
 #endif
 
+static inline void __uaccess_disable_hw_pan(void)
+{
+	asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN,
+			CONFIG_ARM64_PAN));
+}
+
+static inline void __uaccess_enable_hw_pan(void)
+{
+	asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN,
+			CONFIG_ARM64_PAN));
+}
+
 #define __uaccess_disable(alt)						\
 do {									\
 	if (!uaccess_ttbr0_disable())					\
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index 3fe5ad884418..a307b9e13392 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -2,6 +2,7 @@
 #include <linux/ftrace.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 #include <asm/alternative.h>
 #include <asm/cacheflush.h>
 #include <asm/cpufeature.h>
@@ -51,8 +52,7 @@ void notrace __cpu_suspend_exit(void)
 	 * PSTATE was not saved over suspend/resume, re-enable any detected
 	 * features that might not have been set correctly.
 	 */
-	asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN,
-			CONFIG_ARM64_PAN));
+	__uaccess_enable_hw_pan();
 	uao_thread_switch(current);
 
 	/*

From f5df26961853d6809d704cedcaf082c57f635a64 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:12 +0000
Subject: [PATCH 064/104] arm64: kernel: Add arch-specific SDEI entry code and
 CPU masking

The Software Delegated Exception Interface (SDEI) is an ARM standard
for registering callbacks from the platform firmware into the OS.
This is typically used to implement RAS notifications.

Such notifications enter the kernel at the registered entry-point
with the register values of the interrupted CPU context. Because this
is not a CPU exception, it cannot reuse the existing entry code.
(crucially we don't implicitly know which exception level we interrupted),

Add the entry point to entry.S to set us up for calling into C code. If
the event interrupted code that had interrupts masked, we always return
to that location. Otherwise we pretend this was an IRQ, and use SDEI's
complete_and_resume call to return to vbar_el1 + offset.

This allows the kernel to deliver signals to user space processes. For
KVM this triggers the world switch, a quick spin round vcpu_run, then
back into the guest, unless there are pending signals.

Add sdei_mask_local_cpu() calls to the smp_send_stop() code, this covers
the panic() code-path, which doesn't invoke cpuhotplug notifiers.

Because we can interrupt entry-from/exit-to another EL, we can't trust the
value in sp_el0 or x29, even if we interrupted the kernel, in this case
the code in entry.S will save/restore sp_el0 and use the value in
__entry_task.

When we have VMAP stacks we can interrupt the stack-overflow test, which
stirs x0 into sp, meaning we have to have our own VMAP stacks. For now
these are allocated when we probe the interface. Future patches will add
refcounting hooks to allow the arch code to allocate them lazily.

Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/sdei.h       |  47 +++++-
 arch/arm64/include/asm/stacktrace.h |   3 +
 arch/arm64/kernel/Makefile          |   1 +
 arch/arm64/kernel/asm-offsets.c     |   5 +
 arch/arm64/kernel/entry.S           | 101 +++++++++++++
 arch/arm64/kernel/sdei.c            | 219 ++++++++++++++++++++++++++++
 arch/arm64/kernel/smp.c             |  11 +-
 7 files changed, 384 insertions(+), 3 deletions(-)
 create mode 100644 arch/arm64/kernel/sdei.c

diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h
index 59f26b6e673d..d58a31ab525a 100644
--- a/arch/arm64/include/asm/sdei.h
+++ b/arch/arm64/include/asm/sdei.h
@@ -3,6 +3,49 @@
 #ifndef __ASM_SDEI_H
 #define __ASM_SDEI_H
 
-/* Later patches add the arch specific bits */
+/* Values for sdei_exit_mode */
+#define SDEI_EXIT_HVC  0
+#define SDEI_EXIT_SMC  1
 
-#endif /* __ASM_SDEI_H */
+#define SDEI_STACK_SIZE		IRQ_STACK_SIZE
+
+#ifndef __ASSEMBLY__
+
+#include <linux/linkage.h>
+#include <linux/preempt.h>
+#include <linux/types.h>
+
+#include <asm/virt.h>
+
+extern unsigned long sdei_exit_mode;
+
+/* Software Delegated Exception entry point from firmware*/
+asmlinkage void __sdei_asm_handler(unsigned long event_num, unsigned long arg,
+				   unsigned long pc, unsigned long pstate);
+
+/*
+ * The above entry point does the minimum to call C code. This function does
+ * anything else, before calling the driver.
+ */
+struct sdei_registered_event;
+asmlinkage unsigned long __sdei_handler(struct pt_regs *regs,
+					struct sdei_registered_event *arg);
+
+unsigned long sdei_arch_get_entry_point(int conduit);
+#define sdei_arch_get_entry_point(x)	sdei_arch_get_entry_point(x)
+
+bool _on_sdei_stack(unsigned long sp);
+static inline bool on_sdei_stack(unsigned long sp)
+{
+	if (!IS_ENABLED(CONFIG_VMAP_STACK))
+		return false;
+	if (!IS_ENABLED(CONFIG_ARM_SDE_INTERFACE))
+		return false;
+	if (in_nmi())
+		return _on_sdei_stack(sp);
+
+	return false;
+}
+
+#endif /* __ASSEMBLY__ */
+#endif	/* __ASM_SDEI_H */
diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h
index 6ad30776e984..472ef944e932 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -22,6 +22,7 @@
 
 #include <asm/memory.h>
 #include <asm/ptrace.h>
+#include <asm/sdei.h>
 
 struct stackframe {
 	unsigned long fp;
@@ -85,6 +86,8 @@ static inline bool on_accessible_stack(struct task_struct *tsk, unsigned long sp
 		return true;
 	if (on_overflow_stack(sp))
 		return true;
+	if (on_sdei_stack(sp))
+		return true;
 
 	return false;
 }
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 0c760db04858..b87541360f43 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -52,6 +52,7 @@ arm64-obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o	\
 arm64-obj-$(CONFIG_ARM64_RELOC_TEST)	+= arm64-reloc-test.o
 arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
 arm64-obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
+arm64-obj-$(CONFIG_ARM_SDE_INTERFACE)	+= sdei.o
 
 ifeq ($(CONFIG_KVM),y)
 arm64-obj-$(CONFIG_HARDEN_BRANCH_PREDICTOR)	+= bpi.o
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index af247d10252f..1dcc493f5765 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -18,6 +18,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/arm_sdei.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/dma-mapping.h>
@@ -157,6 +158,10 @@ int main(void)
   BLANK();
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
   DEFINE(TRAMP_VALIAS,		TRAMP_VALIAS);
+#endif
+#ifdef CONFIG_ARM_SDE_INTERFACE
+  DEFINE(SDEI_EVENT_INTREGS,	offsetof(struct sdei_registered_event, interrupted_regs));
+  DEFINE(SDEI_EVENT_PRIORITY,	offsetof(struct sdei_registered_event, priority));
 #endif
   return 0;
 }
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 07a7d4db8ec4..40bf5083d182 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -1153,3 +1153,104 @@ ENTRY(ret_from_fork)
 	b	ret_to_user
 ENDPROC(ret_from_fork)
 NOKPROBE(ret_from_fork)
+
+#ifdef CONFIG_ARM_SDE_INTERFACE
+
+#include <asm/sdei.h>
+#include <uapi/linux/arm_sdei.h>
+
+/*
+ * Software Delegated Exception entry point.
+ *
+ * x0: Event number
+ * x1: struct sdei_registered_event argument from registration time.
+ * x2: interrupted PC
+ * x3: interrupted PSTATE
+ *
+ * Firmware has preserved x0->x17 for us, we must save/restore the rest to
+ * follow SMC-CC. We save (or retrieve) all the registers as the handler may
+ * want them.
+ */
+ENTRY(__sdei_asm_handler)
+	stp     x2, x3, [x1, #SDEI_EVENT_INTREGS + S_PC]
+	stp     x4, x5, [x1, #SDEI_EVENT_INTREGS + 16 * 2]
+	stp     x6, x7, [x1, #SDEI_EVENT_INTREGS + 16 * 3]
+	stp     x8, x9, [x1, #SDEI_EVENT_INTREGS + 16 * 4]
+	stp     x10, x11, [x1, #SDEI_EVENT_INTREGS + 16 * 5]
+	stp     x12, x13, [x1, #SDEI_EVENT_INTREGS + 16 * 6]
+	stp     x14, x15, [x1, #SDEI_EVENT_INTREGS + 16 * 7]
+	stp     x16, x17, [x1, #SDEI_EVENT_INTREGS + 16 * 8]
+	stp     x18, x19, [x1, #SDEI_EVENT_INTREGS + 16 * 9]
+	stp     x20, x21, [x1, #SDEI_EVENT_INTREGS + 16 * 10]
+	stp     x22, x23, [x1, #SDEI_EVENT_INTREGS + 16 * 11]
+	stp     x24, x25, [x1, #SDEI_EVENT_INTREGS + 16 * 12]
+	stp     x26, x27, [x1, #SDEI_EVENT_INTREGS + 16 * 13]
+	stp     x28, x29, [x1, #SDEI_EVENT_INTREGS + 16 * 14]
+	mov	x4, sp
+	stp     lr, x4, [x1, #SDEI_EVENT_INTREGS + S_LR]
+
+	mov	x19, x1
+
+#ifdef CONFIG_VMAP_STACK
+	/*
+	 * entry.S may have been using sp as a scratch register, find whether
+	 * this is a normal or critical event and switch to the appropriate
+	 * stack for this CPU.
+	 */
+	ldrb	w4, [x19, #SDEI_EVENT_PRIORITY]
+	cbnz	w4, 1f
+	ldr_this_cpu dst=x5, sym=sdei_stack_normal_ptr, tmp=x6
+	b	2f
+1:	ldr_this_cpu dst=x5, sym=sdei_stack_critical_ptr, tmp=x6
+2:	mov	x6, #SDEI_STACK_SIZE
+	add	x5, x5, x6
+	mov	sp, x5
+#endif
+
+	/*
+	 * We may have interrupted userspace, or a guest, or exit-from or
+	 * return-to either of these. We can't trust sp_el0, restore it.
+	 */
+	mrs	x28, sp_el0
+	ldr_this_cpu	dst=x0, sym=__entry_task, tmp=x1
+	msr	sp_el0, x0
+
+	/* If we interrupted the kernel point to the previous stack/frame. */
+	and     x0, x3, #0xc
+	mrs     x1, CurrentEL
+	cmp     x0, x1
+	csel	x29, x29, xzr, eq	// fp, or zero
+	csel	x4, x2, xzr, eq		// elr, or zero
+
+	stp	x29, x4, [sp, #-16]!
+	mov	x29, sp
+
+	add	x0, x19, #SDEI_EVENT_INTREGS
+	mov	x1, x19
+	bl	__sdei_handler
+
+	msr	sp_el0, x28
+	/* restore regs >x17 that we clobbered */
+	ldp     x28, x29, [x19, #SDEI_EVENT_INTREGS + 16 * 14]
+	ldp     lr, x4, [x19, #SDEI_EVENT_INTREGS + S_LR]
+	mov	sp, x4
+	ldp     x18, x19, [x19, #SDEI_EVENT_INTREGS + 16 * 9]
+
+	mov	x1, x0			// address to complete_and_resume
+	/* x0 = (x0 <= 1) ? EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME */
+	cmp	x0, #1
+	mov_q	x2, SDEI_1_0_FN_SDEI_EVENT_COMPLETE
+	mov_q	x3, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME
+	csel	x0, x2, x3, ls
+
+	/* On success, this call never returns... */
+	ldr_l	x2, sdei_exit_mode
+	cmp	x2, #SDEI_EXIT_SMC
+	b.ne	1f
+	smc	#0
+	b	.
+1:	hvc	#0
+	b	.
+ENDPROC(__sdei_asm_handler)
+NOKPROBE(__sdei_asm_handler)
+#endif /* CONFIG_ARM_SDE_INTERFACE */
diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c
new file mode 100644
index 000000000000..f9dffacaa5d6
--- /dev/null
+++ b/arch/arm64/kernel/sdei.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2017 Arm Ltd.
+#define pr_fmt(fmt) "sdei: " fmt
+
+#include <linux/arm_sdei.h>
+#include <linux/hardirq.h>
+#include <linux/irqflags.h>
+#include <linux/sched/task_stack.h>
+#include <linux/uaccess.h>
+
+#include <asm/alternative.h>
+#include <asm/kprobes.h>
+#include <asm/ptrace.h>
+#include <asm/sysreg.h>
+#include <asm/vmap_stack.h>
+
+unsigned long sdei_exit_mode;
+
+/*
+ * VMAP'd stacks checking for stack overflow on exception using sp as a scratch
+ * register, meaning SDEI has to switch to its own stack. We need two stacks as
+ * a critical event may interrupt a normal event that has just taken a
+ * synchronous exception, and is using sp as scratch register. For a critical
+ * event interrupting a normal event, we can't reliably tell if we were on the
+ * sdei stack.
+ * For now, we allocate stacks when the driver is probed.
+ */
+DECLARE_PER_CPU(unsigned long *, sdei_stack_normal_ptr);
+DECLARE_PER_CPU(unsigned long *, sdei_stack_critical_ptr);
+
+#ifdef CONFIG_VMAP_STACK
+DEFINE_PER_CPU(unsigned long *, sdei_stack_normal_ptr);
+DEFINE_PER_CPU(unsigned long *, sdei_stack_critical_ptr);
+#endif
+
+static void _free_sdei_stack(unsigned long * __percpu *ptr, int cpu)
+{
+	unsigned long *p;
+
+	p = per_cpu(*ptr, cpu);
+	if (p) {
+		per_cpu(*ptr, cpu) = NULL;
+		vfree(p);
+	}
+}
+
+static void free_sdei_stacks(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		_free_sdei_stack(&sdei_stack_normal_ptr, cpu);
+		_free_sdei_stack(&sdei_stack_critical_ptr, cpu);
+	}
+}
+
+static int _init_sdei_stack(unsigned long * __percpu *ptr, int cpu)
+{
+	unsigned long *p;
+
+	p = arch_alloc_vmap_stack(SDEI_STACK_SIZE, cpu_to_node(cpu));
+	if (!p)
+		return -ENOMEM;
+	per_cpu(*ptr, cpu) = p;
+
+	return 0;
+}
+
+static int init_sdei_stacks(void)
+{
+	int cpu;
+	int err = 0;
+
+	for_each_possible_cpu(cpu) {
+		err = _init_sdei_stack(&sdei_stack_normal_ptr, cpu);
+		if (err)
+			break;
+		err = _init_sdei_stack(&sdei_stack_critical_ptr, cpu);
+		if (err)
+			break;
+	}
+
+	if (err)
+		free_sdei_stacks();
+
+	return err;
+}
+
+bool _on_sdei_stack(unsigned long sp)
+{
+	unsigned long low, high;
+
+	if (!IS_ENABLED(CONFIG_VMAP_STACK))
+		return false;
+
+	low = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr);
+	high = low + SDEI_STACK_SIZE;
+
+	if (low <= sp && sp < high)
+		return true;
+
+	low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr);
+	high = low + SDEI_STACK_SIZE;
+
+	return (low <= sp && sp < high);
+}
+
+unsigned long sdei_arch_get_entry_point(int conduit)
+{
+	/*
+	 * SDEI works between adjacent exception levels. If we booted at EL1 we
+	 * assume a hypervisor is marshalling events. If we booted at EL2 and
+	 * dropped to EL1 because we don't support VHE, then we can't support
+	 * SDEI.
+	 */
+	if (is_hyp_mode_available() && !is_kernel_in_hyp_mode()) {
+		pr_err("Not supported on this hardware/boot configuration\n");
+		return 0;
+	}
+
+	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+		if (init_sdei_stacks())
+			return 0;
+	}
+
+	sdei_exit_mode = (conduit == CONDUIT_HVC) ? SDEI_EXIT_HVC : SDEI_EXIT_SMC;
+	return (unsigned long)__sdei_asm_handler;
+}
+
+/*
+ * __sdei_handler() returns one of:
+ *  SDEI_EV_HANDLED -  success, return to the interrupted context.
+ *  SDEI_EV_FAILED  -  failure, return this error code to firmare.
+ *  virtual-address -  success, return to this address.
+ */
+static __kprobes unsigned long _sdei_handler(struct pt_regs *regs,
+					     struct sdei_registered_event *arg)
+{
+	u32 mode;
+	int i, err = 0;
+	const int clobbered_registers = 4;
+	u64 elr = read_sysreg(elr_el1);
+	u32 kernel_mode = read_sysreg(CurrentEL) | 1;	/* +SPSel */
+	unsigned long vbar = read_sysreg(vbar_el1);
+
+	/* Retrieve the missing registers values */
+	for (i = 0; i < clobbered_registers; i++) {
+		/* from within the handler, this call always succeeds */
+		sdei_api_event_context(i, &regs->regs[i]);
+	}
+
+	/*
+	 * We didn't take an exception to get here, set PAN. UAO will be cleared
+	 * by sdei_event_handler()s set_fs(USER_DS) call.
+	 */
+	__uaccess_enable_hw_pan();
+
+	err = sdei_event_handler(regs, arg);
+	if (err)
+		return SDEI_EV_FAILED;
+
+	if (elr != read_sysreg(elr_el1)) {
+		/*
+		 * We took a synchronous exception from the SDEI handler.
+		 * This could deadlock, and if you interrupt KVM it will
+		 * hyp-panic instead.
+		 */
+		pr_warn("unsafe: exception during handler\n");
+	}
+
+	mode = regs->pstate & (PSR_MODE32_BIT | PSR_MODE_MASK);
+
+	/*
+	 * If we interrupted the kernel with interrupts masked, we always go
+	 * back to wherever we came from.
+	 */
+	if (mode == kernel_mode && !interrupts_enabled(regs))
+		return SDEI_EV_HANDLED;
+
+	/*
+	 * Otherwise, we pretend this was an IRQ. This lets user space tasks
+	 * receive signals before we return to them, and KVM to invoke it's
+	 * world switch to do the same.
+	 *
+	 * See DDI0487B.a Table D1-7 'Vector offsets from vector table base
+	 * address'.
+	 */
+	if (mode == kernel_mode)
+		return vbar + 0x280;
+	else if (mode & PSR_MODE32_BIT)
+		return vbar + 0x680;
+
+	return vbar + 0x480;
+}
+
+
+asmlinkage __kprobes notrace unsigned long
+__sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg)
+{
+	unsigned long ret;
+	bool do_nmi_exit = false;
+
+	/*
+	 * nmi_enter() deals with printk() re-entrance and use of RCU when
+	 * RCU believed this CPU was idle. Because critical events can
+	 * interrupt normal events, we may already be in_nmi().
+	 */
+	if (!in_nmi()) {
+		nmi_enter();
+		do_nmi_exit = true;
+	}
+
+	ret = _sdei_handler(regs, arg);
+
+	if (do_nmi_exit)
+		nmi_exit();
+
+	return ret;
+}
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 551eb07c53b6..3b8ad7be9c33 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -18,6 +18,7 @@
  */
 
 #include <linux/acpi.h>
+#include <linux/arm_sdei.h>
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
@@ -836,6 +837,7 @@ static void ipi_cpu_stop(unsigned int cpu)
 	set_cpu_online(cpu, false);
 
 	local_daif_mask();
+	sdei_mask_local_cpu();
 
 	while (1)
 		cpu_relax();
@@ -853,6 +855,7 @@ static void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
 	atomic_dec(&waiting_for_crash_ipi);
 
 	local_irq_disable();
+	sdei_mask_local_cpu();
 
 #ifdef CONFIG_HOTPLUG_CPU
 	if (cpu_ops[cpu]->cpu_die)
@@ -972,6 +975,8 @@ void smp_send_stop(void)
 	if (num_online_cpus() > 1)
 		pr_warning("SMP: failed to stop secondary CPUs %*pbl\n",
 			   cpumask_pr_args(cpu_online_mask));
+
+	sdei_mask_local_cpu();
 }
 
 #ifdef CONFIG_KEXEC_CORE
@@ -990,8 +995,10 @@ void crash_smp_send_stop(void)
 
 	cpus_stopped = 1;
 
-	if (num_online_cpus() == 1)
+	if (num_online_cpus() == 1) {
+		sdei_mask_local_cpu();
 		return;
+	}
 
 	cpumask_copy(&mask, cpu_online_mask);
 	cpumask_clear_cpu(smp_processor_id(), &mask);
@@ -1009,6 +1016,8 @@ void crash_smp_send_stop(void)
 	if (atomic_read(&waiting_for_crash_ipi) > 0)
 		pr_warning("SMP: failed to stop secondary CPUs %*pbl\n",
 			   cpumask_pr_args(&mask));
+
+	sdei_mask_local_cpu();
 }
 
 bool smp_crash_stop_failed(void)

From da351827240e1705cca64bb8ae526f0ce1068048 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:13 +0000
Subject: [PATCH 065/104] firmware: arm_sdei: Add support for CPU and system
 power states

When a CPU enters an idle lower-power state or is powering off, we
need to mask SDE events so that no events can be delivered while we
are messing with the MMU as the registered entry points won't be valid.

If the system reboots, we want to unregister all events and mask the CPUs.
For kexec this allows us to hand a clean slate to the next kernel
instead of relying on it to call sdei_{private,system}_data_reset().

For hibernate we unregister all events and re-register them on restore,
in case we restored with the SDE code loaded at a different address.
(e.g. KASLR).

Add all the notifiers necessary to do this. We only support shared events
so all events are left registered and enabled over CPU hotplug.

Reviewed-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>
[catalin.marinas@arm.com: added CPU_PM_ENTER_FAILED case]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/firmware/arm_sdei.c | 256 +++++++++++++++++++++++++++++++++++-
 include/linux/cpuhotplug.h  |   1 +
 2 files changed, 256 insertions(+), 1 deletion(-)

diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c
index 8da173cc7e43..c503bc5222ed 100644
--- a/drivers/firmware/arm_sdei.c
+++ b/drivers/firmware/arm_sdei.c
@@ -7,6 +7,8 @@
 #include <linux/arm-smccc.h>
 #include <linux/bitops.h>
 #include <linux/compiler.h>
+#include <linux/cpuhotplug.h>
+#include <linux/cpu_pm.h>
 #include <linux/errno.h>
 #include <linux/hardirq.h>
 #include <linux/kernel.h>
@@ -14,12 +16,15 @@
 #include <linux/kvm_host.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
+#include <linux/notifier.h>
 #include <linux/of.h>
 #include <linux/of_platform.h>
 #include <linux/percpu.h>
 #include <linux/platform_device.h>
+#include <linux/pm.h>
 #include <linux/ptrace.h>
 #include <linux/preempt.h>
+#include <linux/reboot.h>
 #include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/spinlock.h>
@@ -37,7 +42,11 @@ static asmlinkage void (*sdei_firmware_call)(unsigned long function_id,
 static unsigned long sdei_entry_point;
 
 struct sdei_event {
+	/* These three are protected by the sdei_list_lock */
 	struct list_head	list;
+	bool			reregister;
+	bool			reenable;
+
 	u32			event_num;
 	u8			type;
 	u8			priority;
@@ -253,6 +262,11 @@ static void _ipi_mask_cpu(void *ignored)
 	sdei_mask_local_cpu();
 }
 
+static int sdei_cpuhp_down(unsigned int ignored)
+{
+	return sdei_mask_local_cpu();
+}
+
 int sdei_unmask_local_cpu(void)
 {
 	int err;
@@ -274,6 +288,11 @@ static void _ipi_unmask_cpu(void *ignored)
 	sdei_unmask_local_cpu();
 }
 
+static int sdei_cpuhp_up(unsigned int ignored)
+{
+	return sdei_unmask_local_cpu();
+}
+
 static void _ipi_private_reset(void *ignored)
 {
 	int err;
@@ -330,6 +349,10 @@ int sdei_event_enable(u32 event_num)
 		return -ENOENT;
 	}
 
+	spin_lock(&sdei_list_lock);
+	event->reenable = true;
+	spin_unlock(&sdei_list_lock);
+
 	if (event->type == SDEI_EVENT_TYPE_SHARED)
 		err = sdei_api_event_enable(event->event_num);
 	mutex_unlock(&sdei_events_lock);
@@ -356,6 +379,10 @@ int sdei_event_disable(u32 event_num)
 		return -ENOENT;
 	}
 
+	spin_lock(&sdei_list_lock);
+	event->reenable = false;
+	spin_unlock(&sdei_list_lock);
+
 	if (event->type == SDEI_EVENT_TYPE_SHARED)
 		err = sdei_api_event_disable(event->event_num);
 	mutex_unlock(&sdei_events_lock);
@@ -374,6 +401,11 @@ static int _sdei_event_unregister(struct sdei_event *event)
 {
 	lockdep_assert_held(&sdei_events_lock);
 
+	spin_lock(&sdei_list_lock);
+	event->reregister = false;
+	event->reenable = false;
+	spin_unlock(&sdei_list_lock);
+
 	if (event->type == SDEI_EVENT_TYPE_SHARED)
 		return sdei_api_event_unregister(event->event_num);
 
@@ -408,6 +440,31 @@ int sdei_event_unregister(u32 event_num)
 }
 EXPORT_SYMBOL(sdei_event_unregister);
 
+/*
+ * unregister events, but don't destroy them as they are re-registered by
+ * sdei_reregister_shared().
+ */
+static int sdei_unregister_shared(void)
+{
+	int err = 0;
+	struct sdei_event *event;
+
+	mutex_lock(&sdei_events_lock);
+	spin_lock(&sdei_list_lock);
+	list_for_each_entry(event, &sdei_list, list) {
+		if (event->type != SDEI_EVENT_TYPE_SHARED)
+			continue;
+
+		err = _sdei_event_unregister(event);
+		if (err)
+			break;
+	}
+	spin_unlock(&sdei_list_lock);
+	mutex_unlock(&sdei_events_lock);
+
+	return err;
+}
+
 static int sdei_api_event_register(u32 event_num, unsigned long entry_point,
 				   void *arg, u64 flags, u64 affinity)
 {
@@ -465,6 +522,175 @@ int sdei_event_register(u32 event_num, sdei_event_callback *cb, void *arg)
 }
 EXPORT_SYMBOL(sdei_event_register);
 
+static int sdei_reregister_event(struct sdei_event *event)
+{
+	int err;
+
+	lockdep_assert_held(&sdei_events_lock);
+
+	err = _sdei_event_register(event);
+	if (err) {
+		pr_err("Failed to re-register event %u\n", event->event_num);
+		sdei_event_destroy(event);
+		return err;
+	}
+
+	if (event->reenable) {
+		if (event->type == SDEI_EVENT_TYPE_SHARED)
+			err = sdei_api_event_enable(event->event_num);
+	}
+
+	if (err)
+		pr_err("Failed to re-enable event %u\n", event->event_num);
+
+	return err;
+}
+
+static int sdei_reregister_shared(void)
+{
+	int err = 0;
+	struct sdei_event *event;
+
+	mutex_lock(&sdei_events_lock);
+	spin_lock(&sdei_list_lock);
+	list_for_each_entry(event, &sdei_list, list) {
+		if (event->type != SDEI_EVENT_TYPE_SHARED)
+			continue;
+
+		if (event->reregister) {
+			err = sdei_reregister_event(event);
+			if (err)
+				break;
+		}
+	}
+	spin_unlock(&sdei_list_lock);
+	mutex_unlock(&sdei_events_lock);
+
+	return err;
+}
+
+/* When entering idle, mask/unmask events for this cpu */
+static int sdei_pm_notifier(struct notifier_block *nb, unsigned long action,
+			    void *data)
+{
+	int rv;
+
+	switch (action) {
+	case CPU_PM_ENTER:
+		rv = sdei_mask_local_cpu();
+		break;
+	case CPU_PM_EXIT:
+	case CPU_PM_ENTER_FAILED:
+		rv = sdei_unmask_local_cpu();
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	if (rv)
+		return notifier_from_errno(rv);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block sdei_pm_nb = {
+	.notifier_call = sdei_pm_notifier,
+};
+
+static int sdei_device_suspend(struct device *dev)
+{
+	on_each_cpu(_ipi_mask_cpu, NULL, true);
+
+	return 0;
+}
+
+static int sdei_device_resume(struct device *dev)
+{
+	on_each_cpu(_ipi_unmask_cpu, NULL, true);
+
+	return 0;
+}
+
+/*
+ * We need all events to be reregistered when we resume from hibernate.
+ *
+ * The sequence is freeze->thaw. Reboot. freeze->restore. We unregister
+ * events during freeze, then re-register and re-enable them during thaw
+ * and restore.
+ */
+static int sdei_device_freeze(struct device *dev)
+{
+	int err;
+
+	cpuhp_remove_state(CPUHP_AP_ARM_SDEI_STARTING);
+
+	err = sdei_unregister_shared();
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int sdei_device_thaw(struct device *dev)
+{
+	int err;
+
+	/* re-register shared events */
+	err = sdei_reregister_shared();
+	if (err) {
+		pr_warn("Failed to re-register shared events...\n");
+		sdei_mark_interface_broken();
+		return err;
+	}
+
+	err = cpuhp_setup_state(CPUHP_AP_ARM_SDEI_STARTING, "SDEI",
+				&sdei_cpuhp_up, &sdei_cpuhp_down);
+	if (err)
+		pr_warn("Failed to re-register CPU hotplug notifier...\n");
+
+	return err;
+}
+
+static int sdei_device_restore(struct device *dev)
+{
+	int err;
+
+	err = sdei_platform_reset();
+	if (err)
+		return err;
+
+	return sdei_device_thaw(dev);
+}
+
+static const struct dev_pm_ops sdei_pm_ops = {
+	.suspend = sdei_device_suspend,
+	.resume = sdei_device_resume,
+	.freeze = sdei_device_freeze,
+	.thaw = sdei_device_thaw,
+	.restore = sdei_device_restore,
+};
+
+/*
+ * Mask all CPUs and unregister all events on panic, reboot or kexec.
+ */
+static int sdei_reboot_notifier(struct notifier_block *nb, unsigned long action,
+				void *data)
+{
+	/*
+	 * We are going to reset the interface, after this there is no point
+	 * doing work when we take CPUs offline.
+	 */
+	cpuhp_remove_state(CPUHP_AP_ARM_SDEI_STARTING);
+
+	sdei_platform_reset();
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block sdei_reboot_nb = {
+	.notifier_call = sdei_reboot_notifier,
+};
+
 static void sdei_smccc_smc(unsigned long function_id,
 			   unsigned long arg0, unsigned long arg1,
 			   unsigned long arg2, unsigned long arg3,
@@ -547,9 +773,36 @@ static int sdei_probe(struct platform_device *pdev)
 		return 0;
 	}
 
-	on_each_cpu(&_ipi_unmask_cpu, NULL, false);
+	err = cpu_pm_register_notifier(&sdei_pm_nb);
+	if (err) {
+		pr_warn("Failed to register CPU PM notifier...\n");
+		goto error;
+	}
+
+	err = register_reboot_notifier(&sdei_reboot_nb);
+	if (err) {
+		pr_warn("Failed to register reboot notifier...\n");
+		goto remove_cpupm;
+	}
+
+	err = cpuhp_setup_state(CPUHP_AP_ARM_SDEI_STARTING, "SDEI",
+				&sdei_cpuhp_up, &sdei_cpuhp_down);
+	if (err) {
+		pr_warn("Failed to register CPU hotplug notifier...\n");
+		goto remove_reboot;
+	}
 
 	return 0;
+
+remove_reboot:
+	unregister_reboot_notifier(&sdei_reboot_nb);
+
+remove_cpupm:
+	cpu_pm_unregister_notifier(&sdei_pm_nb);
+
+error:
+	sdei_mark_interface_broken();
+	return err;
 }
 
 static const struct of_device_id sdei_of_match[] = {
@@ -560,6 +813,7 @@ static const struct of_device_id sdei_of_match[] = {
 static struct platform_driver sdei_driver = {
 	.driver		= {
 		.name			= "sdei",
+		.pm			= &sdei_pm_ops,
 		.of_match_table		= sdei_of_match,
 	},
 	.probe		= sdei_probe,
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 201ab7267986..87b505a48a94 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -109,6 +109,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_XTENSA_STARTING,
 	CPUHP_AP_PERF_METAG_STARTING,
 	CPUHP_AP_MIPS_OP_LOONGSON3_STARTING,
+	CPUHP_AP_ARM_SDEI_STARTING,
 	CPUHP_AP_ARM_VFP_STARTING,
 	CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING,
 	CPUHP_AP_PERF_ARM_HW_BREAKPOINT_STARTING,

From f92b5462a2f22d13a75dc663f7b2fac16a3e61cb Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:14 +0000
Subject: [PATCH 066/104] firmware: arm_sdei: add support for CPU private
 events

Private SDE events are per-cpu, and need to be registered and enabled
on each CPU.

Hide this detail from the caller by adapting our {,un}register and
{en,dis}able calls to send an IPI to each CPU if the event is private.

CPU private events are unregistered when the CPU is powered-off, and
re-registered when the CPU is brought back online. This saves bringing
secondary cores back online to call private_reset() on shutdown, kexec
and resume from hibernate.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/firmware/arm_sdei.c | 206 +++++++++++++++++++++++++++++++++---
 1 file changed, 193 insertions(+), 13 deletions(-)

diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c
index c503bc5222ed..0c735363f54d 100644
--- a/drivers/firmware/arm_sdei.c
+++ b/drivers/firmware/arm_sdei.c
@@ -5,9 +5,11 @@
 #include <linux/acpi.h>
 #include <linux/arm_sdei.h>
 #include <linux/arm-smccc.h>
+#include <linux/atomic.h>
 #include <linux/bitops.h>
 #include <linux/compiler.h>
 #include <linux/cpuhotplug.h>
+#include <linux/cpu.h>
 #include <linux/cpu_pm.h>
 #include <linux/errno.h>
 #include <linux/hardirq.h>
@@ -52,7 +54,13 @@ struct sdei_event {
 	u8			priority;
 
 	/* This pointer is handed to firmware as the event argument. */
-	struct sdei_registered_event *registered;
+	union {
+		/* Shared events */
+		struct sdei_registered_event *registered;
+
+		/* CPU private events */
+		struct sdei_registered_event __percpu *private_registered;
+	};
 };
 
 /* Take the mutex for any API call or modification. Take the mutex first. */
@@ -62,6 +70,34 @@ static DEFINE_MUTEX(sdei_events_lock);
 static DEFINE_SPINLOCK(sdei_list_lock);
 static LIST_HEAD(sdei_list);
 
+/* Private events are registered/enabled via IPI passing one of these */
+struct sdei_crosscall_args {
+	struct sdei_event *event;
+	atomic_t errors;
+	int first_error;
+};
+
+#define CROSSCALL_INIT(arg, event)	(arg.event = event, \
+					 arg.first_error = 0, \
+					 atomic_set(&arg.errors, 0))
+
+static inline int sdei_do_cross_call(void *fn, struct sdei_event * event)
+{
+	struct sdei_crosscall_args arg;
+
+	CROSSCALL_INIT(arg, event);
+	on_each_cpu(fn, &arg, true);
+
+	return arg.first_error;
+}
+
+static inline void
+sdei_cross_call_return(struct sdei_crosscall_args *arg, int err)
+{
+	if (err && (atomic_inc_return(&arg->errors) == 1))
+		arg->first_error = err;
+}
+
 static int sdei_to_linux_errno(unsigned long sdei_err)
 {
 	switch (sdei_err) {
@@ -207,6 +243,26 @@ static struct sdei_event *sdei_event_create(u32 event_num,
 		reg->callback = cb;
 		reg->callback_arg = cb_arg;
 		event->registered = reg;
+	} else {
+		int cpu;
+		struct sdei_registered_event __percpu *regs;
+
+		regs = alloc_percpu(struct sdei_registered_event);
+		if (!regs) {
+			kfree(event);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		for_each_possible_cpu(cpu) {
+			reg = per_cpu_ptr(regs, cpu);
+
+			reg->event_num = event->event_num;
+			reg->priority = event->priority;
+			reg->callback = cb;
+			reg->callback_arg = cb_arg;
+		}
+
+		event->private_registered = regs;
 	}
 
 	if (sdei_event_find(event_num)) {
@@ -232,6 +288,8 @@ static void sdei_event_destroy(struct sdei_event *event)
 
 	if (event->type == SDEI_EVENT_TYPE_SHARED)
 		kfree(event->registered);
+	else
+		free_percpu(event->private_registered);
 
 	kfree(event);
 }
@@ -262,11 +320,6 @@ static void _ipi_mask_cpu(void *ignored)
 	sdei_mask_local_cpu();
 }
 
-static int sdei_cpuhp_down(unsigned int ignored)
-{
-	return sdei_mask_local_cpu();
-}
-
 int sdei_unmask_local_cpu(void)
 {
 	int err;
@@ -288,11 +341,6 @@ static void _ipi_unmask_cpu(void *ignored)
 	sdei_unmask_local_cpu();
 }
 
-static int sdei_cpuhp_up(unsigned int ignored)
-{
-	return sdei_unmask_local_cpu();
-}
-
 static void _ipi_private_reset(void *ignored)
 {
 	int err;
@@ -337,6 +385,19 @@ static int sdei_api_event_enable(u32 event_num)
 			      0, NULL);
 }
 
+/* Called directly by the hotplug callbacks */
+static void _local_event_enable(void *data)
+{
+	int err;
+	struct sdei_crosscall_args *arg = data;
+
+	WARN_ON_ONCE(preemptible());
+
+	err = sdei_api_event_enable(arg->event->event_num);
+
+	sdei_cross_call_return(arg, err);
+}
+
 int sdei_event_enable(u32 event_num)
 {
 	int err = -EINVAL;
@@ -355,6 +416,8 @@ int sdei_event_enable(u32 event_num)
 
 	if (event->type == SDEI_EVENT_TYPE_SHARED)
 		err = sdei_api_event_enable(event->event_num);
+	else
+		err = sdei_do_cross_call(_local_event_enable, event);
 	mutex_unlock(&sdei_events_lock);
 
 	return err;
@@ -367,6 +430,16 @@ static int sdei_api_event_disable(u32 event_num)
 			      0, 0, NULL);
 }
 
+static void _ipi_event_disable(void *data)
+{
+	int err;
+	struct sdei_crosscall_args *arg = data;
+
+	err = sdei_api_event_disable(arg->event->event_num);
+
+	sdei_cross_call_return(arg, err);
+}
+
 int sdei_event_disable(u32 event_num)
 {
 	int err = -EINVAL;
@@ -385,6 +458,8 @@ int sdei_event_disable(u32 event_num)
 
 	if (event->type == SDEI_EVENT_TYPE_SHARED)
 		err = sdei_api_event_disable(event->event_num);
+	else
+		err = sdei_do_cross_call(_ipi_event_disable, event);
 	mutex_unlock(&sdei_events_lock);
 
 	return err;
@@ -397,6 +472,19 @@ static int sdei_api_event_unregister(u32 event_num)
 			      0, 0, 0, NULL);
 }
 
+/* Called directly by the hotplug callbacks */
+static void _local_event_unregister(void *data)
+{
+	int err;
+	struct sdei_crosscall_args *arg = data;
+
+	WARN_ON_ONCE(preemptible());
+
+	err = sdei_api_event_unregister(arg->event->event_num);
+
+	sdei_cross_call_return(arg, err);
+}
+
 static int _sdei_event_unregister(struct sdei_event *event)
 {
 	lockdep_assert_held(&sdei_events_lock);
@@ -409,7 +497,7 @@ static int _sdei_event_unregister(struct sdei_event *event)
 	if (event->type == SDEI_EVENT_TYPE_SHARED)
 		return sdei_api_event_unregister(event->event_num);
 
-	return -EINVAL;
+	return sdei_do_cross_call(_local_event_unregister, event);
 }
 
 int sdei_event_unregister(u32 event_num)
@@ -473,17 +561,50 @@ static int sdei_api_event_register(u32 event_num, unsigned long entry_point,
 			      flags, affinity, NULL);
 }
 
+/* Called directly by the hotplug callbacks */
+static void _local_event_register(void *data)
+{
+	int err;
+	struct sdei_registered_event *reg;
+	struct sdei_crosscall_args *arg = data;
+
+	WARN_ON(preemptible());
+
+	reg = per_cpu_ptr(arg->event->private_registered, smp_processor_id());
+	err = sdei_api_event_register(arg->event->event_num, sdei_entry_point,
+				      reg, 0, 0);
+
+	sdei_cross_call_return(arg, err);
+}
+
 static int _sdei_event_register(struct sdei_event *event)
 {
+	int err;
+
 	lockdep_assert_held(&sdei_events_lock);
 
+	spin_lock(&sdei_list_lock);
+	event->reregister = true;
+	spin_unlock(&sdei_list_lock);
+
 	if (event->type == SDEI_EVENT_TYPE_SHARED)
 		return sdei_api_event_register(event->event_num,
 					       sdei_entry_point,
 					       event->registered,
 					       SDEI_EVENT_REGISTER_RM_ANY, 0);
 
-	return -EINVAL;
+
+	err = sdei_do_cross_call(_local_event_register, event);
+	if (err) {
+		spin_lock(&sdei_list_lock);
+		event->reregister = false;
+		event->reenable = false;
+		spin_unlock(&sdei_list_lock);
+
+		sdei_do_cross_call(_local_event_unregister, event);
+	}
+
+	return err;
 }
 
 int sdei_event_register(u32 event_num, sdei_event_callback *cb, void *arg)
@@ -538,6 +659,8 @@ static int sdei_reregister_event(struct sdei_event *event)
 	if (event->reenable) {
 		if (event->type == SDEI_EVENT_TYPE_SHARED)
 			err = sdei_api_event_enable(event->event_num);
+		else
+			err = sdei_do_cross_call(_local_event_enable, event);
 	}
 
 	if (err)
@@ -569,6 +692,62 @@ static int sdei_reregister_shared(void)
 	return err;
 }
 
+static int sdei_cpuhp_down(unsigned int cpu)
+{
+	struct sdei_event *event;
+	struct sdei_crosscall_args arg;
+
+	/* un-register private events */
+	spin_lock(&sdei_list_lock);
+	list_for_each_entry(event, &sdei_list, list) {
+		if (event->type == SDEI_EVENT_TYPE_SHARED)
+			continue;
+
+		CROSSCALL_INIT(arg, event);
+		/* call the cross-call function locally... */
+		_local_event_unregister(&arg);
+		if (arg.first_error)
+			pr_err("Failed to unregister event %u: %d\n",
+			       event->event_num, arg.first_error);
+	}
+	spin_unlock(&sdei_list_lock);
+
+	return sdei_mask_local_cpu();
+}
+
+static int sdei_cpuhp_up(unsigned int cpu)
+{
+	struct sdei_event *event;
+	struct sdei_crosscall_args arg;
+
+	/* re-register/enable private events */
+	spin_lock(&sdei_list_lock);
+	list_for_each_entry(event, &sdei_list, list) {
+		if (event->type == SDEI_EVENT_TYPE_SHARED)
+			continue;
+
+		if (event->reregister) {
+			CROSSCALL_INIT(arg, event);
+			/* call the cross-call function locally... */
+			_local_event_register(&arg);
+			if (arg.first_error)
+				pr_err("Failed to re-register event %u: %d\n",
+				       event->event_num, arg.first_error);
+		}
+
+		if (event->reenable) {
+			CROSSCALL_INIT(arg, event);
+			_local_event_enable(&arg);
+			if (arg.first_error)
+				pr_err("Failed to re-enable event %u: %d\n",
+				       event->event_num, arg.first_error);
+		}
+	}
+	spin_unlock(&sdei_list_lock);
+
+	return sdei_unmask_local_cpu();
+}
+
 /* When entering idle, mask/unmask events for this cpu */
 static int sdei_pm_notifier(struct notifier_block *nb, unsigned long action,
 			    void *data)
@@ -622,6 +801,7 @@ static int sdei_device_freeze(struct device *dev)
 {
 	int err;
 
+	/* unregister private events */
 	cpuhp_remove_state(CPUHP_AP_ARM_SDEI_STARTING);
 
 	err = sdei_unregister_shared();

From fa31ab77ced9fbab87fbac4fca3682009b7f9be2 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:15 +0000
Subject: [PATCH 067/104] arm64: acpi: Remove __init from acpi_psci_use_hvc()
 for use by SDEI

SDEI inherits the 'use hvc' bit that is also used by PSCI. PSCI does all
its initialisation early, SDEI does its late.

Remove the __init annotation from acpi_psci_use_hvc().

Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/acpi.c | 2 +-
 include/linux/psci.h     | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index b3162715ed78..252396a96c78 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -117,7 +117,7 @@ bool __init acpi_psci_present(void)
 }
 
 /* Whether HVC must be used instead of SMC as the PSCI conduit */
-bool __init acpi_psci_use_hvc(void)
+bool acpi_psci_use_hvc(void)
 {
 	return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_USE_HVC;
 }
diff --git a/include/linux/psci.h b/include/linux/psci.h
index 6306ab10af18..f724fd8c78e8 100644
--- a/include/linux/psci.h
+++ b/include/linux/psci.h
@@ -47,10 +47,11 @@ static inline int psci_dt_init(void) { return 0; }
 #if defined(CONFIG_ARM_PSCI_FW) && defined(CONFIG_ACPI)
 int __init psci_acpi_init(void);
 bool __init acpi_psci_present(void);
-bool __init acpi_psci_use_hvc(void);
+bool acpi_psci_use_hvc(void);
 #else
 static inline int psci_acpi_init(void) { return 0; }
 static inline bool acpi_psci_present(void) { return false; }
+static inline bool acpi_psci_use_hvc(void) {return false; }
 #endif
 
 #endif /* __LINUX_PSCI_H */

From 677a60bd2003ff5517a0b502365112531446a3e3 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:16 +0000
Subject: [PATCH 068/104] firmware: arm_sdei: Discover SDEI support via ACPI

SDEI defines a new ACPI table to indicate the presence of the interface.
The conduit is discovered in the same way as PSCI.

For ACPI we need to create the platform device ourselves as SDEI doesn't
have an entry in the DSDT.

The SDEI platform device should be created after ACPI has been initialised
so that we can parse the table, but before GHES devices are created, which
may register SDE events if they use SDEI as their notification type.

Reviewed-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/firmware/arm_sdei.c | 41 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c
index 0c735363f54d..8f6563c595e5 100644
--- a/drivers/firmware/arm_sdei.c
+++ b/drivers/firmware/arm_sdei.c
@@ -908,6 +908,14 @@ static int sdei_get_conduit(struct platform_device *pdev)
 		}
 
 		pr_warn("invalid \"method\" property: %s\n", method);
+	} else if (IS_ENABLED(CONFIG_ACPI) && !acpi_disabled) {
+		if (acpi_psci_use_hvc()) {
+			sdei_firmware_call = &sdei_smccc_hvc;
+			return CONDUIT_HVC;
+		} else {
+			sdei_firmware_call = &sdei_smccc_smc;
+			return CONDUIT_SMC;
+		}
 	}
 
 	return CONDUIT_INVALID;
@@ -1021,14 +1029,45 @@ static bool __init sdei_present_dt(void)
 	return true;
 }
 
+static bool __init sdei_present_acpi(void)
+{
+	acpi_status status;
+	struct platform_device *pdev;
+	struct acpi_table_header *sdei_table_header;
+
+	if (acpi_disabled)
+		return false;
+
+	status = acpi_get_table(ACPI_SIG_SDEI, 0, &sdei_table_header);
+	if (ACPI_FAILURE(status) && status != AE_NOT_FOUND) {
+		const char *msg = acpi_format_exception(status);
+
+		pr_info("Failed to get ACPI:SDEI table, %s\n", msg);
+	}
+	if (ACPI_FAILURE(status))
+		return false;
+
+	pdev = platform_device_register_simple(sdei_driver.driver.name, 0, NULL,
+					       0);
+	if (IS_ERR(pdev))
+		return false;
+
+	return true;
+}
+
 static int __init sdei_init(void)
 {
-	if (sdei_present_dt())
+	if (sdei_present_dt() || sdei_present_acpi())
 		platform_driver_register(&sdei_driver);
 
 	return 0;
 }
 
+/*
+ * On an ACPI system SDEI needs to be ready before HEST:GHES tries to register
+ * its events. ACPI is initialised from a subsys_initcall(), GHES is initialised
+ * by device_initcall(). We want to be called in the middle.
+ */
 subsys_initcall_sync(sdei_init);
 
 int sdei_event_handler(struct pt_regs *regs,

From 83f8ee3a73f551aebb5b0bb029feaf6936615730 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:17 +0000
Subject: [PATCH 069/104] arm64: mmu: add the entry trampolines start/end
 section markers into sections.h

SDEI needs to calculate an offset in the trampoline page too. Move
the extern char[] to sections.h.

This patch just moves code around.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/sections.h | 1 +
 arch/arm64/mm/mmu.c               | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h
index 941267caa39c..caab039d6305 100644
--- a/arch/arm64/include/asm/sections.h
+++ b/arch/arm64/include/asm/sections.h
@@ -28,5 +28,6 @@ extern char __initdata_begin[], __initdata_end[];
 extern char __inittext_begin[], __inittext_end[];
 extern char __irqentry_text_start[], __irqentry_text_end[];
 extern char __mmuoff_data_start[], __mmuoff_data_end[];
+extern char __entry_tramp_text_start[], __entry_tramp_text_end[];
 
 #endif /* __ASM_SECTIONS_H */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 4071602031ed..62d7abb2f710 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -529,8 +529,6 @@ early_param("rodata", parse_rodata);
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 static int __init map_entry_trampoline(void)
 {
-	extern char __entry_tramp_text_start[];
-
 	pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
 	phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start);
 

From 79e9aa59dc29a995921fb01e64cd36b73cf5abe0 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:18 +0000
Subject: [PATCH 070/104] arm64: sdei: Add trampoline code for remapping the
 kernel

When CONFIG_UNMAP_KERNEL_AT_EL0 is set the SDEI entry point and the rest
of the kernel may be unmapped when we take an event. If this may be the
case, use an entry trampoline that can switch to the kernel page tables.

We can't use the provided PSTATE to determine whether to switch page
tables as we may have interrupted the kernel's entry trampoline, (or a
normal-priority event that interrupted the kernel's entry trampoline).
Instead test for a user ASID in ttbr1_el1.

Save a value in regs->addr_limit to indicate whether we need to restore
the original ASID when returning from this event. This value is only used
by do_page_fault(), which we don't call with the SDEI regs.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/mmu.h  |  3 +-
 arch/arm64/include/asm/sdei.h |  6 +++
 arch/arm64/kernel/entry.S     | 98 +++++++++++++++++++++++++++++++----
 arch/arm64/kernel/sdei.c      | 20 ++++++-
 4 files changed, 113 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 6dd83d75b82a..a050d4f3615d 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -17,7 +17,8 @@
 #define __ASM_MMU_H
 
 #define MMCF_AARCH32	0x1	/* mm context flag for AArch32 executables */
-#define USER_ASID_FLAG	(UL(1) << 48)
+#define USER_ASID_BIT	48
+#define USER_ASID_FLAG	(UL(1) << USER_ASID_BIT)
 #define TTBR_ASID_MASK	(UL(0xffff) << 48)
 
 #ifndef __ASSEMBLY__
diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h
index d58a31ab525a..e073e6886685 100644
--- a/arch/arm64/include/asm/sdei.h
+++ b/arch/arm64/include/asm/sdei.h
@@ -23,6 +23,12 @@ extern unsigned long sdei_exit_mode;
 asmlinkage void __sdei_asm_handler(unsigned long event_num, unsigned long arg,
 				   unsigned long pc, unsigned long pstate);
 
+/* and its CONFIG_UNMAP_KERNEL_AT_EL0 trampoline */
+asmlinkage void __sdei_asm_entry_trampoline(unsigned long event_num,
+						   unsigned long arg,
+						   unsigned long pc,
+						   unsigned long pstate);
+
 /*
  * The above entry point does the minimum to call C code. This function does
  * anything else, before calling the driver.
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 40bf5083d182..812416a09a9c 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -1159,6 +1159,78 @@ NOKPROBE(ret_from_fork)
 #include <asm/sdei.h>
 #include <uapi/linux/arm_sdei.h>
 
+.macro sdei_handler_exit exit_mode
+	/* On success, this call never returns... */
+	cmp	\exit_mode, #SDEI_EXIT_SMC
+	b.ne	99f
+	smc	#0
+	b	.
+99:	hvc	#0
+	b	.
+.endm
+
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+/*
+ * The regular SDEI entry point may have been unmapped along with the rest of
+ * the kernel. This trampoline restores the kernel mapping to make the x1 memory
+ * argument accessible.
+ *
+ * This clobbers x4, __sdei_handler() will restore this from firmware's
+ * copy.
+ */
+.ltorg
+.pushsection ".entry.tramp.text", "ax"
+ENTRY(__sdei_asm_entry_trampoline)
+	mrs	x4, ttbr1_el1
+	tbz	x4, #USER_ASID_BIT, 1f
+
+	tramp_map_kernel tmp=x4
+	isb
+	mov	x4, xzr
+
+	/*
+	 * Use reg->interrupted_regs.addr_limit to remember whether to unmap
+	 * the kernel on exit.
+	 */
+1:	str	x4, [x1, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)]
+
+#ifdef CONFIG_RANDOMIZE_BASE
+	adr	x4, tramp_vectors + PAGE_SIZE
+	add	x4, x4, #:lo12:__sdei_asm_trampoline_next_handler
+	ldr	x4, [x4]
+#else
+	ldr	x4, =__sdei_asm_handler
+#endif
+	br	x4
+ENDPROC(__sdei_asm_entry_trampoline)
+NOKPROBE(__sdei_asm_entry_trampoline)
+
+/*
+ * Make the exit call and restore the original ttbr1_el1
+ *
+ * x0 & x1: setup for the exit API call
+ * x2: exit_mode
+ * x4: struct sdei_registered_event argument from registration time.
+ */
+ENTRY(__sdei_asm_exit_trampoline)
+	ldr	x4, [x4, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)]
+	cbnz	x4, 1f
+
+	tramp_unmap_kernel	tmp=x4
+
+1:	sdei_handler_exit exit_mode=x2
+ENDPROC(__sdei_asm_exit_trampoline)
+NOKPROBE(__sdei_asm_exit_trampoline)
+	.ltorg
+.popsection		// .entry.tramp.text
+#ifdef CONFIG_RANDOMIZE_BASE
+.pushsection ".rodata", "a"
+__sdei_asm_trampoline_next_handler:
+	.quad	__sdei_asm_handler
+.popsection		// .rodata
+#endif /* CONFIG_RANDOMIZE_BASE */
+#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
+
 /*
  * Software Delegated Exception entry point.
  *
@@ -1166,6 +1238,7 @@ NOKPROBE(ret_from_fork)
  * x1: struct sdei_registered_event argument from registration time.
  * x2: interrupted PC
  * x3: interrupted PSTATE
+ * x4: maybe clobbered by the trampoline
  *
  * Firmware has preserved x0->x17 for us, we must save/restore the rest to
  * follow SMC-CC. We save (or retrieve) all the registers as the handler may
@@ -1231,10 +1304,11 @@ ENTRY(__sdei_asm_handler)
 
 	msr	sp_el0, x28
 	/* restore regs >x17 that we clobbered */
-	ldp     x28, x29, [x19, #SDEI_EVENT_INTREGS + 16 * 14]
-	ldp     lr, x4, [x19, #SDEI_EVENT_INTREGS + S_LR]
-	mov	sp, x4
-	ldp     x18, x19, [x19, #SDEI_EVENT_INTREGS + 16 * 9]
+	mov	x4, x19         // keep x4 for __sdei_asm_exit_trampoline
+	ldp	x28, x29, [x4, #SDEI_EVENT_INTREGS + 16 * 14]
+	ldp	x18, x19, [x4, #SDEI_EVENT_INTREGS + 16 * 9]
+	ldp	lr, x1, [x4, #SDEI_EVENT_INTREGS + S_LR]
+	mov	sp, x1
 
 	mov	x1, x0			// address to complete_and_resume
 	/* x0 = (x0 <= 1) ? EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME */
@@ -1243,14 +1317,16 @@ ENTRY(__sdei_asm_handler)
 	mov_q	x3, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME
 	csel	x0, x2, x3, ls
 
-	/* On success, this call never returns... */
 	ldr_l	x2, sdei_exit_mode
-	cmp	x2, #SDEI_EXIT_SMC
-	b.ne	1f
-	smc	#0
-	b	.
-1:	hvc	#0
-	b	.
+
+alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0
+	sdei_handler_exit exit_mode=x2
+alternative_else_nop_endif
+
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+	tramp_alias	dst=x5, sym=__sdei_asm_exit_trampoline
+	br	x5
+#endif
 ENDPROC(__sdei_asm_handler)
 NOKPROBE(__sdei_asm_handler)
 #endif /* CONFIG_ARM_SDE_INTERFACE */
diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c
index f9dffacaa5d6..6b8d90d5ceae 100644
--- a/arch/arm64/kernel/sdei.c
+++ b/arch/arm64/kernel/sdei.c
@@ -10,7 +10,9 @@
 
 #include <asm/alternative.h>
 #include <asm/kprobes.h>
+#include <asm/mmu.h>
 #include <asm/ptrace.h>
+#include <asm/sections.h>
 #include <asm/sysreg.h>
 #include <asm/vmap_stack.h>
 
@@ -124,7 +126,18 @@ unsigned long sdei_arch_get_entry_point(int conduit)
 	}
 
 	sdei_exit_mode = (conduit == CONDUIT_HVC) ? SDEI_EXIT_HVC : SDEI_EXIT_SMC;
-	return (unsigned long)__sdei_asm_handler;
+
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+	if (arm64_kernel_unmapped_at_el0()) {
+		unsigned long offset;
+
+		offset = (unsigned long)__sdei_asm_entry_trampoline -
+			 (unsigned long)__entry_tramp_text_start;
+		return TRAMP_VALIAS + offset;
+	} else
+#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
+		return (unsigned long)__sdei_asm_handler;
+
 }
 
 /*
@@ -138,11 +151,14 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs,
 {
 	u32 mode;
 	int i, err = 0;
-	const int clobbered_registers = 4;
+	int clobbered_registers = 4;
 	u64 elr = read_sysreg(elr_el1);
 	u32 kernel_mode = read_sysreg(CurrentEL) | 1;	/* +SPSel */
 	unsigned long vbar = read_sysreg(vbar_el1);
 
+	if (arm64_kernel_unmapped_at_el0())
+		clobbered_registers++;
+
 	/* Retrieve the missing registers values */
 	for (i = 0; i < clobbered_registers; i++) {
 		/* from within the handler, this call always succeeds */

From 9dfe4828aa32c49856dffd6cd31297f3466caa0d Mon Sep 17 00:00:00 2001
From: Steve Capper <steve.capper@arm.com>
Date: Thu, 11 Jan 2018 10:11:57 +0000
Subject: [PATCH 071/104] arm64: Re-order reserved_ttbr0 in linker script

Currently one resolves the location of the reserved_ttbr0 for PAN by
taking a positive offset from swapper_pg_dir. In a future patch we wish
to extend the swapper s.t. its size is determined at link time rather
than comile time, rendering SWAPPER_DIR_SIZE unsuitable for such a low
level calculation.

In this patch we re-arrange the order of the linker script s.t. instead
one computes reserved_ttbr0 by subtracting RESERVED_TTBR0_SIZE from
swapper_pg_dir.

Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Steve Capper <steve.capper@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/asm-uaccess.h | 8 ++++----
 arch/arm64/include/asm/uaccess.h     | 4 ++--
 arch/arm64/kernel/vmlinux.lds.S      | 5 ++---
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index f4f234b6155e..8719ce122a38 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -13,11 +13,11 @@
  */
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 	.macro	__uaccess_ttbr0_disable, tmp1
-	mrs	\tmp1, ttbr1_el1		// swapper_pg_dir
-	add	\tmp1, \tmp1, #SWAPPER_DIR_SIZE	// reserved_ttbr0 at the end of swapper_pg_dir
-	msr	ttbr0_el1, \tmp1		// set reserved TTBR0_EL1
+	mrs	\tmp1, ttbr1_el1			// swapper_pg_dir
+	sub	\tmp1, \tmp1, #RESERVED_TTBR0_SIZE	// reserved_ttbr0 just before swapper_pg_dir
+	msr	ttbr0_el1, \tmp1			// set reserved TTBR0_EL1
 	isb
-	sub	\tmp1, \tmp1, #SWAPPER_DIR_SIZE
+	add	\tmp1, \tmp1, #RESERVED_TTBR0_SIZE
 	bic	\tmp1, \tmp1, #TTBR_ASID_MASK
 	msr	ttbr1_el1, \tmp1		// set reserved ASID
 	isb
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 3821fab01d7d..01ade20d5456 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -108,8 +108,8 @@ static inline void __uaccess_ttbr0_disable(void)
 	unsigned long ttbr;
 
 	ttbr = read_sysreg(ttbr1_el1);
-	/* reserved_ttbr0 placed at the end of swapper_pg_dir */
-	write_sysreg(ttbr + SWAPPER_DIR_SIZE, ttbr0_el1);
+	/* reserved_ttbr0 placed before swapper_pg_dir */
+	write_sysreg(ttbr - RESERVED_TTBR0_SIZE, ttbr0_el1);
 	isb();
 	/* Set reserved ASID */
 	ttbr &= ~TTBR_ASID_MASK;
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index ddfd3c0942f7..8e567de8f369 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -218,13 +218,12 @@ SECTIONS
 	. = ALIGN(PAGE_SIZE);
 	idmap_pg_dir = .;
 	. += IDMAP_DIR_SIZE;
-	swapper_pg_dir = .;
-	. += SWAPPER_DIR_SIZE;
-
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 	reserved_ttbr0 = .;
 	. += RESERVED_TTBR0_SIZE;
 #endif
+	swapper_pg_dir = .;
+	. += SWAPPER_DIR_SIZE;
 
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 	tramp_pg_dir = .;

From 1e1b8c04fa3451e2b7190930adae43c95f0fae31 Mon Sep 17 00:00:00 2001
From: Steve Capper <steve.capper@arm.com>
Date: Thu, 11 Jan 2018 10:11:58 +0000
Subject: [PATCH 072/104] arm64: entry: Move the trampoline to be before PAN

The trampoline page tables are positioned after the early page tables in
the kernel linker script.

As we are about to change the early page table logic to resolve the
swapper size at link time as opposed to compile time, the
SWAPPER_DIR_SIZE variable (currently used to locate the trampline)
will be rendered unsuitable for low level assembler.

This patch solves this issue by moving the trampoline before the PAN
page tables. The offset to the trampoline from ttbr1 can then be
expressed by: PAGE_SIZE + RESERVED_TTBR0_SIZE, which is available to the
entry assembler.

Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Steve Capper <steve.capper@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/entry.S       |  4 ++--
 arch/arm64/kernel/vmlinux.lds.S | 11 ++++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 812416a09a9c..1e1d1842888d 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -990,7 +990,7 @@ __ni_sys_trace:
 
 	.macro tramp_map_kernel, tmp
 	mrs	\tmp, ttbr1_el1
-	sub	\tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE)
+	add	\tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE)
 	bic	\tmp, \tmp, #USER_ASID_FLAG
 	msr	ttbr1_el1, \tmp
 #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003
@@ -1009,7 +1009,7 @@ alternative_else_nop_endif
 
 	.macro tramp_unmap_kernel, tmp
 	mrs	\tmp, ttbr1_el1
-	add	\tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE)
+	sub	\tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE)
 	orr	\tmp, \tmp, #USER_ASID_FLAG
 	msr	ttbr1_el1, \tmp
 	/*
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 8e567de8f369..4c7112a47469 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -218,6 +218,12 @@ SECTIONS
 	. = ALIGN(PAGE_SIZE);
 	idmap_pg_dir = .;
 	. += IDMAP_DIR_SIZE;
+
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+	tramp_pg_dir = .;
+	. += PAGE_SIZE;
+#endif
+
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 	reserved_ttbr0 = .;
 	. += RESERVED_TTBR0_SIZE;
@@ -225,11 +231,6 @@ SECTIONS
 	swapper_pg_dir = .;
 	. += SWAPPER_DIR_SIZE;
 
-#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
-	tramp_pg_dir = .;
-	. += PAGE_SIZE;
-#endif
-
 	__pecoff_data_size = ABSOLUTE(. - __initdata_begin);
 	_end = .;
 

From 0370b31e48454d8cf11120664aedd1c51b3004cb Mon Sep 17 00:00:00 2001
From: Steve Capper <steve.capper@arm.com>
Date: Thu, 11 Jan 2018 10:11:59 +0000
Subject: [PATCH 073/104] arm64: Extend early page table code to allow for
 larger kernels

Currently the early assembler page table code assumes that precisely
1xpgd, 1xpud, 1xpmd are sufficient to represent the early kernel text
mappings.

Unfortunately this is rarely the case when running with a 16KB granule,
and we also run into limits with 4KB granule when building much larger
kernels.

This patch re-writes the early page table logic to compute indices of
mappings for each level of page table, and if multiple indices are
required, the next-level page table is scaled up accordingly.

Also the required size of the swapper_pg_dir is computed at link time
to cover the mapping [KIMAGE_ADDR + VOFFSET, _end]. When KASLR is
enabled, an extra page is set aside for each level that may require extra
entries at runtime.

Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Steve Capper <steve.capper@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/kernel-pgtable.h |  47 +++++++-
 arch/arm64/include/asm/pgtable.h        |   1 +
 arch/arm64/kernel/head.S                | 144 +++++++++++++++++-------
 arch/arm64/kernel/vmlinux.lds.S         |   1 +
 arch/arm64/mm/mmu.c                     |   3 +-
 5 files changed, 156 insertions(+), 40 deletions(-)

diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index 77a27af01371..82386e860dd2 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -52,7 +52,52 @@
 #define IDMAP_PGTABLE_LEVELS	(ARM64_HW_PGTABLE_LEVELS(PHYS_MASK_SHIFT))
 #endif
 
-#define SWAPPER_DIR_SIZE	(SWAPPER_PGTABLE_LEVELS * PAGE_SIZE)
+
+/*
+ * If KASLR is enabled, then an offset K is added to the kernel address
+ * space. The bottom 21 bits of this offset are zero to guarantee 2MB
+ * alignment for PA and VA.
+ *
+ * For each pagetable level of the swapper, we know that the shift will
+ * be larger than 21 (for the 4KB granule case we use section maps thus
+ * the smallest shift is actually 30) thus there is the possibility that
+ * KASLR can increase the number of pagetable entries by 1, so we make
+ * room for this extra entry.
+ *
+ * Note KASLR cannot increase the number of required entries for a level
+ * by more than one because it increments both the virtual start and end
+ * addresses equally (the extra entry comes from the case where the end
+ * address is just pushed over a boundary and the start address isn't).
+ */
+
+#ifdef CONFIG_RANDOMIZE_BASE
+#define EARLY_KASLR	(1)
+#else
+#define EARLY_KASLR	(0)
+#endif
+
+#define EARLY_ENTRIES(vstart, vend, shift) (((vend) >> (shift)) \
+					- ((vstart) >> (shift)) + 1 + EARLY_KASLR)
+
+#define EARLY_PGDS(vstart, vend) (EARLY_ENTRIES(vstart, vend, PGDIR_SHIFT))
+
+#if SWAPPER_PGTABLE_LEVELS > 3
+#define EARLY_PUDS(vstart, vend) (EARLY_ENTRIES(vstart, vend, PUD_SHIFT))
+#else
+#define EARLY_PUDS(vstart, vend) (0)
+#endif
+
+#if SWAPPER_PGTABLE_LEVELS > 2
+#define EARLY_PMDS(vstart, vend) (EARLY_ENTRIES(vstart, vend, SWAPPER_TABLE_SHIFT))
+#else
+#define EARLY_PMDS(vstart, vend) (0)
+#endif
+
+#define EARLY_PAGES(vstart, vend) ( 1 			/* PGDIR page */				\
+			+ EARLY_PGDS((vstart), (vend)) 	/* each PGDIR needs a next level page table */	\
+			+ EARLY_PUDS((vstart), (vend))	/* each PUD needs a next level page table */	\
+			+ EARLY_PMDS((vstart), (vend)))	/* each PMD needs a next level page table */
+#define SWAPPER_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR + TEXT_OFFSET, _end))
 #define IDMAP_DIR_SIZE		(IDMAP_PGTABLE_LEVELS * PAGE_SIZE)
 
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index bfa237e892f1..54b0a8398055 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -706,6 +706,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 #endif
 
 extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
+extern pgd_t swapper_pg_end[];
 extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
 extern pgd_t tramp_pg_dir[PTRS_PER_PGD];
 
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 66f01869e97c..95748a00eb89 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -191,44 +191,109 @@ ENDPROC(preserve_boot_args)
 	.endm
 
 /*
- * Macro to populate the PGD (and possibily PUD) for the corresponding
- * block entry in the next level (tbl) for the given virtual address.
+ * Macro to populate page table entries, these entries can be pointers to the next level
+ * or last level entries pointing to physical memory.
  *
- * Preserves:	tbl, next, virt
- * Corrupts:	ptrs_per_pgd, tmp1, tmp2
+ *	tbl:	page table address
+ *	rtbl:	pointer to page table or physical memory
+ *	index:	start index to write
+ *	eindex:	end index to write - [index, eindex] written to
+ *	flags:	flags for pagetable entry to or in
+ *	inc:	increment to rtbl between each entry
+ *	tmp1:	temporary variable
+ *
+ * Preserves:	tbl, eindex, flags, inc
+ * Corrupts:	index, tmp1
+ * Returns:	rtbl
  */
-	.macro	create_pgd_entry, tbl, virt, ptrs_per_pgd, tmp1, tmp2
-	create_table_entry \tbl, \virt, PGDIR_SHIFT, \ptrs_per_pgd, \tmp1, \tmp2
-#if SWAPPER_PGTABLE_LEVELS > 3
-	mov	\ptrs_per_pgd, PTRS_PER_PUD
-	create_table_entry \tbl, \virt, PUD_SHIFT, \ptrs_per_pgd, \tmp1, \tmp2
-#endif
-#if SWAPPER_PGTABLE_LEVELS > 2
-	mov	\ptrs_per_pgd, PTRS_PER_PTE
-	create_table_entry \tbl, \virt, SWAPPER_TABLE_SHIFT, \ptrs_per_pgd, \tmp1, \tmp2
-#endif
+	.macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1
+.Lpe\@:	phys_to_pte \rtbl, \tmp1
+	orr	\tmp1, \tmp1, \flags	// tmp1 = table entry
+	str	\tmp1, [\tbl, \index, lsl #3]
+	add	\rtbl, \rtbl, \inc	// rtbl = pa next level
+	add	\index, \index, #1
+	cmp	\index, \eindex
+	b.ls	.Lpe\@
 	.endm
 
 /*
- * Macro to populate block entries in the page table for the start..end
- * virtual range (inclusive).
+ * Compute indices of table entries from virtual address range. If multiple entries
+ * were needed in the previous page table level then the next page table level is assumed
+ * to be composed of multiple pages. (This effectively scales the end index).
  *
- * Preserves:	tbl, flags
- * Corrupts:	phys, start, end, tmp, pstate
+ *	vstart:	virtual address of start of range
+ *	vend:	virtual address of end of range
+ *	shift:	shift used to transform virtual address into index
+ *	ptrs:	number of entries in page table
+ *	istart:	index in table corresponding to vstart
+ *	iend:	index in table corresponding to vend
+ *	count:	On entry: how many extra entries were required in previous level, scales
+ *			  our end index.
+ *		On exit: returns how many extra entries required for next page table level
+ *
+ * Preserves:	vstart, vend, shift, ptrs
+ * Returns:	istart, iend, count
  */
-	.macro	create_block_map, tbl, flags, phys, start, end, tmp
-	lsr	\start, \start, #SWAPPER_BLOCK_SHIFT
-	and	\start, \start, #PTRS_PER_PTE - 1	// table index
-	bic	\phys, \phys, #SWAPPER_BLOCK_SIZE - 1
-	lsr	\end, \end, #SWAPPER_BLOCK_SHIFT
-	and	\end, \end, #PTRS_PER_PTE - 1		// table end index
-9999:	phys_to_pte \phys, \tmp
-	orr	\tmp, \tmp, \flags			// table entry
-	str	\tmp, [\tbl, \start, lsl #3]		// store the entry
-	add	\start, \start, #1			// next entry
-	add	\phys, \phys, #SWAPPER_BLOCK_SIZE		// next block
-	cmp	\start, \end
-	b.ls	9999b
+	.macro compute_indices, vstart, vend, shift, ptrs, istart, iend, count
+	lsr	\iend, \vend, \shift
+	mov	\istart, \ptrs
+	sub	\istart, \istart, #1
+	and	\iend, \iend, \istart	// iend = (vend >> shift) & (ptrs - 1)
+	mov	\istart, \ptrs
+	mul	\istart, \istart, \count
+	add	\iend, \iend, \istart	// iend += (count - 1) * ptrs
+					// our entries span multiple tables
+
+	lsr	\istart, \vstart, \shift
+	mov	\count, \ptrs
+	sub	\count, \count, #1
+	and	\istart, \istart, \count
+
+	sub	\count, \iend, \istart
+	.endm
+
+/*
+ * Map memory for specified virtual address range. Each level of page table needed supports
+ * multiple entries. If a level requires n entries the next page table level is assumed to be
+ * formed from n pages.
+ *
+ *	tbl:	location of page table
+ *	rtbl:	address to be used for first level page table entry (typically tbl + PAGE_SIZE)
+ *	vstart:	start address to map
+ *	vend:	end address to map - we map [vstart, vend]
+ *	flags:	flags to use to map last level entries
+ *	phys:	physical address corresponding to vstart - physical memory is contiguous
+ *	pgds:	the number of pgd entries
+ *
+ * Temporaries:	istart, iend, tmp, count, sv - these need to be different registers
+ * Preserves:	vstart, vend, flags
+ * Corrupts:	tbl, rtbl, istart, iend, tmp, count, sv
+ */
+	.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv
+	add \rtbl, \tbl, #PAGE_SIZE
+	mov \sv, \rtbl
+	mov \count, #0
+	compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count
+	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
+	mov \tbl, \sv
+	mov \sv, \rtbl
+
+#if SWAPPER_PGTABLE_LEVELS > 3
+	compute_indices \vstart, \vend, #PUD_SHIFT, #PTRS_PER_PUD, \istart, \iend, \count
+	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
+	mov \tbl, \sv
+	mov \sv, \rtbl
+#endif
+
+#if SWAPPER_PGTABLE_LEVELS > 2
+	compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count
+	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
+	mov \tbl, \sv
+#endif
+
+	compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count
+	bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1
+	populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp
 	.endm
 
 /*
@@ -246,14 +311,16 @@ __create_page_tables:
 	 * dirty cache lines being evicted.
 	 */
 	adrp	x0, idmap_pg_dir
-	ldr	x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE)
+	adrp	x1, swapper_pg_end
+	sub	x1, x1, x0
 	bl	__inval_dcache_area
 
 	/*
 	 * Clear the idmap and swapper page tables.
 	 */
 	adrp	x0, idmap_pg_dir
-	ldr	x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE)
+	adrp	x1, swapper_pg_end
+	sub	x1, x1, x0
 1:	stp	xzr, xzr, [x0], #16
 	stp	xzr, xzr, [x0], #16
 	stp	xzr, xzr, [x0], #16
@@ -318,10 +385,10 @@ __create_page_tables:
 #endif
 1:
 	ldr_l	x4, idmap_ptrs_per_pgd
-	create_pgd_entry x0, x3, x4, x5, x6
 	mov	x5, x3				// __pa(__idmap_text_start)
 	adr_l	x6, __idmap_text_end		// __pa(__idmap_text_end)
-	create_block_map x0, x7, x3, x5, x6, x4
+
+	map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14
 
 	/*
 	 * Map the kernel image (starting with PHYS_OFFSET).
@@ -330,12 +397,12 @@ __create_page_tables:
 	mov_q	x5, KIMAGE_VADDR + TEXT_OFFSET	// compile time __va(_text)
 	add	x5, x5, x23			// add KASLR displacement
 	mov	x4, PTRS_PER_PGD
-	create_pgd_entry x0, x5, x4, x3, x6
 	adrp	x6, _end			// runtime __pa(_end)
 	adrp	x3, _text			// runtime __pa(_text)
 	sub	x6, x6, x3			// _end - _text
 	add	x6, x6, x5			// runtime __va(_end)
-	create_block_map x0, x7, x3, x5, x6, x4
+
+	map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14
 
 	/*
 	 * Since the page tables have been populated with non-cacheable
@@ -343,7 +410,8 @@ __create_page_tables:
 	 * tables again to remove any speculatively loaded cache lines.
 	 */
 	adrp	x0, idmap_pg_dir
-	ldr	x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE)
+	adrp	x1, swapper_pg_end
+	sub	x1, x1, x0
 	dmb	sy
 	bl	__inval_dcache_area
 
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 4c7112a47469..0221aca6493d 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -230,6 +230,7 @@ SECTIONS
 #endif
 	swapper_pg_dir = .;
 	. += SWAPPER_DIR_SIZE;
+	swapper_pg_end = .;
 
 	__pecoff_data_size = ABSOLUTE(. - __initdata_begin);
 	_end = .;
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 62d7abb2f710..b44992ec9643 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -642,7 +642,8 @@ void __init paging_init(void)
 	 * allocated with it.
 	 */
 	memblock_free(__pa_symbol(swapper_pg_dir) + PAGE_SIZE,
-		      SWAPPER_DIR_SIZE - PAGE_SIZE);
+		      __pa_symbol(swapper_pg_end) - __pa_symbol(swapper_pg_dir)
+		      - PAGE_SIZE);
 }
 
 /*

From bb48711800e6d7aedbf4dfd3367e0ab1270a6bea Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Wed, 13 Dec 2017 14:19:37 -0800
Subject: [PATCH 074/104] arm64: cpu_errata: Add Kryo to Falkor 1003 errata

The Kryo CPUs are also affected by the Falkor 1003 errata, so
we need to do the same workaround on Kryo CPUs. The MIDR is
slightly more complicated here, where the PART number is not
always the same when looking at all the bits from 15 to 4. Drop
the lower 8 bits and just look at the top 4 to see if it's '2'
and then consider those as Kryo CPUs. This covers all the
combinations without having to list them all out.

Fixes: 38fd94b0275c ("arm64: Work around Falkor erratum 1003")
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/arm64/silicon-errata.txt |  2 +-
 arch/arm64/include/asm/cputype.h       |  2 ++
 arch/arm64/kernel/cpu_errata.c         | 21 +++++++++++++++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt
index 304bf22bb83c..b9d93e981a05 100644
--- a/Documentation/arm64/silicon-errata.txt
+++ b/Documentation/arm64/silicon-errata.txt
@@ -72,6 +72,6 @@ stable kernels.
 | Hisilicon      | Hip0{6,7}       | #161010701      | N/A                         |
 | Hisilicon      | Hip07           | #161600802      | HISILICON_ERRATUM_161600802 |
 |                |                 |                 |                             |
-| Qualcomm Tech. | Falkor v1       | E1003           | QCOM_FALKOR_ERRATUM_1003    |
+| Qualcomm Tech. | Kryo/Falkor v1  | E1003           | QCOM_FALKOR_ERRATUM_1003    |
 | Qualcomm Tech. | Falkor v1       | E1009           | QCOM_FALKOR_ERRATUM_1009    |
 | Qualcomm Tech. | QDF2400 ITS     | E0065           | QCOM_QDF2400_ERRATUM_0065   |
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index cce5735a677c..2f8d39ed9c2e 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -94,6 +94,7 @@
 #define BRCM_CPU_PART_VULCAN		0x516
 
 #define QCOM_CPU_PART_FALKOR_V1		0x800
+#define QCOM_CPU_PART_KRYO		0x200
 
 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
 #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
@@ -106,6 +107,7 @@
 #define MIDR_CAVIUM_THUNDERX2 MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX2)
 #define MIDR_BRCM_VULCAN MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_VULCAN)
 #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1)
+#define MIDR_QCOM_KRYO MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 70e5f1826fd9..90a9e465339c 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -30,6 +30,20 @@ is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope)
 				       entry->midr_range_max);
 }
 
+static bool __maybe_unused
+is_kryo_midr(const struct arm64_cpu_capabilities *entry, int scope)
+{
+	u32 model;
+
+	WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
+
+	model = read_cpuid_id();
+	model &= MIDR_IMPLEMENTOR_MASK | (0xf00 << MIDR_PARTNUM_SHIFT) |
+		 MIDR_ARCHITECTURE_MASK;
+
+	return model == entry->midr_model;
+}
+
 static bool
 has_mismatched_cache_line_size(const struct arm64_cpu_capabilities *entry,
 				int scope)
@@ -290,6 +304,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 			   MIDR_CPU_VAR_REV(0, 0),
 			   MIDR_CPU_VAR_REV(0, 0)),
 	},
+	{
+		.desc = "Qualcomm Technologies Kryo erratum 1003",
+		.capability = ARM64_WORKAROUND_QCOM_FALKOR_E1003,
+		.def_scope = SCOPE_LOCAL_CPU,
+		.midr_model = MIDR_QCOM_KRYO,
+		.matches = is_kryo_midr,
+	},
 #endif
 #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1009
 	{

From 67948af41f2e6818edeeba5182811c704d484949 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Tue, 9 Jan 2018 16:12:18 +0000
Subject: [PATCH 075/104] arm64: capabilities: Handle duplicate entries for a
 capability

Sometimes a single capability could be listed multiple times with
differing matches(), e.g, CPU errata for different MIDR versions.
This breaks verify_local_cpu_feature() and this_cpu_has_cap() as
we stop checking for a capability on a CPU with the first
entry in the given table, which is not sufficient. Make sure we
run the checks for all entries of the same capability. We do
this by fixing __this_cpu_has_cap() to run through all the
entries in the given table for a match and reuse it for
verify_local_cpu_feature().

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/cpufeature.c | 44 ++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9ef84d0def9a..9f4491d16cfb 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1137,6 +1137,26 @@ static void __init setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps)
 			cap_set_elf_hwcap(hwcaps);
 }
 
+/*
+ * Check if the current CPU has a given feature capability.
+ * Should be called from non-preemptible context.
+ */
+static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array,
+			       unsigned int cap)
+{
+	const struct arm64_cpu_capabilities *caps;
+
+	if (WARN_ON(preemptible()))
+		return false;
+
+	for (caps = cap_array; caps->desc; caps++)
+		if (caps->capability == cap &&
+		    caps->matches &&
+		    caps->matches(caps, SCOPE_LOCAL_CPU))
+			return true;
+	return false;
+}
+
 void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps,
 			    const char *info)
 {
@@ -1200,8 +1220,9 @@ verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps)
 }
 
 static void
-verify_local_cpu_features(const struct arm64_cpu_capabilities *caps)
+verify_local_cpu_features(const struct arm64_cpu_capabilities *caps_list)
 {
+	const struct arm64_cpu_capabilities *caps = caps_list;
 	for (; caps->matches; caps++) {
 		if (!cpus_have_cap(caps->capability))
 			continue;
@@ -1209,7 +1230,7 @@ verify_local_cpu_features(const struct arm64_cpu_capabilities *caps)
 		 * If the new CPU misses an advertised feature, we cannot proceed
 		 * further, park the cpu.
 		 */
-		if (!caps->matches(caps, SCOPE_LOCAL_CPU)) {
+		if (!__this_cpu_has_cap(caps_list, caps->capability)) {
 			pr_crit("CPU%d: missing feature: %s\n",
 					smp_processor_id(), caps->desc);
 			cpu_die_early();
@@ -1291,25 +1312,6 @@ static void __init mark_const_caps_ready(void)
 	static_branch_enable(&arm64_const_caps_ready);
 }
 
-/*
- * Check if the current CPU has a given feature capability.
- * Should be called from non-preemptible context.
- */
-static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array,
-			       unsigned int cap)
-{
-	const struct arm64_cpu_capabilities *caps;
-
-	if (WARN_ON(preemptible()))
-		return false;
-
-	for (caps = cap_array; caps->desc; caps++)
-		if (caps->capability == cap && caps->matches)
-			return caps->matches(caps, SCOPE_LOCAL_CPU);
-
-	return false;
-}
-
 extern const struct arm64_cpu_capabilities arm64_errata[];
 
 bool this_cpu_has_cap(unsigned int cap)

From a22fde8e979494f326e88ef0728367e009260d75 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <Suzuki.Poulose@arm.com>
Date: Mon, 15 Jan 2018 10:51:16 +0000
Subject: [PATCH 076/104] perf: dsu: Use signed field for dsu_pmu->num_counters

We set dsu_pmu->num_counters to -1, when the DSU is allocated
but not initialised when none of the CPUs are active in the DSU.
However, we use an unsigned field for num_counters. Switch this
to a signed field.

Fixes: 7520fa99246d ("perf: ARM DynamIQ Shared Unit PMU support")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/perf/arm_dsu_pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c
index 37c0526c93d5..93c50e377507 100644
--- a/drivers/perf/arm_dsu_pmu.c
+++ b/drivers/perf/arm_dsu_pmu.c
@@ -120,7 +120,7 @@ struct dsu_pmu {
 	cpumask_t			associated_cpus;
 	cpumask_t			active_cpu;
 	struct hlist_node		cpuhp_node;
-	u8				num_counters;
+	s8				num_counters;
 	int				irq;
 	DECLARE_BITMAP(cpmceid_bitmap, DSU_PMU_MAX_COMMON_EVENTS);
 };

From 2f7aacf795e0192648b51674aef90e755e02408c Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Mon, 15 Jan 2018 10:41:53 +0000
Subject: [PATCH 077/104] firmware: arm_sdei: Fix return value check in
 sdei_present_dt()

In case of error, the function of_platform_device_create() returns
NULL pointer not ERR_PTR(). The IS_ERR() test in the return value
check should be replaced with NULL test.

Fixes: 677a60bd2003 ("firmware: arm_sdei: Discover SDEI support via ACPI")
Acked-by: James Morse <james.morse@arm.com>
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/firmware/arm_sdei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c
index 8f6563c595e5..1ea71640fdc2 100644
--- a/drivers/firmware/arm_sdei.c
+++ b/drivers/firmware/arm_sdei.c
@@ -1023,7 +1023,7 @@ static bool __init sdei_present_dt(void)
 
 	pdev = of_platform_device_create(np, sdei_driver.driver.name, NULL);
 	of_node_put(np);
-	if (IS_ERR(pdev))
+	if (!pdev)
 		return false;
 
 	return true;

From 071929dbdd865f779a89ba3f1e06ba8d17dd3743 Mon Sep 17 00:00:00 2001
From: Laura Abbott <labbott@redhat.com>
Date: Tue, 19 Dec 2017 11:28:10 -0800
Subject: [PATCH 078/104] arm64: Stop printing the virtual memory layout

Printing kernel addresses should be done in limited circumstances, mostly
for debugging purposes. Printing out the virtual memory layout at every
kernel bootup doesn't really fall into this category so delete the prints.
There are other ways to get the same information.

Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/mm/init.c | 43 -------------------------------------------
 1 file changed, 43 deletions(-)

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 5960bef0170d..672094ed7e07 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -599,49 +599,6 @@ void __init mem_init(void)
 
 	mem_init_print_info(NULL);
 
-#define MLK(b, t) b, t, ((t) - (b)) >> 10
-#define MLM(b, t) b, t, ((t) - (b)) >> 20
-#define MLG(b, t) b, t, ((t) - (b)) >> 30
-#define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K)
-
-	pr_notice("Virtual kernel memory layout:\n");
-#ifdef CONFIG_KASAN
-	pr_notice("    kasan   : 0x%16lx - 0x%16lx   (%6ld GB)\n",
-		MLG(KASAN_SHADOW_START, KASAN_SHADOW_END));
-#endif
-	pr_notice("    modules : 0x%16lx - 0x%16lx   (%6ld MB)\n",
-		MLM(MODULES_VADDR, MODULES_END));
-	pr_notice("    vmalloc : 0x%16lx - 0x%16lx   (%6ld GB)\n",
-		MLG(VMALLOC_START, VMALLOC_END));
-	pr_notice("      .text : 0x%p" " - 0x%p" "   (%6ld KB)\n",
-		MLK_ROUNDUP(_text, _etext));
-	pr_notice("    .rodata : 0x%p" " - 0x%p" "   (%6ld KB)\n",
-		MLK_ROUNDUP(__start_rodata, __init_begin));
-	pr_notice("      .init : 0x%p" " - 0x%p" "   (%6ld KB)\n",
-		MLK_ROUNDUP(__init_begin, __init_end));
-	pr_notice("      .data : 0x%p" " - 0x%p" "   (%6ld KB)\n",
-		MLK_ROUNDUP(_sdata, _edata));
-	pr_notice("       .bss : 0x%p" " - 0x%p" "   (%6ld KB)\n",
-		MLK_ROUNDUP(__bss_start, __bss_stop));
-	pr_notice("    fixed   : 0x%16lx - 0x%16lx   (%6ld KB)\n",
-		MLK(FIXADDR_START, FIXADDR_TOP));
-	pr_notice("    PCI I/O : 0x%16lx - 0x%16lx   (%6ld MB)\n",
-		MLM(PCI_IO_START, PCI_IO_END));
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-	pr_notice("    vmemmap : 0x%16lx - 0x%16lx   (%6ld GB maximum)\n",
-		MLG(VMEMMAP_START, VMEMMAP_START + VMEMMAP_SIZE));
-	pr_notice("              0x%16lx - 0x%16lx   (%6ld MB actual)\n",
-		MLM((unsigned long)phys_to_page(memblock_start_of_DRAM()),
-		    (unsigned long)virt_to_page(high_memory)));
-#endif
-	pr_notice("    memory  : 0x%16lx - 0x%16lx   (%6ld MB)\n",
-		MLM(__phys_to_virt(memblock_start_of_DRAM()),
-		    (unsigned long)high_memory));
-
-#undef MLK
-#undef MLM
-#undef MLK_ROUNDUP
-
 	/*
 	 * Check boundaries twice: Some fundamental inconsistencies can be
 	 * detected at build time already.

From 6a205420758af5625429a2e77a2654abbf59ac09 Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Mon, 15 Jan 2018 15:23:48 +0000
Subject: [PATCH 079/104] arm64: fix ID map extension to 52 bits

Commit fa2a8445b1d3 added support for extending the ID map to 52 bits,
but accidentally dropped a required change to __cpu_uses_extended_idmap.
As a result, the kernel fails to boot when VA_BITS = 48 and the ID map
text is in 52-bit physical memory, because we reduce TCR.T0SZ to cover
the ID map, but then never set it back to VA_BITS.

Add back the change, and also clean up some double parentheses.

Fixes: fa2a8445b1d3 ("arm64: allow ID map to be extended to 52 bits")
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/mmu_context.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 8f80fcb49252..5aee19905556 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -72,8 +72,7 @@ extern u64 idmap_ptrs_per_pgd;
 
 static inline bool __cpu_uses_extended_idmap(void)
 {
-	return (!IS_ENABLED(CONFIG_ARM64_VA_BITS_48) &&
-		unlikely(idmap_t0sz != TCR_T0SZ(VA_BITS)));
+	return unlikely(idmap_t0sz != TCR_T0SZ(VA_BITS));
 }
 
 /*
@@ -82,7 +81,7 @@ static inline bool __cpu_uses_extended_idmap(void)
  */
 static inline bool __cpu_uses_extended_idmap_level(void)
 {
-	return ARM64_HW_PGTABLE_LEVELS((64 - idmap_t0sz)) > CONFIG_PGTABLE_LEVELS;
+	return ARM64_HW_PGTABLE_LEVELS(64 - idmap_t0sz) > CONFIG_PGTABLE_LEVELS;
 }
 
 /*

From 98732d1b189b626a7593afd02dc3d2dc3f6c545a Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Mon, 15 Jan 2018 15:23:49 +0000
Subject: [PATCH 080/104] KVM: arm/arm64: fix HYP ID map extension to 52 bits

Commit fa2a8445b1d3 incorrectly masks the index of the HYP ID map pgd
entry, causing a non-VHE kernel to hang during boot. This happens when
VA_BITS=48 and the ID map text is in 52-bit physical memory. In this
case we don't need an extra table level but need more entries in the
top-level table, so we need to map into hyp_pgd and need to use
__kvm_idmap_ptrs_per_pgd to mask in the extra bits. However,
__create_hyp_mappings currently masks by PTRS_PER_PGD instead.

Fix it so that we always use __kvm_idmap_ptrs_per_pgd for the HYP ID
map. This ensures that we use the larger mask for the top-level ID map
table when it has more entries. In all other cases, PTRS_PER_PGD is used
as normal.

Fixes: fa2a8445b1d3 ("arm64: allow ID map to be extended to 52 bits")
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 virt/kvm/arm/mmu.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 761787befd3b..876caf531d32 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -623,21 +623,15 @@ static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
 	return 0;
 }
 
-static int __create_hyp_mappings(pgd_t *pgdp,
+static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
 				 unsigned long start, unsigned long end,
 				 unsigned long pfn, pgprot_t prot)
 {
 	pgd_t *pgd;
 	pud_t *pud;
-	unsigned long addr, next, ptrs_per_pgd = PTRS_PER_PGD;
+	unsigned long addr, next;
 	int err = 0;
 
-	/*
-	 * If it's not the hyp_pgd, fall back to the kvm idmap layout.
-	 */
-	if (pgdp != hyp_pgd)
-		ptrs_per_pgd = __kvm_idmap_ptrs_per_pgd();
-
 	mutex_lock(&kvm_hyp_pgd_mutex);
 	addr = start & PAGE_MASK;
 	end = PAGE_ALIGN(end);
@@ -705,8 +699,8 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
 		int err;
 
 		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
-		err = __create_hyp_mappings(hyp_pgd, virt_addr,
-					    virt_addr + PAGE_SIZE,
+		err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
+					    virt_addr, virt_addr + PAGE_SIZE,
 					    __phys_to_pfn(phys_addr),
 					    prot);
 		if (err)
@@ -737,7 +731,7 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
 	if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
 		return -EINVAL;
 
-	return __create_hyp_mappings(hyp_pgd, start, end,
+	return __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, start, end,
 				     __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
 }
 
@@ -1743,7 +1737,7 @@ static int kvm_map_idmap_text(pgd_t *pgd)
 	int err;
 
 	/* Create the idmap in the boot page tables */
-	err = 	__create_hyp_mappings(pgd,
+	err = 	__create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
 				      hyp_idmap_start, hyp_idmap_end,
 				      __phys_to_pfn(hyp_idmap_start),
 				      PAGE_HYP_EXEC);

From 39610a68d9a9a9c6e71be731d071c405e4f2434b Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Mon, 15 Jan 2018 15:23:50 +0000
Subject: [PATCH 081/104] arm64: fix comment above tcr_compute_pa_size

The 'pos' argument is used to select where in TCR to write the value:
the IPS or PS bitfield.

Fixes: 787fd1d019b2 ("arm64: limit PA size to supported range")
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/assembler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 5dc4856f3bb9..1645e0d3a2c1 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -360,7 +360,7 @@ alternative_endif
  * ID_AA64MMFR0_EL1.PARange value
  *
  *	tcr:		register with the TCR_ELx value to be updated
- *	pos:		PARange bitfield position
+ *	pos:		IPS or PS bitfield position
  *	tmp{0,1}:	temporary registers
  */
 	.macro	tcr_compute_pa_size, tcr, pos, tmp0, tmp1

From 894cfd149289c8c9c99211b58a2e3ae7027fff7e Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Wed, 29 Nov 2017 15:39:49 -0800
Subject: [PATCH 082/104] arm64: Inform user if software PAN is in use

It isn't entirely obvious if we're using software PAN because we
don't say anything about it in the boot log. But if we're using
hardware PAN we'll print a nice CPU feature message indicating
it. Add a print for software PAN too so we know if it's being
used or not.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/cpufeature.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9f4491d16cfb..a11311397430 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1276,6 +1276,9 @@ static void verify_local_cpu_capabilities(void)
 
 	if (system_supports_sve())
 		verify_sve_features();
+
+	if (system_uses_ttbr0_pan())
+		pr_info("Emulating Privileged Access Never (PAN) using TTBR0_EL1 switching\n");
 }
 
 void check_local_cpu_capabilities(void)

From 29d9bef13085e8477f4c9e21aca342edd50847d0 Mon Sep 17 00:00:00 2001
From: Punit Agrawal <punit.agrawal@arm.com>
Date: Wed, 10 Jan 2018 19:07:26 +0000
Subject: [PATCH 083/104] arm64: Correct type for PUD macros

The PUD macros (PUD_TABLE_BIT, PUD_TYPE_MASK, PUD_TYPE_SECT) use the
pgdval_t even when pudval_t is available. Even though the underlying
type for both (u64) is the same it is confusing and may lead to issues
in the future.

Fix this by using pudval_t to define the PUD_* macros.

Fixes: 084bd29810a56 ("ARM64: mm: HugeTLB support.")
Fixes: 206a2a73a62d3 ("arm64: mm: Create gigabyte kernel logical mappings where possible")
Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/pgtable-hwdef.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index e1f6679d763e..f42836da8723 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -118,9 +118,9 @@
  * Level 1 descriptor (PUD).
  */
 #define PUD_TYPE_TABLE		(_AT(pudval_t, 3) << 0)
-#define PUD_TABLE_BIT		(_AT(pgdval_t, 1) << 1)
-#define PUD_TYPE_MASK		(_AT(pgdval_t, 3) << 0)
-#define PUD_TYPE_SECT		(_AT(pgdval_t, 1) << 0)
+#define PUD_TABLE_BIT		(_AT(pudval_t, 1) << 1)
+#define PUD_TYPE_MASK		(_AT(pudval_t, 3) << 0)
+#define PUD_TYPE_SECT		(_AT(pudval_t, 1) << 0)
 
 /*
  * Level 2 descriptor (PMD).

From 0abdeff598a66e2bf9bfcb016eb159b11fc2887a Mon Sep 17 00:00:00 2001
From: Dave Martin <Dave.Martin@arm.com>
Date: Fri, 15 Dec 2017 18:34:38 +0000
Subject: [PATCH 084/104] arm64: fpsimd: Fix state leakage when migrating after
 sigreturn

When refactoring the sigreturn code to handle SVE, I changed the
sigreturn implementation to store the new FPSIMD state from the
user sigframe into task_struct before reloading the state into the
CPU regs.  This makes it easier to convert the data for SVE when
needed.

However, it turns out that the fpsimd_state structure passed into
fpsimd_update_current_state is not fully initialised, so assigning
the structure as a whole corrupts current->thread.fpsimd_state.cpu
with uninitialised data.

This means that if the garbage data written to .cpu happens to be a
valid cpu number, and the task is subsequently migrated to the cpu
identified by the that number, and then tries to enter userspace,
the CPU FPSIMD regs will be assumed to be correct for the task and
not reloaded as they should be.  This can result in returning to
userspace with the FPSIMD registers containing data that is stale or
that belongs to another task or to the kernel.

Knowingly handing around a kernel structure that is incompletely
initialised with user data is a potential source of mistakes,
especially across source file boundaries.  To help avoid a repeat
of this issue, this patch adapts the relevant internal API to hand
around the user-accessible subset only: struct user_fpsimd_state.

To avoid future surprises, this patch also converts all uses of
struct fpsimd_state that really only access the user subset, to use
struct user_fpsimd_state.  A few missing consts are added to
function prototypes for good measure.

Thanks to Will for spotting the cause of the bug here.

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/fpsimd.h | 2 +-
 arch/arm64/kernel/fpsimd.c      | 4 ++--
 arch/arm64/kernel/signal.c      | 7 ++++---
 arch/arm64/kernel/signal32.c    | 5 +++--
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index 74f34392a531..8857a0f0d0f7 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -71,7 +71,7 @@ extern void fpsimd_flush_thread(void);
 extern void fpsimd_signal_preserve_current_state(void);
 extern void fpsimd_preserve_current_state(void);
 extern void fpsimd_restore_current_state(void);
-extern void fpsimd_update_current_state(struct fpsimd_state *state);
+extern void fpsimd_update_current_state(struct user_fpsimd_state const *state);
 
 extern void fpsimd_flush_task_state(struct task_struct *target);
 extern void sve_flush_cpu_state(void);
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 540a1e010eb5..55fb544072f6 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -1036,14 +1036,14 @@ void fpsimd_restore_current_state(void)
  * flag that indicates that the FPSIMD register contents are the most recent
  * FPSIMD state of 'current'
  */
-void fpsimd_update_current_state(struct fpsimd_state *state)
+void fpsimd_update_current_state(struct user_fpsimd_state const *state)
 {
 	if (!system_supports_fpsimd())
 		return;
 
 	local_bh_disable();
 
-	current->thread.fpsimd_state = *state;
+	current->thread.fpsimd_state.user_fpsimd = *state;
 	if (system_supports_sve() && test_thread_flag(TIF_SVE))
 		fpsimd_to_sve(current);
 
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index b120111a46be..f60c052e8d1c 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -178,7 +178,8 @@ static void __user *apply_user_offset(
 
 static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
 {
-	struct fpsimd_state *fpsimd = &current->thread.fpsimd_state;
+	struct user_fpsimd_state const *fpsimd =
+		&current->thread.fpsimd_state.user_fpsimd;
 	int err;
 
 	/* copy the FP and status/control registers */
@@ -195,7 +196,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
 
 static int restore_fpsimd_context(struct fpsimd_context __user *ctx)
 {
-	struct fpsimd_state fpsimd;
+	struct user_fpsimd_state fpsimd;
 	__u32 magic, size;
 	int err = 0;
 
@@ -266,7 +267,7 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user)
 {
 	int err;
 	unsigned int vq;
-	struct fpsimd_state fpsimd;
+	struct user_fpsimd_state fpsimd;
 	struct sve_context sve;
 
 	if (__copy_from_user(&sve, user->sve, sizeof(sve)))
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index 22711ee8e36c..a124140c0926 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -228,7 +228,8 @@ union __fpsimd_vreg {
 
 static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
 {
-	struct fpsimd_state *fpsimd = &current->thread.fpsimd_state;
+	struct user_fpsimd_state const *fpsimd =
+		&current->thread.fpsimd_state.user_fpsimd;
 	compat_ulong_t magic = VFP_MAGIC;
 	compat_ulong_t size = VFP_STORAGE_SIZE;
 	compat_ulong_t fpscr, fpexc;
@@ -277,7 +278,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
 
 static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
 {
-	struct fpsimd_state fpsimd;
+	struct user_fpsimd_state fpsimd;
 	compat_ulong_t magic = VFP_MAGIC;
 	compat_ulong_t size = VFP_STORAGE_SIZE;
 	compat_ulong_t fpscr;

From edf298cfce47ab7279d03b5203ae2ef3a58e49db Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 15 Jan 2018 19:38:54 +0000
Subject: [PATCH 085/104] arm64: cpufeature: __this_cpu_has_cap() shouldn't
 stop early

this_cpu_has_cap() tests caps->desc not caps->matches, so it stops
walking the list when it finds a 'silent' feature, instead of
walking to the end of the list.

Prior to v4.6's 644c2ae198412 ("arm64: cpufeature: Test 'matches' pointer
to find the end of the list") we always tested desc to find the end of
a capability list. This was changed for dubious things like PAN_NOT_UAO.
v4.7's e3661b128e53e ("arm64: Allow a capability to be checked on
single CPU") added this_cpu_has_cap() using the old desc style test.

CC: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/cpufeature.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index a11311397430..630a40ec1332 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1149,9 +1149,8 @@ static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array,
 	if (WARN_ON(preemptible()))
 		return false;
 
-	for (caps = cap_array; caps->desc; caps++)
+	for (caps = cap_array; caps->matches; caps++)
 		if (caps->capability == cap &&
-		    caps->matches &&
 		    caps->matches(caps, SCOPE_LOCAL_CPU))
 			return true;
 	return false;

From 7a00d68ebe5f07cb1db17e7fedfd031f0d87e8bb Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 15 Jan 2018 19:38:55 +0000
Subject: [PATCH 086/104] arm64: sysreg: Move to use definitions for all the
 SCTLR bits

__cpu_setup() configures SCTLR_EL1 using some hard coded hex masks,
and el2_setup() duplicates some this when setting RES1 bits.

Lets make this the same as KVM's hyp_init, which uses named bits.

First, we add definitions for all the SCTLR_EL{1,2} bits, the RES{1,0}
bits, and those we want to set or clear.

Add a build_bug checks to ensures all bits are either set or clear.
This means we don't need to preserve endian-ness configuration
generated elsewhere.

Finally, move the head.S and proc.S users of these hard-coded masks
over to the macro versions.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/sysreg.h | 69 +++++++++++++++++++++++++++++++--
 arch/arm64/kernel/head.S        | 13 ++-----
 arch/arm64/mm/proc.S            | 24 +-----------
 3 files changed, 69 insertions(+), 37 deletions(-)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 54e99af043c6..1a8108f84932 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -20,6 +20,7 @@
 #ifndef __ASM_SYSREG_H
 #define __ASM_SYSREG_H
 
+#include <asm/compiler.h>
 #include <linux/stringify.h>
 
 /*
@@ -398,25 +399,81 @@
 
 /* Common SCTLR_ELx flags. */
 #define SCTLR_ELx_EE    (1 << 25)
+#define SCTLR_ELx_WXN	(1 << 19)
 #define SCTLR_ELx_I	(1 << 12)
 #define SCTLR_ELx_SA	(1 << 3)
 #define SCTLR_ELx_C	(1 << 2)
 #define SCTLR_ELx_A	(1 << 1)
 #define SCTLR_ELx_M	1
 
-#define SCTLR_EL2_RES1	((1 << 4)  | (1 << 5)  | (1 << 11) | (1 << 16) | \
-			 (1 << 18) | (1 << 22) | (1 << 23) | (1 << 28) | \
-			 (1 << 29))
-
 #define SCTLR_ELx_FLAGS	(SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \
 			 SCTLR_ELx_SA | SCTLR_ELx_I)
 
+/* SCTLR_EL2 specific flags. */
+#define SCTLR_EL2_RES1	((1 << 4)  | (1 << 5)  | (1 << 11) | (1 << 16) | \
+			 (1 << 18) | (1 << 22) | (1 << 23) | (1 << 28) | \
+			 (1 << 29))
+#define SCTLR_EL2_RES0	((1 << 6)  | (1 << 7)  | (1 << 8)  | (1 << 9)  | \
+			 (1 << 10) | (1 << 13) | (1 << 14) | (1 << 15) | \
+			 (1 << 17) | (1 << 20) | (1 << 21) | (1 << 24) | \
+			 (1 << 26) | (1 << 27) | (1 << 30) | (1 << 31))
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+#define ENDIAN_SET_EL2		SCTLR_ELx_EE
+#define ENDIAN_CLEAR_EL2	0
+#else
+#define ENDIAN_SET_EL2		0
+#define ENDIAN_CLEAR_EL2	SCTLR_ELx_EE
+#endif
+
+/* SCTLR_EL2 value used for the hyp-stub */
+#define SCTLR_EL2_SET	(ENDIAN_SET_EL2   | SCTLR_EL2_RES1)
+#define SCTLR_EL2_CLEAR	(SCTLR_ELx_M      | SCTLR_ELx_A    | SCTLR_ELx_C   | \
+			 SCTLR_ELx_SA     | SCTLR_ELx_I    | SCTLR_ELx_WXN | \
+			 ENDIAN_CLEAR_EL2 | SCTLR_EL2_RES0)
+
+/* Check all the bits are accounted for */
+#define SCTLR_EL2_BUILD_BUG_ON_MISSING_BITS	BUILD_BUG_ON((SCTLR_EL2_SET ^ SCTLR_EL2_CLEAR) != ~0)
+
+
 /* SCTLR_EL1 specific flags. */
 #define SCTLR_EL1_UCI		(1 << 26)
+#define SCTLR_EL1_E0E		(1 << 24)
 #define SCTLR_EL1_SPAN		(1 << 23)
+#define SCTLR_EL1_NTWE		(1 << 18)
+#define SCTLR_EL1_NTWI		(1 << 16)
 #define SCTLR_EL1_UCT		(1 << 15)
+#define SCTLR_EL1_DZE		(1 << 14)
+#define SCTLR_EL1_UMA		(1 << 9)
 #define SCTLR_EL1_SED		(1 << 8)
+#define SCTLR_EL1_ITD		(1 << 7)
 #define SCTLR_EL1_CP15BEN	(1 << 5)
+#define SCTLR_EL1_SA0		(1 << 4)
+
+#define SCTLR_EL1_RES1	((1 << 11) | (1 << 20) | (1 << 22) | (1 << 28) | \
+			 (1 << 29))
+#define SCTLR_EL1_RES0  ((1 << 6)  | (1 << 10) | (1 << 13) | (1 << 17) | \
+			 (1 << 21) | (1 << 27) | (1 << 30) | (1 << 31))
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+#define ENDIAN_SET_EL1		(SCTLR_EL1_E0E | SCTLR_ELx_EE)
+#define ENDIAN_CLEAR_EL1	0
+#else
+#define ENDIAN_SET_EL1		0
+#define ENDIAN_CLEAR_EL1	(SCTLR_EL1_E0E | SCTLR_ELx_EE)
+#endif
+
+#define SCTLR_EL1_SET	(SCTLR_ELx_M    | SCTLR_ELx_C    | SCTLR_ELx_SA   |\
+			 SCTLR_EL1_SA0  | SCTLR_EL1_SED  | SCTLR_ELx_I    |\
+			 SCTLR_EL1_DZE  | SCTLR_EL1_UCT  | SCTLR_EL1_NTWI |\
+			 SCTLR_EL1_NTWE | SCTLR_EL1_SPAN | ENDIAN_SET_EL1 |\
+			 SCTLR_EL1_UCI  | SCTLR_EL1_RES1)
+#define SCTLR_EL1_CLEAR	(SCTLR_ELx_A   | SCTLR_EL1_CP15BEN | SCTLR_EL1_ITD    |\
+			 SCTLR_EL1_UMA | SCTLR_ELx_WXN     | ENDIAN_CLEAR_EL1 |\
+			 SCTLR_EL1_RES0)
+
+/* Check all the bits are accounted for */
+#define SCTLR_EL1_BUILD_BUG_ON_MISSING_BITS	BUILD_BUG_ON((SCTLR_EL1_SET ^ SCTLR_EL1_CLEAR) != ~0)
 
 /* id_aa64isar0 */
 #define ID_AA64ISAR0_FHM_SHIFT		48
@@ -593,6 +650,7 @@
 
 #else
 
+#include <linux/build_bug.h>
 #include <linux/types.h>
 
 asm(
@@ -649,6 +707,9 @@ static inline void config_sctlr_el1(u32 clear, u32 set)
 {
 	u32 val;
 
+	SCTLR_EL2_BUILD_BUG_ON_MISSING_BITS;
+	SCTLR_EL1_BUILD_BUG_ON_MISSING_BITS;
+
 	val = read_sysreg(sctlr_el1);
 	val &= ~clear;
 	val |= set;
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 95748a00eb89..c3b241b8b659 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -492,17 +492,13 @@ ENTRY(el2_setup)
 	mrs	x0, CurrentEL
 	cmp	x0, #CurrentEL_EL2
 	b.eq	1f
-	mrs	x0, sctlr_el1
-CPU_BE(	orr	x0, x0, #(3 << 24)	)	// Set the EE and E0E bits for EL1
-CPU_LE(	bic	x0, x0, #(3 << 24)	)	// Clear the EE and E0E bits for EL1
+	mov_q	x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1)
 	msr	sctlr_el1, x0
 	mov	w0, #BOOT_CPU_MODE_EL1		// This cpu booted in EL1
 	isb
 	ret
 
-1:	mrs	x0, sctlr_el2
-CPU_BE(	orr	x0, x0, #(1 << 25)	)	// Set the EE bit for EL2
-CPU_LE(	bic	x0, x0, #(1 << 25)	)	// Clear the EE bit for EL2
+1:	mov_q	x0, (SCTLR_EL2_RES1 | ENDIAN_SET_EL2)
 	msr	sctlr_el2, x0
 
 #ifdef CONFIG_ARM64_VHE
@@ -618,10 +614,7 @@ install_el2_stub:
 	 * requires no configuration, and all non-hyp-specific EL2 setup
 	 * will be done via the _EL1 system register aliases in __cpu_setup.
 	 */
-	/* sctlr_el1 */
-	mov	x0, #0x0800			// Set/clear RES{1,0} bits
-CPU_BE(	movk	x0, #0x33d0, lsl #16	)	// Set EE and E0E on BE systems
-CPU_LE(	movk	x0, #0x30d0, lsl #16	)	// Clear EE and E0E on LE systems
+	mov_q	x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1)
 	msr	sctlr_el1, x0
 
 	/* Coprocessor traps. */
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 5a59eea49395..ec4c3d82b4d6 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -226,11 +226,7 @@ ENTRY(__cpu_setup)
 	/*
 	 * Prepare SCTLR
 	 */
-	adr	x5, crval
-	ldp	w5, w6, [x5]
-	mrs	x0, sctlr_el1
-	bic	x0, x0, x5			// clear bits
-	orr	x0, x0, x6			// set bits
+	mov_q	x0, SCTLR_EL1_SET
 	/*
 	 * Set/prepare TCR and TTBR. We use 512GB (39-bit) address range for
 	 * both user and kernel.
@@ -259,21 +255,3 @@ ENTRY(__cpu_setup)
 	msr	tcr_el1, x10
 	ret					// return to head.S
 ENDPROC(__cpu_setup)
-
-	/*
-	 * We set the desired value explicitly, including those of the
-	 * reserved bits. The values of bits EE & E0E were set early in
-	 * el2_setup, which are left untouched below.
-	 *
-	 *                 n n            T
-	 *       U E      WT T UD     US IHBS
-	 *       CE0      XWHW CZ     ME TEEA S
-	 * .... .IEE .... NEAI TE.I ..AD DEN0 ACAM
-	 * 0011 0... 1101 ..0. ..0. 10.. .0.. .... < hardware reserved
-	 * .... .1.. .... 01.1 11.1 ..01 0.01 1101 < software settings
-	 */
-	.type	crval, #object
-crval:
-	.word	0xfcffffff			// clear
-	.word	0x34d5d91d			// set
-	.popsection

From 64c02720ea3598bf5143b672274d923a941b8053 Mon Sep 17 00:00:00 2001
From: Xie XiuQi <xiexiuqi@huawei.com>
Date: Mon, 15 Jan 2018 19:38:56 +0000
Subject: [PATCH 087/104] arm64: cpufeature: Detect CPU RAS Extentions

ARM's v8.2 Extentions add support for Reliability, Availability and
Serviceability (RAS). On CPUs with these extensions system software
can use additional barriers to isolate errors and determine if faults
are pending. Add cpufeature detection.

Platform level RAS support may require additional firmware support.

Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
[Rebased added config option, reworded commit message]
Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/Kconfig               | 16 ++++++++++++++++
 arch/arm64/include/asm/cpucaps.h |  3 ++-
 arch/arm64/include/asm/sysreg.h  |  2 ++
 arch/arm64/kernel/cpufeature.c   | 13 +++++++++++++
 4 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 664fadc2aa2e..1d51c8edf34b 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1062,6 +1062,22 @@ config ARM64_PMEM
 	  operations if DC CVAP is not supported (following the behaviour of
 	  DC CVAP itself if the system does not define a point of persistence).
 
+config ARM64_RAS_EXTN
+	bool "Enable support for RAS CPU Extensions"
+	default y
+	help
+	  CPUs that support the Reliability, Availability and Serviceability
+	  (RAS) Extensions, part of ARMv8.2 are able to track faults and
+	  errors, classify them and report them to software.
+
+	  On CPUs with these extensions system software can use additional
+	  barriers to determine if faults are pending and read the
+	  classification from a new set of registers.
+
+	  Selecting this feature will allow the kernel to use these barriers
+	  and access the new registers if the system supports the extension.
+	  Platform RAS features may additionally depend on firmware support.
+
 endmenu
 
 config ARM64_SVE
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 7049b4802587..bb263820de13 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -44,7 +44,8 @@
 #define ARM64_UNMAP_KERNEL_AT_EL0		23
 #define ARM64_HARDEN_BRANCH_PREDICTOR		24
 #define ARM64_HARDEN_BP_POST_GUEST_EXIT		25
+#define ARM64_HAS_RAS_EXTN			26
 
-#define ARM64_NCAPS				26
+#define ARM64_NCAPS				27
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 1a8108f84932..321622e9f9c3 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -498,6 +498,7 @@
 #define ID_AA64PFR0_CSV3_SHIFT		60
 #define ID_AA64PFR0_CSV2_SHIFT		56
 #define ID_AA64PFR0_SVE_SHIFT		32
+#define ID_AA64PFR0_RAS_SHIFT		28
 #define ID_AA64PFR0_GIC_SHIFT		24
 #define ID_AA64PFR0_ASIMD_SHIFT		20
 #define ID_AA64PFR0_FP_SHIFT		16
@@ -507,6 +508,7 @@
 #define ID_AA64PFR0_EL0_SHIFT		0
 
 #define ID_AA64PFR0_SVE			0x1
+#define ID_AA64PFR0_RAS_V1		0x1
 #define ID_AA64PFR0_FP_NI		0xf
 #define ID_AA64PFR0_FP_SUPPORTED	0x0
 #define ID_AA64PFR0_ASIMD_NI		0xf
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 630a40ec1332..116a5c9443a7 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -149,6 +149,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_RAS_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0),
 	S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI),
 	S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI),
@@ -1028,6 +1029,18 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.enable = sve_kernel_enable,
 	},
 #endif /* CONFIG_ARM64_SVE */
+#ifdef CONFIG_ARM64_RAS_EXTN
+	{
+		.desc = "RAS Extension Support",
+		.capability = ARM64_HAS_RAS_EXTN,
+		.def_scope = SCOPE_SYSTEM,
+		.matches = has_cpuid_feature,
+		.sys_reg = SYS_ID_AA64PFR0_EL1,
+		.sign = FTR_UNSIGNED,
+		.field_pos = ID_AA64PFR0_RAS_SHIFT,
+		.min_field_value = ID_AA64PFR0_RAS_V1,
+	},
+#endif /* CONFIG_ARM64_RAS_EXTN */
 	{},
 };
 

From 6bf0dcfd713563bd2e13ceb53217305c28a8aa5f Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 15 Jan 2018 19:38:57 +0000
Subject: [PATCH 088/104] arm64: kernel: Survive corrected RAS errors notified
 by SError

Prior to v8.2, SError is an uncontainable fatal exception. The v8.2 RAS
extensions use SError to notify software about RAS errors, these can be
contained by the Error Syncronization Barrier.

An ACPI system with firmware-first may use SError as its 'SEI'
notification. Future patches may add code to 'claim' this SError as a
notification.

Other systems can distinguish these RAS errors from the SError ESR and
use the AET bits and additional data from RAS-Error registers to handle
the error. Future patches may add this kernel-first handling.

Without support for either of these we will panic(), even if we received
a corrected error. Add code to decode the severity of RAS errors. We can
safely ignore contained errors where the CPU can continue to make
progress. For all other errors we continue to panic().

Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/esr.h   | 13 ++++++++
 arch/arm64/include/asm/traps.h | 54 ++++++++++++++++++++++++++++++++++
 arch/arm64/kernel/traps.c      | 51 ++++++++++++++++++++++++++++----
 3 files changed, 113 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 014d7d8edcf9..c367838700fa 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -86,6 +86,18 @@
 #define ESR_ELx_WNR_SHIFT	(6)
 #define ESR_ELx_WNR		(UL(1) << ESR_ELx_WNR_SHIFT)
 
+/* Asynchronous Error Type */
+#define ESR_ELx_IDS_SHIFT	(24)
+#define ESR_ELx_IDS		(UL(1) << ESR_ELx_IDS_SHIFT)
+#define ESR_ELx_AET_SHIFT	(10)
+#define ESR_ELx_AET		(UL(0x7) << ESR_ELx_AET_SHIFT)
+
+#define ESR_ELx_AET_UC		(UL(0) << ESR_ELx_AET_SHIFT)
+#define ESR_ELx_AET_UEU		(UL(1) << ESR_ELx_AET_SHIFT)
+#define ESR_ELx_AET_UEO		(UL(2) << ESR_ELx_AET_SHIFT)
+#define ESR_ELx_AET_UER		(UL(3) << ESR_ELx_AET_SHIFT)
+#define ESR_ELx_AET_CE		(UL(6) << ESR_ELx_AET_SHIFT)
+
 /* Shared ISS field definitions for Data/Instruction aborts */
 #define ESR_ELx_SET_SHIFT	(11)
 #define ESR_ELx_SET_MASK	(UL(3) << ESR_ELx_SET_SHIFT)
@@ -100,6 +112,7 @@
 #define ESR_ELx_FSC		(0x3F)
 #define ESR_ELx_FSC_TYPE	(0x3C)
 #define ESR_ELx_FSC_EXTABT	(0x10)
+#define ESR_ELx_FSC_SERROR	(0x11)
 #define ESR_ELx_FSC_ACCESS	(0x08)
 #define ESR_ELx_FSC_FAULT	(0x04)
 #define ESR_ELx_FSC_PERM	(0x0C)
diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h
index 1696f9de9359..178e338d2889 100644
--- a/arch/arm64/include/asm/traps.h
+++ b/arch/arm64/include/asm/traps.h
@@ -19,6 +19,7 @@
 #define __ASM_TRAP_H
 
 #include <linux/list.h>
+#include <asm/esr.h>
 #include <asm/sections.h>
 
 struct pt_regs;
@@ -66,4 +67,57 @@ static inline int in_entry_text(unsigned long ptr)
 	return ptr >= (unsigned long)&__entry_text_start &&
 	       ptr < (unsigned long)&__entry_text_end;
 }
+
+/*
+ * CPUs with the RAS extensions have an Implementation-Defined-Syndrome bit
+ * to indicate whether this ESR has a RAS encoding. CPUs without this feature
+ * have a ISS-Valid bit in the same position.
+ * If this bit is set, we know its not a RAS SError.
+ * If its clear, we need to know if the CPU supports RAS. Uncategorized RAS
+ * errors share the same encoding as an all-zeros encoding from a CPU that
+ * doesn't support RAS.
+ */
+static inline bool arm64_is_ras_serror(u32 esr)
+{
+	WARN_ON(preemptible());
+
+	if (esr & ESR_ELx_IDS)
+		return false;
+
+	if (this_cpu_has_cap(ARM64_HAS_RAS_EXTN))
+		return true;
+	else
+		return false;
+}
+
+/*
+ * Return the AET bits from a RAS SError's ESR.
+ *
+ * It is implementation defined whether Uncategorized errors are containable.
+ * We treat them as Uncontainable.
+ * Non-RAS SError's are reported as Uncontained/Uncategorized.
+ */
+static inline u32 arm64_ras_serror_get_severity(u32 esr)
+{
+	u32 aet = esr & ESR_ELx_AET;
+
+	if (!arm64_is_ras_serror(esr)) {
+		/* Not a RAS error, we can't interpret the ESR. */
+		return ESR_ELx_AET_UC;
+	}
+
+	/*
+	 * AET is RES0 if 'the value returned in the DFSC field is not
+	 * [ESR_ELx_FSC_SERROR]'
+	 */
+	if ((esr & ESR_ELx_FSC) != ESR_ELx_FSC_SERROR) {
+		/* No severity information : Uncategorized */
+		return ESR_ELx_AET_UC;
+	}
+
+	return aet;
+}
+
+bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr);
+void __noreturn arm64_serror_panic(struct pt_regs *regs, u32 esr);
 #endif
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 3d3588fcd1c7..bbb0fde2780e 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -662,17 +662,58 @@ asmlinkage void handle_bad_stack(struct pt_regs *regs)
 }
 #endif
 
-asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr)
+void __noreturn arm64_serror_panic(struct pt_regs *regs, u32 esr)
 {
-	nmi_enter();
-
 	console_verbose();
 
 	pr_crit("SError Interrupt on CPU%d, code 0x%08x -- %s\n",
 		smp_processor_id(), esr, esr_get_class_string(esr));
-	__show_regs(regs);
+	if (regs)
+		__show_regs(regs);
 
-	panic("Asynchronous SError Interrupt");
+	nmi_panic(regs, "Asynchronous SError Interrupt");
+
+	cpu_park_loop();
+	unreachable();
+}
+
+bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr)
+{
+	u32 aet = arm64_ras_serror_get_severity(esr);
+
+	switch (aet) {
+	case ESR_ELx_AET_CE:	/* corrected error */
+	case ESR_ELx_AET_UEO:	/* restartable, not yet consumed */
+		/*
+		 * The CPU can make progress. We may take UEO again as
+		 * a more severe error.
+		 */
+		return false;
+
+	case ESR_ELx_AET_UEU:	/* Uncorrected Unrecoverable */
+	case ESR_ELx_AET_UER:	/* Uncorrected Recoverable */
+		/*
+		 * The CPU can't make progress. The exception may have
+		 * been imprecise.
+		 */
+		return true;
+
+	case ESR_ELx_AET_UC:	/* Uncontainable or Uncategorized error */
+	default:
+		/* Error has been silently propagated */
+		arm64_serror_panic(regs, esr);
+	}
+}
+
+asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr)
+{
+	nmi_enter();
+
+	/* non-RAS errors are not containable */
+	if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr))
+		arm64_serror_panic(regs, esr);
+
+	nmi_exit();
 }
 
 void __pte_error(const char *file, int line, unsigned long val)

From f751daa4f9d3da07e2777ea0c1ba2d58ff2c860f Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 15 Jan 2018 19:38:58 +0000
Subject: [PATCH 089/104] arm64: Unconditionally enable IESB on exception
 entry/return for firmware-first

ARM v8.2 has a feature to add implicit error synchronization barriers
whenever the CPU enters or returns from an exception level. Add this to the
features we always enable. CPUs that don't support this feature will treat
the bit as RES0.

This feature causes RAS errors that are not yet visible to software to
become pending SErrors. We expect to have firmware-first RAS support
so synchronised RAS errors will be take immediately to EL3.
Any system without firmware-first handling of errors will take the SError
either immediatly after exception return, or when we unmask SError after
entry.S's work.

Adding IESB to the ELx flags causes it to be enabled by KVM and kexec
too.

Platform level RAS support may require additional firmware support.

Cc: Christoffer Dall <christoffer.dall@linaro.org>
Suggested-by: Will Deacon <will.deacon@arm.com>
Link: https://www.spinics.net/lists/kvm-arm/msg28192.html
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/sysreg.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 321622e9f9c3..1281bc8263c2 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -399,6 +399,7 @@
 
 /* Common SCTLR_ELx flags. */
 #define SCTLR_ELx_EE    (1 << 25)
+#define SCTLR_ELx_IESB	(1 << 21)
 #define SCTLR_ELx_WXN	(1 << 19)
 #define SCTLR_ELx_I	(1 << 12)
 #define SCTLR_ELx_SA	(1 << 3)
@@ -406,8 +407,8 @@
 #define SCTLR_ELx_A	(1 << 1)
 #define SCTLR_ELx_M	1
 
-#define SCTLR_ELx_FLAGS	(SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \
-			 SCTLR_ELx_SA | SCTLR_ELx_I)
+#define SCTLR_ELx_FLAGS	(SCTLR_ELx_M  | SCTLR_ELx_A | SCTLR_ELx_C | \
+			 SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB)
 
 /* SCTLR_EL2 specific flags. */
 #define SCTLR_EL2_RES1	((1 << 4)  | (1 << 5)  | (1 << 11) | (1 << 16) | \
@@ -415,8 +416,8 @@
 			 (1 << 29))
 #define SCTLR_EL2_RES0	((1 << 6)  | (1 << 7)  | (1 << 8)  | (1 << 9)  | \
 			 (1 << 10) | (1 << 13) | (1 << 14) | (1 << 15) | \
-			 (1 << 17) | (1 << 20) | (1 << 21) | (1 << 24) | \
-			 (1 << 26) | (1 << 27) | (1 << 30) | (1 << 31))
+			 (1 << 17) | (1 << 20) | (1 << 24) | (1 << 26) | \
+			 (1 << 27) | (1 << 30) | (1 << 31))
 
 #ifdef CONFIG_CPU_BIG_ENDIAN
 #define ENDIAN_SET_EL2		SCTLR_ELx_EE
@@ -427,7 +428,7 @@
 #endif
 
 /* SCTLR_EL2 value used for the hyp-stub */
-#define SCTLR_EL2_SET	(ENDIAN_SET_EL2   | SCTLR_EL2_RES1)
+#define SCTLR_EL2_SET	(SCTLR_ELx_IESB   | ENDIAN_SET_EL2   | SCTLR_EL2_RES1)
 #define SCTLR_EL2_CLEAR	(SCTLR_ELx_M      | SCTLR_ELx_A    | SCTLR_ELx_C   | \
 			 SCTLR_ELx_SA     | SCTLR_ELx_I    | SCTLR_ELx_WXN | \
 			 ENDIAN_CLEAR_EL2 | SCTLR_EL2_RES0)
@@ -453,7 +454,7 @@
 #define SCTLR_EL1_RES1	((1 << 11) | (1 << 20) | (1 << 22) | (1 << 28) | \
 			 (1 << 29))
 #define SCTLR_EL1_RES0  ((1 << 6)  | (1 << 10) | (1 << 13) | (1 << 17) | \
-			 (1 << 21) | (1 << 27) | (1 << 30) | (1 << 31))
+			 (1 << 27) | (1 << 30) | (1 << 31))
 
 #ifdef CONFIG_CPU_BIG_ENDIAN
 #define ENDIAN_SET_EL1		(SCTLR_EL1_E0E | SCTLR_ELx_EE)
@@ -466,8 +467,8 @@
 #define SCTLR_EL1_SET	(SCTLR_ELx_M    | SCTLR_ELx_C    | SCTLR_ELx_SA   |\
 			 SCTLR_EL1_SA0  | SCTLR_EL1_SED  | SCTLR_ELx_I    |\
 			 SCTLR_EL1_DZE  | SCTLR_EL1_UCT  | SCTLR_EL1_NTWI |\
-			 SCTLR_EL1_NTWE | SCTLR_EL1_SPAN | ENDIAN_SET_EL1 |\
-			 SCTLR_EL1_UCI  | SCTLR_EL1_RES1)
+			 SCTLR_EL1_NTWE | SCTLR_ELx_IESB | SCTLR_EL1_SPAN |\
+			 ENDIAN_SET_EL1 | SCTLR_EL1_UCI  | SCTLR_EL1_RES1)
 #define SCTLR_EL1_CLEAR	(SCTLR_ELx_A   | SCTLR_EL1_CP15BEN | SCTLR_EL1_ITD    |\
 			 SCTLR_EL1_UMA | SCTLR_ELx_WXN     | ENDIAN_CLEAR_EL1 |\
 			 SCTLR_EL1_RES0)

From 68ddbf09ec5a888ec850edd7e7438d2daf069c56 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 15 Jan 2018 19:38:59 +0000
Subject: [PATCH 090/104] arm64: kernel: Prepare for a DISR user

KVM would like to consume any pending SError (or RAS error) after guest
exit. Today it has to unmask SError and use dsb+isb to synchronise the
CPU. With the RAS extensions we can use ESB to synchronise any pending
SError.

Add the necessary macros to allow DISR to be read and converted to an
ESR.

We clear the DISR register when we enable the RAS cpufeature, and the
kernel has not executed any ESB instructions. Any value we find in DISR
must have belonged to firmware. Executing an ESB instruction is the
only way to update DISR, so we can expect firmware to have handled
any deferred SError. By the same logic we clear DISR in the idle path.

Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/assembler.h |  7 +++++++
 arch/arm64/include/asm/esr.h       |  7 +++++++
 arch/arm64/include/asm/exception.h | 14 ++++++++++++++
 arch/arm64/include/asm/processor.h |  1 +
 arch/arm64/include/asm/sysreg.h    |  1 +
 arch/arm64/kernel/cpufeature.c     |  9 +++++++++
 arch/arm64/mm/proc.S               |  5 +++++
 7 files changed, 44 insertions(+)

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 1645e0d3a2c1..794fe8122602 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -108,6 +108,13 @@
 	dmb	\opt
 	.endm
 
+/*
+ * RAS Error Synchronization barrier
+ */
+	.macro  esb
+	hint    #16
+	.endm
+
 /*
  * NOP sequence
  */
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index c367838700fa..803443d74926 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -140,6 +140,13 @@
 #define ESR_ELx_WFx_ISS_WFE	(UL(1) << 0)
 #define ESR_ELx_xVC_IMM_MASK	((1UL << 16) - 1)
 
+#define DISR_EL1_IDS		(UL(1) << 24)
+/*
+ * DISR_EL1 and ESR_ELx share the bottom 13 bits, but the RES0 bits may mean
+ * different things in the future...
+ */
+#define DISR_EL1_ESR_MASK	(ESR_ELx_AET | ESR_ELx_EA | ESR_ELx_FSC)
+
 /* ESR value templates for specific events */
 
 /* BRK instruction trap from AArch64 state */
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 0c2eec490abf..bc30429d8e91 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -18,6 +18,8 @@
 #ifndef __ASM_EXCEPTION_H
 #define __ASM_EXCEPTION_H
 
+#include <asm/esr.h>
+
 #include <linux/interrupt.h>
 
 #define __exception	__attribute__((section(".exception.text")))
@@ -27,4 +29,16 @@
 #define __exception_irq_entry	__exception
 #endif
 
+static inline u32 disr_to_esr(u64 disr)
+{
+	unsigned int esr = ESR_ELx_EC_SERROR << ESR_ELx_EC_SHIFT;
+
+	if ((disr & DISR_EL1_IDS) == 0)
+		esr |= (disr & DISR_EL1_ESR_MASK);
+	else
+		esr |= (disr & ESR_ELx_ISS_MASK);
+
+	return esr;
+}
+
 #endif	/* __ASM_EXCEPTION_H */
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 023cacb946c3..cee4ae25a5d1 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -216,6 +216,7 @@ static inline void spin_lock_prefetch(const void *ptr)
 
 int cpu_enable_pan(void *__unused);
 int cpu_enable_cache_maint_trap(void *__unused);
+int cpu_clear_disr(void *__unused);
 
 /* Userspace interface for PR_SVE_{SET,GET}_VL prctl()s: */
 #define SVE_SET_VL(arg)	sve_set_current_vl(arg)
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 1281bc8263c2..b5d543fc677d 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -279,6 +279,7 @@
 #define SYS_AMAIR_EL1			sys_reg(3, 0, 10, 3, 0)
 
 #define SYS_VBAR_EL1			sys_reg(3, 0, 12, 0, 0)
+#define SYS_DISR_EL1			sys_reg(3, 0, 12, 1, 1)
 
 #define SYS_ICC_IAR0_EL1		sys_reg(3, 0, 12, 8, 0)
 #define SYS_ICC_EOIR0_EL1		sys_reg(3, 0, 12, 8, 1)
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 116a5c9443a7..5612d6f46331 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1039,6 +1039,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.sign = FTR_UNSIGNED,
 		.field_pos = ID_AA64PFR0_RAS_SHIFT,
 		.min_field_value = ID_AA64PFR0_RAS_V1,
+		.enable = cpu_clear_disr,
 	},
 #endif /* CONFIG_ARM64_RAS_EXTN */
 	{},
@@ -1470,3 +1471,11 @@ static int __init enable_mrs_emulation(void)
 }
 
 core_initcall(enable_mrs_emulation);
+
+int cpu_clear_disr(void *__unused)
+{
+	/* Firmware may have left a deferred SError in this register. */
+	write_sysreg_s(0, SYS_DISR_EL1);
+
+	return 0;
+}
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index ec4c3d82b4d6..05e4ae934b23 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -132,6 +132,11 @@ alternative_endif
 	ubfx	x11, x11, #1, #1
 	msr	oslar_el1, x11
 	reset_pmuserenr_el0 x0			// Disable PMU access from EL0
+
+alternative_if ARM64_HAS_RAS_EXTN
+	msr_s	SYS_DISR_EL1, xzr
+alternative_else_nop_endif
+
 	isb
 	ret
 ENDPROC(cpu_do_resume)

From 4f5abad9e826bd579b0661efa32682d9c9bc3fa8 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 15 Jan 2018 19:39:00 +0000
Subject: [PATCH 091/104] KVM: arm/arm64: mask/unmask daif around VHE guests

Non-VHE systems take an exception to EL2 in order to world-switch into the
guest. When returning from the guest KVM implicitly restores the DAIF
flags when it returns to the kernel at EL1.

With VHE none of this exception-level jumping happens, so KVMs
world-switch code is exposed to the host kernel's DAIF values, and KVM
spills the guest-exit DAIF values back into the host kernel.
On entry to a guest we have Debug and SError exceptions unmasked, KVM
has switched VBAR but isn't prepared to handle these. On guest exit
Debug exceptions are left disabled once we return to the host and will
stay this way until we enter user space.

Add a helper to mask/unmask DAIF around VHE guests. The unmask can only
happen after the hosts VBAR value has been synchronised by the isb in
__vhe_hyp_call (via kvm_call_hyp()). Masking could be as late as
setting KVMs VBAR value, but is kept here for symmetry.

Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm/include/asm/kvm_host.h   |  2 ++
 arch/arm64/include/asm/kvm_host.h | 10 ++++++++++
 virt/kvm/arm/arm.c                |  4 ++++
 3 files changed, 16 insertions(+)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index a9f7d3f47134..b86fc4162539 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -301,4 +301,6 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
 /* All host FP/SIMD state is restored on guest exit, so nothing to save: */
 static inline void kvm_fpsimd_flush_cpu_state(void) {}
 
+static inline void kvm_arm_vhe_guest_enter(void) {}
+static inline void kvm_arm_vhe_guest_exit(void) {}
 #endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 7ee72b402907..dcdd08edf5a5 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -25,6 +25,7 @@
 #include <linux/types.h>
 #include <linux/kvm_types.h>
 #include <asm/cpufeature.h>
+#include <asm/daifflags.h>
 #include <asm/fpsimd.h>
 #include <asm/kvm.h>
 #include <asm/kvm_asm.h>
@@ -398,4 +399,13 @@ static inline void kvm_fpsimd_flush_cpu_state(void)
 		sve_flush_cpu_state();
 }
 
+static inline void kvm_arm_vhe_guest_enter(void)
+{
+	local_daif_mask();
+}
+
+static inline void kvm_arm_vhe_guest_exit(void)
+{
+	local_daif_restore(DAIF_PROCCTX_NOIRQ);
+}
 #endif /* __ARM64_KVM_HOST_H__ */
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 2fc6009a766c..38e81631fc91 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -704,9 +704,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		 */
 		trace_kvm_entry(*vcpu_pc(vcpu));
 		guest_enter_irqoff();
+		if (has_vhe())
+			kvm_arm_vhe_guest_enter();
 
 		ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
 
+		if (has_vhe())
+			kvm_arm_vhe_guest_exit();
 		vcpu->mode = OUTSIDE_GUEST_MODE;
 		vcpu->stat.exits++;
 		/*

From 4715c14bc136687bb79d12e24aafdc0f38786eb7 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 15 Jan 2018 19:39:01 +0000
Subject: [PATCH 092/104] KVM: arm64: Set an impdef ESR for Virtual-SError
 using VSESR_EL2.

Prior to v8.2's RAS Extensions, the HCR_EL2.VSE 'virtual SError' feature
generated an SError with an implementation defined ESR_EL1.ISS, because we
had no mechanism to specify the ESR value.

On Juno this generates an all-zero ESR, the most significant bit 'ISV'
is clear indicating the remainder of the ISS field is invalid.

With the RAS Extensions we have a mechanism to specify this value, and the
most significant bit has a new meaning: 'IDS - Implementation Defined
Syndrome'. An all-zero SError ESR now means: 'RAS error: Uncategorized'
instead of 'no valid ISS'.

Add KVM support for the VSESR_EL2 register to specify an ESR value when
HCR_EL2.VSE generates a virtual SError. Change kvm_inject_vabt() to
specify an implementation-defined value.

We only need to restore the VSESR_EL2 value when HCR_EL2.VSE is set, KVM
save/restores this bit during __{,de}activate_traps() and hardware clears the
bit once the guest has consumed the virtual-SError.

Future patches may add an API (or KVM CAP) to pend a virtual SError with
a specified ESR.

Cc: Dongjiu Geng <gengdongjiu@huawei.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/kvm_emulate.h |  5 +++++
 arch/arm64/include/asm/kvm_host.h    |  3 +++
 arch/arm64/include/asm/sysreg.h      |  1 +
 arch/arm64/kvm/hyp/switch.c          |  3 +++
 arch/arm64/kvm/inject_fault.c        | 13 ++++++++++++-
 5 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 5f28dfa14cee..6d3614795197 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -64,6 +64,11 @@ static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
 	vcpu->arch.hcr_el2 = hcr;
 }
 
+static inline void vcpu_set_vsesr(struct kvm_vcpu *vcpu, u64 vsesr)
+{
+	vcpu->arch.vsesr_el2 = vsesr;
+}
+
 static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
 {
 	return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index dcdd08edf5a5..3014b39b8fe2 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -280,6 +280,9 @@ struct kvm_vcpu_arch {
 
 	/* Detect first run of a vcpu */
 	bool has_run_once;
+
+	/* Virtual SError ESR to restore when HCR_EL2.VSE is set */
+	u64 vsesr_el2;
 };
 
 #define vcpu_gp_regs(v)		(&(v)->arch.ctxt.gp_regs)
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index b5d543fc677d..52cfdc216bcf 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -355,6 +355,7 @@
 
 #define SYS_DACR32_EL2			sys_reg(3, 4, 3, 0, 0)
 #define SYS_IFSR32_EL2			sys_reg(3, 4, 5, 0, 1)
+#define SYS_VSESR_EL2			sys_reg(3, 4, 5, 2, 3)
 #define SYS_FPEXC32_EL2			sys_reg(3, 4, 5, 3, 0)
 
 #define __SYS__AP0Rx_EL2(x)		sys_reg(3, 4, 12, 8, x)
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 324f4202cdd5..b425b8aab45b 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -94,6 +94,9 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
 
 	write_sysreg(val, hcr_el2);
 
+	if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN) && (val & HCR_VSE))
+		write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2);
+
 	/* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */
 	write_sysreg(1 << 15, hstr_el2);
 	/*
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index 8ecbcb40e317..60666a056944 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -164,14 +164,25 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu)
 		inject_undef64(vcpu);
 }
 
+static void pend_guest_serror(struct kvm_vcpu *vcpu, u64 esr)
+{
+	vcpu_set_vsesr(vcpu, esr);
+	vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VSE);
+}
+
 /**
  * kvm_inject_vabt - inject an async abort / SError into the guest
  * @vcpu: The VCPU to receive the exception
  *
  * It is assumed that this code is called from the VCPU thread and that the
  * VCPU therefore is not currently executing guest code.
+ *
+ * Systems with the RAS Extensions specify an imp-def ESR (ISV/IDS = 1) with
+ * the remaining ISS all-zeros so that this error is not interpreted as an
+ * uncategorized RAS error. Without the RAS Extensions we can't specify an ESR
+ * value, so the CPU generates an imp-def value.
  */
 void kvm_inject_vabt(struct kvm_vcpu *vcpu)
 {
-	vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VSE);
+	pend_guest_serror(vcpu, ESR_ELx_ISV);
 }

From c773ae2b34760a1ae409614aa31cdded81a645a5 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 15 Jan 2018 19:39:02 +0000
Subject: [PATCH 093/104] KVM: arm64: Save/Restore guest DISR_EL1

If we deliver a virtual SError to the guest, the guest may defer it
with an ESB instruction. The guest reads the deferred value via DISR_EL1,
but the guests view of DISR_EL1 is re-mapped to VDISR_EL2 when HCR_EL2.AMO
is set.

Add the KVM code to save/restore VDISR_EL2, and make it accessible to
userspace as DISR_EL1.

Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/kvm_host.h | 1 +
 arch/arm64/include/asm/sysreg.h   | 1 +
 arch/arm64/kvm/hyp/sysreg-sr.c    | 6 ++++++
 arch/arm64/kvm/sys_regs.c         | 1 +
 4 files changed, 9 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3014b39b8fe2..84fcb2a896a1 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -121,6 +121,7 @@ enum vcpu_sysreg {
 	PAR_EL1,	/* Physical Address Register */
 	MDSCR_EL1,	/* Monitor Debug System Control Register */
 	MDCCINT_EL1,	/* Monitor Debug Comms Channel Interrupt Enable Reg */
+	DISR_EL1,	/* Deferred Interrupt Status Register */
 
 	/* Performance Monitors Registers */
 	PMCR_EL0,	/* Control Register */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 52cfdc216bcf..53d99c0944b1 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -358,6 +358,7 @@
 #define SYS_VSESR_EL2			sys_reg(3, 4, 5, 2, 3)
 #define SYS_FPEXC32_EL2			sys_reg(3, 4, 5, 3, 0)
 
+#define SYS_VDISR_EL2			sys_reg(3, 4, 12, 1,  1)
 #define __SYS__AP0Rx_EL2(x)		sys_reg(3, 4, 12, 8, x)
 #define SYS_ICH_AP0R0_EL2		__SYS__AP0Rx_EL2(0)
 #define SYS_ICH_AP0R1_EL2		__SYS__AP0Rx_EL2(1)
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index c54cc2afb92b..2c17afd2be96 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -66,6 +66,9 @@ static void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt)
 	ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(spsr);
 	ctxt->gp_regs.regs.pc		= read_sysreg_el2(elr);
 	ctxt->gp_regs.regs.pstate	= read_sysreg_el2(spsr);
+
+	if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN))
+		ctxt->sys_regs[DISR_EL1] = read_sysreg_s(SYS_VDISR_EL2);
 }
 
 static hyp_alternate_select(__sysreg_call_save_host_state,
@@ -119,6 +122,9 @@ static void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt)
 	write_sysreg_el1(ctxt->gp_regs.spsr[KVM_SPSR_EL1],spsr);
 	write_sysreg_el2(ctxt->gp_regs.regs.pc,		elr);
 	write_sysreg_el2(ctxt->gp_regs.regs.pstate,	spsr);
+
+	if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN))
+		write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2);
 }
 
 static hyp_alternate_select(__sysreg_call_restore_host_state,
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 1830ebc227d1..9edf4ac8a320 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1169,6 +1169,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	{ SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 },
 
 	{ SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 },
+	{ SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 },
 
 	{ SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only },
 	{ SYS_DESC(SYS_ICC_EOIR0_EL1), read_from_write_only },

From c60590b552bdf682043579b9b965e6224fbf65d9 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 15 Jan 2018 19:39:03 +0000
Subject: [PATCH 094/104] KVM: arm64: Save ESR_EL2 on guest SError

When we exit a guest due to an SError the vcpu fault info isn't updated
with the ESR. Today this is only done for traps.

The v8.2 RAS Extensions define ISS values for SError. Update the vcpu's
fault_info with the ESR on SError so that handle_exit() can determine
if this was a RAS SError and decode its severity.

Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kvm/hyp/switch.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index b425b8aab45b..036e1f3d77a6 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -239,11 +239,12 @@ static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar)
 
 static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
 {
-	u64 esr = read_sysreg_el2(esr);
-	u8 ec = ESR_ELx_EC(esr);
+	u8 ec;
+	u64 esr;
 	u64 hpfar, far;
 
-	vcpu->arch.fault.esr_el2 = esr;
+	esr = vcpu->arch.fault.esr_el2;
+	ec = ESR_ELx_EC(esr);
 
 	if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW)
 		return true;
@@ -336,6 +337,8 @@ again:
 	exit_code = __guest_enter(vcpu, host_ctxt);
 	/* And we're baaack! */
 
+	if (ARM_EXCEPTION_CODE(exit_code) != ARM_EXCEPTION_IRQ)
+		vcpu->arch.fault.esr_el2 = read_sysreg_el2(esr);
 	/*
 	 * We're using the raw exception code in order to only process
 	 * the trap if no SError is pending. We will come back to the

From 3368bd809764d3ef0810e16c1e1531fec32e8d8e Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 15 Jan 2018 19:39:04 +0000
Subject: [PATCH 095/104] KVM: arm64: Handle RAS SErrors from EL1 on guest exit

We expect to have firmware-first handling of RAS SErrors, with errors
notified via an APEI method. For systems without firmware-first, add
some minimal handling to KVM.

There are two ways KVM can take an SError due to a guest, either may be a
RAS error: we exit the guest due to an SError routed to EL2 by HCR_EL2.AMO,
or we take an SError from EL2 when we unmask PSTATE.A from __guest_exit.

For SError that interrupt a guest and are routed to EL2 the existing
behaviour is to inject an impdef SError into the guest.

Add code to handle RAS SError based on the ESR. For uncontained and
uncategorized errors arm64_is_fatal_ras_serror() will panic(), these
errors compromise the host too. All other error types are contained:
For the fatal errors the vCPU can't make progress, so we inject a virtual
SError. We ignore contained errors where we can make progress as if
we're lucky, we may not hit them again.

If only some of the CPUs support RAS the guest will see the cpufeature
sanitised version of the id registers, but we may still take RAS SError
on this CPU. Move the SError handling out of handle_exit() into a new
handler that runs before we can be preempted. This allows us to use
this_cpu_has_cap(), via arm64_is_ras_serror().

Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm/include/asm/kvm_host.h   |  3 +++
 arch/arm64/include/asm/kvm_host.h |  2 ++
 arch/arm64/kvm/handle_exit.c      | 18 +++++++++++++++++-
 virt/kvm/arm/arm.c                |  3 +++
 4 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index b86fc4162539..acbf9ec7b396 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -238,6 +238,9 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		int exception_index);
 
+static inline void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run,
+				     int exception_index) {}
+
 static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
 				       unsigned long hyp_stack_ptr,
 				       unsigned long vector_ptr)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 84fcb2a896a1..abcfd164e690 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -347,6 +347,8 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
 
 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		int exception_index);
+void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run,
+		       int exception_index);
 
 int kvm_perf_init(void);
 int kvm_perf_teardown(void);
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 304203fa9e33..6a5a5db4292f 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -29,12 +29,19 @@
 #include <asm/kvm_mmu.h>
 #include <asm/kvm_psci.h>
 #include <asm/debug-monitors.h>
+#include <asm/traps.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
 typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *);
 
+static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u32 esr)
+{
+	if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(NULL, esr))
+		kvm_inject_vabt(vcpu);
+}
+
 static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
 	int ret;
@@ -252,7 +259,6 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 	case ARM_EXCEPTION_IRQ:
 		return 1;
 	case ARM_EXCEPTION_EL1_SERROR:
-		kvm_inject_vabt(vcpu);
 		/* We may still need to return for single-step */
 		if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)
 			&& kvm_arm_handle_step_debug(vcpu, run))
@@ -275,3 +281,13 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		return 0;
 	}
 }
+
+/* For exit types that need handling before we can be preempted */
+void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run,
+		       int exception_index)
+{
+	exception_index = ARM_EXCEPTION_CODE(exception_index);
+
+	if (exception_index == ARM_EXCEPTION_EL1_SERROR)
+		kvm_handle_guest_serror(vcpu, kvm_vcpu_get_hsr(vcpu));
+}
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 38e81631fc91..15bf026eb182 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -763,6 +763,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		guest_exit();
 		trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
 
+		/* Exit types that need handling before we can be preempted */
+		handle_exit_early(vcpu, run, ret);
+
 		preempt_enable();
 
 		ret = handle_exit(vcpu, run, ret);

From 0067df413bd9d7e9ee3a78ece1e1a93535378862 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 15 Jan 2018 19:39:05 +0000
Subject: [PATCH 096/104] KVM: arm64: Handle RAS SErrors from EL2 on guest exit

We expect to have firmware-first handling of RAS SErrors, with errors
notified via an APEI method. For systems without firmware-first, add
some minimal handling to KVM.

There are two ways KVM can take an SError due to a guest, either may be a
RAS error: we exit the guest due to an SError routed to EL2 by HCR_EL2.AMO,
or we take an SError from EL2 when we unmask PSTATE.A from __guest_exit.

The current SError from EL2 code unmasks SError and tries to fence any
pending SError into a single instruction window. It then leaves SError
unmasked.

With the v8.2 RAS Extensions we may take an SError for a 'corrected'
error, but KVM is only able to handle SError from EL2 if they occur
during this single instruction window...

The RAS Extensions give us a new instruction to synchronise and
consume SErrors. The RAS Extensions document (ARM DDI0587),
'2.4.1 ESB and Unrecoverable errors' describes ESB as synchronising
SError interrupts generated by 'instructions, translation table walks,
hardware updates to the translation tables, and instruction fetches on
the same PE'. This makes ESB equivalent to KVMs existing
'dsb, mrs-daifclr, isb' sequence.

Use the alternatives to synchronise and consume any SError using ESB
instead of unmasking and taking the SError. Set ARM_EXIT_WITH_SERROR_BIT
in the exit_code so that we can restart the vcpu if it turns out this
SError has no impact on the vcpu.

Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/kvm_emulate.h |  5 +++++
 arch/arm64/include/asm/kvm_host.h    |  1 +
 arch/arm64/kernel/asm-offsets.c      |  1 +
 arch/arm64/kvm/handle_exit.c         | 14 +++++++++++++-
 arch/arm64/kvm/hyp/entry.S           | 13 +++++++++++++
 5 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 6d3614795197..e002ab7f919a 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -176,6 +176,11 @@ static inline phys_addr_t kvm_vcpu_get_fault_ipa(const struct kvm_vcpu *vcpu)
 	return ((phys_addr_t)vcpu->arch.fault.hpfar_el2 & HPFAR_MASK) << 8;
 }
 
+static inline u64 kvm_vcpu_get_disr(const struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.fault.disr_el1;
+}
+
 static inline u32 kvm_vcpu_hvc_get_imm(const struct kvm_vcpu *vcpu)
 {
 	return kvm_vcpu_get_hsr(vcpu) & ESR_ELx_xVC_IMM_MASK;
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index abcfd164e690..4485ae8e98de 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -90,6 +90,7 @@ struct kvm_vcpu_fault_info {
 	u32 esr_el2;		/* Hyp Syndrom Register */
 	u64 far_el2;		/* Hyp Fault Address Register */
 	u64 hpfar_el2;		/* Hyp IPA Fault Address Register */
+	u64 disr_el1;		/* Deferred [SError] Status Register */
 };
 
 /*
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 1dcc493f5765..1303e04110cd 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -132,6 +132,7 @@ int main(void)
   BLANK();
 #ifdef CONFIG_KVM_ARM_HOST
   DEFINE(VCPU_CONTEXT,		offsetof(struct kvm_vcpu, arch.ctxt));
+  DEFINE(VCPU_FAULT_DISR,	offsetof(struct kvm_vcpu, arch.fault.disr_el1));
   DEFINE(CPU_GP_REGS,		offsetof(struct kvm_cpu_context, gp_regs));
   DEFINE(CPU_USER_PT_REGS,	offsetof(struct kvm_regs, regs));
   DEFINE(CPU_FP_REGS,		offsetof(struct kvm_regs, fp_regs));
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 6a5a5db4292f..c09fc5a576c7 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -23,6 +23,7 @@
 #include <linux/kvm_host.h>
 
 #include <asm/esr.h>
+#include <asm/exception.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_coproc.h>
 #include <asm/kvm_emulate.h>
@@ -249,7 +250,6 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 			*vcpu_pc(vcpu) -= adj;
 		}
 
-		kvm_inject_vabt(vcpu);
 		return 1;
 	}
 
@@ -286,6 +286,18 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		       int exception_index)
 {
+	if (ARM_SERROR_PENDING(exception_index)) {
+		if (this_cpu_has_cap(ARM64_HAS_RAS_EXTN)) {
+			u64 disr = kvm_vcpu_get_disr(vcpu);
+
+			kvm_handle_guest_serror(vcpu, disr_to_esr(disr));
+		} else {
+			kvm_inject_vabt(vcpu);
+		}
+
+		return;
+	}
+
 	exception_index = ARM_EXCEPTION_CODE(exception_index);
 
 	if (exception_index == ARM_EXCEPTION_EL1_SERROR)
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index fe4678f20a85..fdd1068ee3a5 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -124,6 +124,17 @@ ENTRY(__guest_exit)
 	// Now restore the host regs
 	restore_callee_saved_regs x2
 
+alternative_if ARM64_HAS_RAS_EXTN
+	// If we have the RAS extensions we can consume a pending error
+	// without an unmask-SError and isb.
+	esb
+	mrs_s	x2, SYS_DISR_EL1
+	str	x2, [x1, #(VCPU_FAULT_DISR - VCPU_CONTEXT)]
+	cbz	x2, 1f
+	msr_s	SYS_DISR_EL1, xzr
+	orr	x0, x0, #(1<<ARM_EXIT_WITH_SERROR_BIT)
+1:	ret
+alternative_else
 	// If we have a pending asynchronous abort, now is the
 	// time to find out. From your VAXorcist book, page 666:
 	// "Threaten me not, oh Evil one!  For I speak with
@@ -134,7 +145,9 @@ ENTRY(__guest_exit)
 	mov	x5, x0
 
 	dsb	sy		// Synchronize against in-flight ld/st
+	nop
 	msr	daifclr, #4	// Unmask aborts
+alternative_endif
 
 	// This is our single instruction exception window. A pending
 	// SError is guaranteed to occur at the earliest when we unmask

From 558daf693e0c7ea118dbfb9539aa5a72f34bad2e Mon Sep 17 00:00:00 2001
From: Dongjiu Geng <gengdongjiu@huawei.com>
Date: Mon, 15 Jan 2018 19:39:06 +0000
Subject: [PATCH 097/104] KVM: arm64: Emulate RAS error registers and set
 HCR_EL2's TERR & TEA

ARMv8.2 adds a new bit HCR_EL2.TEA which routes synchronous external
aborts to EL2, and adds a trap control bit HCR_EL2.TERR which traps
all Non-secure EL1&0 error record accesses to EL2.

This patch enables the two bits for the guest OS, guaranteeing that
KVM takes external aborts and traps attempts to access the physical
error registers.

ERRIDR_EL1 advertises the number of error records, we return
zero meaning we can treat all the other registers as RAZ/WI too.

Signed-off-by: Dongjiu Geng <gengdongjiu@huawei.com>
[removed specific emulation, use trap_raz_wi() directly for everything,
 rephrased parts of the commit message]
Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/kvm_arm.h     |  2 ++
 arch/arm64/include/asm/kvm_emulate.h |  7 +++++++
 arch/arm64/include/asm/sysreg.h      | 10 ++++++++++
 arch/arm64/kvm/sys_regs.c            | 10 ++++++++++
 4 files changed, 29 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 715d395ef45b..b0c84171e6a3 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -23,6 +23,8 @@
 #include <asm/types.h>
 
 /* Hyp Configuration Register (HCR) bits */
+#define HCR_TEA		(UL(1) << 37)
+#define HCR_TERR	(UL(1) << 36)
 #define HCR_E2H		(UL(1) << 34)
 #define HCR_ID		(UL(1) << 33)
 #define HCR_CD		(UL(1) << 32)
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index e002ab7f919a..413dc82b1e89 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -50,6 +50,13 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
 	vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
 	if (is_kernel_in_hyp_mode())
 		vcpu->arch.hcr_el2 |= HCR_E2H;
+	if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) {
+		/* route synchronous external abort exceptions to EL2 */
+		vcpu->arch.hcr_el2 |= HCR_TEA;
+		/* trap error record accesses */
+		vcpu->arch.hcr_el2 |= HCR_TERR;
+	}
+
 	if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features))
 		vcpu->arch.hcr_el2 &= ~HCR_RW;
 }
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 53d99c0944b1..0e1960c59197 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -176,6 +176,16 @@
 #define SYS_AFSR0_EL1			sys_reg(3, 0, 5, 1, 0)
 #define SYS_AFSR1_EL1			sys_reg(3, 0, 5, 1, 1)
 #define SYS_ESR_EL1			sys_reg(3, 0, 5, 2, 0)
+
+#define SYS_ERRIDR_EL1			sys_reg(3, 0, 5, 3, 0)
+#define SYS_ERRSELR_EL1			sys_reg(3, 0, 5, 3, 1)
+#define SYS_ERXFR_EL1			sys_reg(3, 0, 5, 4, 0)
+#define SYS_ERXCTLR_EL1			sys_reg(3, 0, 5, 4, 1)
+#define SYS_ERXSTATUS_EL1		sys_reg(3, 0, 5, 4, 2)
+#define SYS_ERXADDR_EL1			sys_reg(3, 0, 5, 4, 3)
+#define SYS_ERXMISC0_EL1		sys_reg(3, 0, 5, 5, 0)
+#define SYS_ERXMISC1_EL1		sys_reg(3, 0, 5, 5, 1)
+
 #define SYS_FAR_EL1			sys_reg(3, 0, 6, 0, 0)
 #define SYS_PAR_EL1			sys_reg(3, 0, 7, 4, 0)
 
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 9edf4ac8a320..50a43c7b97ca 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1159,6 +1159,16 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	{ SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 },
 	{ SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 },
 	{ SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 },
+
+	{ SYS_DESC(SYS_ERRIDR_EL1), trap_raz_wi },
+	{ SYS_DESC(SYS_ERRSELR_EL1), trap_raz_wi },
+	{ SYS_DESC(SYS_ERXFR_EL1), trap_raz_wi },
+	{ SYS_DESC(SYS_ERXCTLR_EL1), trap_raz_wi },
+	{ SYS_DESC(SYS_ERXSTATUS_EL1), trap_raz_wi },
+	{ SYS_DESC(SYS_ERXADDR_EL1), trap_raz_wi },
+	{ SYS_DESC(SYS_ERXMISC0_EL1), trap_raz_wi },
+	{ SYS_DESC(SYS_ERXMISC1_EL1), trap_raz_wi },
+
 	{ SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 },
 	{ SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 },
 

From 6b88a32c7af68895134872cdec3b6bfdb532d94e Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Wed, 10 Jan 2018 13:18:30 +0000
Subject: [PATCH 098/104] arm64: kpti: Fix the interaction between ASID
 switching and software PAN

With ARM64_SW_TTBR0_PAN enabled, the exception entry code checks the
active ASID to decide whether user access was enabled (non-zero ASID)
when the exception was taken. On return from exception, if user access
was previously disabled, it re-instates TTBR0_EL1 from the per-thread
saved value (updated in switch_mm() or efi_set_pgd()).

Commit 7655abb95386 ("arm64: mm: Move ASID from TTBR0 to TTBR1") makes a
TTBR0_EL1 + ASID switching non-atomic. Subsequently, commit 27a921e75711
("arm64: mm: Fix and re-enable ARM64_SW_TTBR0_PAN") changes the
__uaccess_ttbr0_disable() function and asm macro to first write the
reserved TTBR0_EL1 followed by the ASID=0 update in TTBR1_EL1. If an
exception occurs between these two, the exception return code will
re-instate a valid TTBR0_EL1. Similar scenario can happen in
cpu_switch_mm() between setting the reserved TTBR0_EL1 and the ASID
update in cpu_do_switch_mm().

This patch reverts the entry.S check for ASID == 0 to TTBR0_EL1 and
disables the interrupts around the TTBR0_EL1 and ASID switching code in
__uaccess_ttbr0_disable(). It also ensures that, when returning from the
EFI runtime services, efi_set_pgd() doesn't leave a non-zero ASID in
TTBR1_EL1 by using uaccess_ttbr0_{enable,disable}.

The accesses to current_thread_info()->ttbr0 are updated to use
READ_ONCE/WRITE_ONCE.

As a safety measure, __uaccess_ttbr0_enable() always masks out any
existing non-zero ASID TTBR1_EL1 before writing in the new ASID.

Fixes: 27a921e75711 ("arm64: mm: Fix and re-enable ARM64_SW_TTBR0_PAN")
Acked-by: Will Deacon <will.deacon@arm.com>
Reported-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: James Morse <james.morse@arm.com>
Tested-by: James Morse <james.morse@arm.com>
Co-developed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/asm-uaccess.h | 12 +++++++-----
 arch/arm64/include/asm/efi.h         | 12 +++++++-----
 arch/arm64/include/asm/mmu_context.h |  3 ++-
 arch/arm64/include/asm/uaccess.h     |  9 ++++++---
 arch/arm64/kernel/entry.S            |  2 +-
 arch/arm64/lib/clear_user.S          |  2 +-
 arch/arm64/lib/copy_from_user.S      |  2 +-
 arch/arm64/lib/copy_in_user.S        |  2 +-
 arch/arm64/lib/copy_to_user.S        |  2 +-
 arch/arm64/mm/cache.S                |  2 +-
 arch/arm64/mm/proc.S                 |  3 +++
 arch/arm64/xen/hypercall.S           |  2 +-
 12 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 8719ce122a38..4128bec033f6 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -14,11 +14,11 @@
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 	.macro	__uaccess_ttbr0_disable, tmp1
 	mrs	\tmp1, ttbr1_el1			// swapper_pg_dir
+	bic	\tmp1, \tmp1, #TTBR_ASID_MASK
 	sub	\tmp1, \tmp1, #RESERVED_TTBR0_SIZE	// reserved_ttbr0 just before swapper_pg_dir
 	msr	ttbr0_el1, \tmp1			// set reserved TTBR0_EL1
 	isb
 	add	\tmp1, \tmp1, #RESERVED_TTBR0_SIZE
-	bic	\tmp1, \tmp1, #TTBR_ASID_MASK
 	msr	ttbr1_el1, \tmp1		// set reserved ASID
 	isb
 	.endm
@@ -35,9 +35,11 @@
 	isb
 	.endm
 
-	.macro	uaccess_ttbr0_disable, tmp1
+	.macro	uaccess_ttbr0_disable, tmp1, tmp2
 alternative_if_not ARM64_HAS_PAN
+	save_and_disable_irq \tmp2		// avoid preemption
 	__uaccess_ttbr0_disable \tmp1
+	restore_irq \tmp2
 alternative_else_nop_endif
 	.endm
 
@@ -49,7 +51,7 @@ alternative_if_not ARM64_HAS_PAN
 alternative_else_nop_endif
 	.endm
 #else
-	.macro	uaccess_ttbr0_disable, tmp1
+	.macro	uaccess_ttbr0_disable, tmp1, tmp2
 	.endm
 
 	.macro	uaccess_ttbr0_enable, tmp1, tmp2, tmp3
@@ -59,8 +61,8 @@ alternative_else_nop_endif
 /*
  * These macros are no-ops when UAO is present.
  */
-	.macro	uaccess_disable_not_uao, tmp1
-	uaccess_ttbr0_disable \tmp1
+	.macro	uaccess_disable_not_uao, tmp1, tmp2
+	uaccess_ttbr0_disable \tmp1, \tmp2
 alternative_if ARM64_ALT_PAN_NOT_UAO
 	SET_PSTATE_PAN(1)
 alternative_else_nop_endif
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index c4cd5081d78b..8389050328bb 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -121,19 +121,21 @@ static inline void efi_set_pgd(struct mm_struct *mm)
 		if (mm != current->active_mm) {
 			/*
 			 * Update the current thread's saved ttbr0 since it is
-			 * restored as part of a return from exception. Set
-			 * the hardware TTBR0_EL1 using cpu_switch_mm()
-			 * directly to enable potential errata workarounds.
+			 * restored as part of a return from exception. Enable
+			 * access to the valid TTBR0_EL1 and invoke the errata
+			 * workaround directly since there is no return from
+			 * exception when invoking the EFI run-time services.
 			 */
 			update_saved_ttbr0(current, mm);
-			cpu_switch_mm(mm->pgd, mm);
+			uaccess_ttbr0_enable();
+			post_ttbr_update_workaround();
 		} else {
 			/*
 			 * Defer the switch to the current thread's TTBR0_EL1
 			 * until uaccess_enable(). Restore the current
 			 * thread's saved ttbr0 corresponding to its active_mm
 			 */
-			cpu_set_reserved_ttbr0();
+			uaccess_ttbr0_disable();
 			update_saved_ttbr0(current, current->active_mm);
 		}
 	}
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 5aee19905556..8d3331985d2e 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -184,7 +184,7 @@ static inline void update_saved_ttbr0(struct task_struct *tsk,
 	else
 		ttbr = virt_to_phys(mm->pgd) | ASID(mm) << 48;
 
-	task_thread_info(tsk)->ttbr0 = ttbr;
+	WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr);
 }
 #else
 static inline void update_saved_ttbr0(struct task_struct *tsk,
@@ -239,6 +239,7 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 #define activate_mm(prev,next)	switch_mm(prev, next, current)
 
 void verify_cpu_asid_bits(void);
+void post_ttbr_update_workaround(void);
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 01ade20d5456..59fda5292936 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -105,16 +105,18 @@ static inline void set_fs(mm_segment_t fs)
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 static inline void __uaccess_ttbr0_disable(void)
 {
-	unsigned long ttbr;
+	unsigned long flags, ttbr;
 
+	local_irq_save(flags);
 	ttbr = read_sysreg(ttbr1_el1);
+	ttbr &= ~TTBR_ASID_MASK;
 	/* reserved_ttbr0 placed before swapper_pg_dir */
 	write_sysreg(ttbr - RESERVED_TTBR0_SIZE, ttbr0_el1);
 	isb();
 	/* Set reserved ASID */
-	ttbr &= ~TTBR_ASID_MASK;
 	write_sysreg(ttbr, ttbr1_el1);
 	isb();
+	local_irq_restore(flags);
 }
 
 static inline void __uaccess_ttbr0_enable(void)
@@ -127,10 +129,11 @@ static inline void __uaccess_ttbr0_enable(void)
 	 * roll-over and an update of 'ttbr0'.
 	 */
 	local_irq_save(flags);
-	ttbr0 = current_thread_info()->ttbr0;
+	ttbr0 = READ_ONCE(current_thread_info()->ttbr0);
 
 	/* Restore active ASID */
 	ttbr1 = read_sysreg(ttbr1_el1);
+	ttbr1 &= ~TTBR_ASID_MASK;		/* safety measure */
 	ttbr1 |= ttbr0 & TTBR_ASID_MASK;
 	write_sysreg(ttbr1, ttbr1_el1);
 	isb();
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 1e1d1842888d..b34e717d7597 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -204,7 +204,7 @@ alternative_if ARM64_HAS_PAN
 alternative_else_nop_endif
 
 	.if	\el != 0
-	mrs	x21, ttbr1_el1
+	mrs	x21, ttbr0_el1
 	tst	x21, #TTBR_ASID_MASK		// Check for the reserved ASID
 	orr	x23, x23, #PSR_PAN_BIT		// Set the emulated PAN in the saved SPSR
 	b.eq	1f				// TTBR0 access already disabled
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index 8f9c4641e706..3d69a8d41fa5 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -50,7 +50,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2
 	b.mi	5f
 uao_user_alternative 9f, strb, sttrb, wzr, x0, 0
 5:	mov	x0, #0
-	uaccess_disable_not_uao x2
+	uaccess_disable_not_uao x2, x3
 	ret
 ENDPROC(__clear_user)
 
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 69d86a80f3e2..20305d485046 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -67,7 +67,7 @@ ENTRY(__arch_copy_from_user)
 	uaccess_enable_not_uao x3, x4, x5
 	add	end, x0, x2
 #include "copy_template.S"
-	uaccess_disable_not_uao x3
+	uaccess_disable_not_uao x3, x4
 	mov	x0, #0				// Nothing to copy
 	ret
 ENDPROC(__arch_copy_from_user)
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
index e442b531252a..fbb090f431a5 100644
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -68,7 +68,7 @@ ENTRY(raw_copy_in_user)
 	uaccess_enable_not_uao x3, x4, x5
 	add	end, x0, x2
 #include "copy_template.S"
-	uaccess_disable_not_uao x3
+	uaccess_disable_not_uao x3, x4
 	mov	x0, #0
 	ret
 ENDPROC(raw_copy_in_user)
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 318f15d5c336..fda6172d6b88 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -66,7 +66,7 @@ ENTRY(__arch_copy_to_user)
 	uaccess_enable_not_uao x3, x4, x5
 	add	end, x0, x2
 #include "copy_template.S"
-	uaccess_disable_not_uao x3
+	uaccess_disable_not_uao x3, x4
 	mov	x0, #0
 	ret
 ENDPROC(__arch_copy_to_user)
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 6cd20a8c0952..91464e7f77cc 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -72,7 +72,7 @@ USER(9f, ic	ivau, x4	)		// invalidate I line PoU
 	isb
 	mov	x0, #0
 1:
-	uaccess_ttbr0_disable x1
+	uaccess_ttbr0_disable x1, x2
 	ret
 9:
 	mov	x0, #-EFAULT
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 05e4ae934b23..c6a12073ef46 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -153,6 +153,9 @@ ENDPROC(cpu_do_resume)
 ENTRY(cpu_do_switch_mm)
 	mrs	x2, ttbr1_el1
 	mmid	x1, x1				// get mm->context.id
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+	bfi	x0, x1, #48, #16		// set the ASID field in TTBR0
+#endif
 	bfi	x2, x1, #48, #16		// set the ASID
 	msr	ttbr1_el1, x2			// in TTBR1 (since TCR.A1 is set)
 	isb
diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S
index acdbd2c9e899..c5f05c4a4d00 100644
--- a/arch/arm64/xen/hypercall.S
+++ b/arch/arm64/xen/hypercall.S
@@ -107,6 +107,6 @@ ENTRY(privcmd_call)
 	/*
 	 * Disable userspace access from kernel once the hyp call completed.
 	 */
-	uaccess_ttbr0_disable x6
+	uaccess_ttbr0_disable x6, x7
 	ret
 ENDPROC(privcmd_call);

From e9eaa8052fe71b95f4fea6072fa3e0b2cf0b620f Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Thu, 18 Jan 2018 19:13:11 +0000
Subject: [PATCH 099/104] arm64: mm: ignore memory above supported physical
 address size

When booting a kernel without 52-bit PA support (e.g. a kernel with 4k
pages) on a system with 52-bit memory, the kernel will currently try to
use the 52-bit memory and crash. Fix this by ignoring any memory higher
than what the kernel supports.

Fixes: f77d281713d4 ("arm64: enable 52-bit physical address support")
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/mm/init.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 672094ed7e07..285745b2ca38 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -366,6 +366,9 @@ void __init arm64_memblock_init(void)
 	/* Handle linux,usable-memory-range property */
 	fdt_enforce_memory_region();
 
+	/* Remove memory above our supported physical address size */
+	memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);
+
 	/*
 	 * Ensure that the linear region takes up exactly half of the kernel
 	 * virtual address space. This way, we can distinguish a linear address

From a8e4c0a919ae310944ed2c9ace11cf3ccd8a609b Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Fri, 19 Jan 2018 15:42:09 +0000
Subject: [PATCH 100/104] arm64: Move BP hardening to check_and_switch_context

We call arm64_apply_bp_hardening() from post_ttbr_update_workaround,
which has the unexpected consequence of being triggered on every
exception return to userspace when ARM64_SW_TTBR0_PAN is selected,
even if no context switch actually occured.

This is a bit suboptimal, and it would be more logical to only
invalidate the branch predictor when we actually switch to
a different mm.

In order to solve this, move the call to arm64_apply_bp_hardening()
into check_and_switch_context(), where we're guaranteed to pick
a different mm context.

Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/mm/context.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index ff99a880a730..301417ae2ba8 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -234,6 +234,9 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
 
 switch_mm_fastpath:
+
+	arm64_apply_bp_hardening();
+
 	/*
 	 * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when
 	 * emulating PAN.
@@ -249,8 +252,6 @@ asmlinkage void post_ttbr_update_workaround(void)
 			"ic iallu; dsb nsh; isb",
 			ARM64_WORKAROUND_CAVIUM_27456,
 			CONFIG_CAVIUM_ERRATUM_27456));
-
-	arm64_apply_bp_hardening();
 }
 
 static int asids_init(void)

From 55b35d070c2534dfb714b883f3c3ae05d02032da Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Wed, 17 Jan 2018 17:42:20 +0000
Subject: [PATCH 101/104] arm64: Run enable method for errata work arounds on
 late CPUs

When a CPU is brought up after we have finalised the system
wide capabilities (i.e, features and errata), we make sure the
new CPU doesn't need a new errata work around which has not been
detected already. However we don't run enable() method on the new
CPU for the errata work arounds already detected. This could
cause the new CPU running without potential work arounds.
It is upto the "enable()" method to decide if this CPU should
do something about the errata.

Fixes: commit 6a6efbb45b7d95c84 ("arm64: Verify CPU errata work arounds on hotplugged CPU")
Cc: Will Deacon <will.deacon@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andre Przywara <andre.przywara@arm.com>
Cc: Dave Martin <dave.martin@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/cpu_errata.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 90a9e465339c..54e41dfe41f6 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -373,15 +373,18 @@ void verify_local_cpu_errata_workarounds(void)
 {
 	const struct arm64_cpu_capabilities *caps = arm64_errata;
 
-	for (; caps->matches; caps++)
-		if (!cpus_have_cap(caps->capability) &&
-			caps->matches(caps, SCOPE_LOCAL_CPU)) {
+	for (; caps->matches; caps++) {
+		if (cpus_have_cap(caps->capability)) {
+			if (caps->enable)
+				caps->enable((void *)caps);
+		} else if (caps->matches(caps, SCOPE_LOCAL_CPU)) {
 			pr_crit("CPU%d: Requires work around for %s, not detected"
 					" at boot time\n",
 				smp_processor_id(),
 				caps->desc ? : "an erratum");
 			cpu_die_early();
 		}
+	}
 }
 
 void update_cpu_errata_workarounds(void)

From f3d795d9b360523beca6d13ba64c2c532f601149 Mon Sep 17 00:00:00 2001
From: Jayachandran C <jnair@caviumnetworks.com>
Date: Fri, 19 Jan 2018 04:22:47 -0800
Subject: [PATCH 102/104] arm64: Branch predictor hardening for Cavium
 ThunderX2

Use PSCI based mitigation for speculative execution attacks targeting
the branch predictor. We use the same mechanism as the one used for
Cortex-A CPUs, we expect the PSCI version call to have a side effect
of clearing the BTBs.

Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Jayachandran C <jnair@caviumnetworks.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/cpu_errata.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 54e41dfe41f6..ed6881882231 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -359,6 +359,16 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 		.capability = ARM64_HARDEN_BP_POST_GUEST_EXIT,
 		MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR_V1),
 	},
+	{
+		.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+		MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
+		.enable = enable_psci_bp_hardening,
+	},
+	{
+		.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+		MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
+		.enable = enable_psci_bp_hardening,
+	},
 #endif
 	{
 	}

From 0ba2e29c7fc1d58a90fab614d41bf487e28e3840 Mon Sep 17 00:00:00 2001
From: Jayachandran C <jnair@caviumnetworks.com>
Date: Fri, 19 Jan 2018 04:22:48 -0800
Subject: [PATCH 103/104] arm64: Turn on KPTI only on CPUs that need it

Whitelist Broadcom Vulcan/Cavium ThunderX2 processors in
unmap_kernel_at_el0(). These CPUs are not vulnerable to
CVE-2017-5754 and do not need KPTI when KASLR is off.

Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Jayachandran C <jnair@caviumnetworks.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/cpufeature.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 5612d6f46331..4b6f9051cf0c 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -868,6 +868,13 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry,
 	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
 		return true;
 
+	/* Don't force KPTI for CPUs that are not vulnerable */
+	switch (read_cpuid_id() & MIDR_CPU_MODEL_MASK) {
+	case MIDR_CAVIUM_THUNDERX2:
+	case MIDR_BRCM_VULCAN:
+		return false;
+	}
+
 	/* Defer to CPU feature registers */
 	return !cpuid_feature_extract_unsigned_field(pfr0,
 						     ID_AA64PFR0_CSV3_SHIFT);

From ec89ab50a03a33a4a648869e868b1964354fb2d1 Mon Sep 17 00:00:00 2001
From: Steve Capper <steve.capper@arm.com>
Date: Wed, 24 Jan 2018 08:27:08 +0000
Subject: [PATCH 104/104] arm64: Fix TTBR + PAN + 52-bit PA logic in
 cpu_do_switch_mm

In cpu_do_switch_mm(.) with ARM64_SW_TTBR0_PAN=y we apply phys_to_ttbr
to a value that already has an ASID inserted into the upper bits. For
52-bit PA configurations this then can give us TTBR0_EL1 registers that
cause translation table walks to attempt to access non-zero PA[51:48]
spuriously. Ultimately leading to a Synchronous External Abort on level
1 translation.

This patch re-arranges the logic in cpu_do_switch_mm(.) such that
phys_to_ttbr is called before the ASID is inserted into the TTBR0 value.

Fixes: 6b88a32c7af6 ("arm64: kpti: Fix the interaction between ASID switching and software PAN")
Acked-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Tested-by: Kristina Martsenko <kristina.martsenko@arm.com>
Reviewed-by: Kristina Martsenko <kristina.martsenko@arm.com>
Signed-off-by: Steve Capper <steve.capper@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/mm/proc.S | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index c6a12073ef46..9f177aac6390 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -153,14 +153,14 @@ ENDPROC(cpu_do_resume)
 ENTRY(cpu_do_switch_mm)
 	mrs	x2, ttbr1_el1
 	mmid	x1, x1				// get mm->context.id
+	phys_to_ttbr x0, x3
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
-	bfi	x0, x1, #48, #16		// set the ASID field in TTBR0
+	bfi	x3, x1, #48, #16		// set the ASID field in TTBR0
 #endif
 	bfi	x2, x1, #48, #16		// set the ASID
 	msr	ttbr1_el1, x2			// in TTBR1 (since TCR.A1 is set)
 	isb
-	phys_to_ttbr x0, x2
-	msr	ttbr0_el1, x2			// now update TTBR0
+	msr	ttbr0_el1, x3			// now update TTBR0
 	isb
 	b	post_ttbr_update_workaround	// Back to C code...
 ENDPROC(cpu_do_switch_mm)