powerpc updates for 5.4

- Initial support for running on a system with an Ultravisor, which is software that runs below the hypervisor and protects guests against some attacks by the hypervisor. - Support for building the kernel to run as a "Secure Virtual Machine", ie. as a guest capable of running on a system with an Ultravisor. - Some changes to our DMA code on bare metal, to allow devices with medium sized DMA masks (> 32 && < 59 bits) to use more than 2GB of DMA space. - Support for firmware assisted crash dumps on bare metal (powernv). - Two series fixing bugs in and refactoring our PCI EEH code. - A large series refactoring our exception entry code to use gas macros, both to make it more readable and also enable some future optimisations. As well as many cleanups and other minor features & fixups. Thanks to: Adam Zerella, Alexey Kardashevskiy, Alistair Popple, Andrew Donnellan, Aneesh Kumar K.V, Anju T Sudhakar, Anshuman Khandual, Balbir Singh, Benjamin Herrenschmidt, Cédric Le Goater, Christophe JAILLET, Christophe Leroy, Christopher M. Riedl, Christoph Hellwig, Claudio Carvalho, Daniel Axtens, David Gibson, David Hildenbrand, Desnes A. Nunes do Rosario, Ganesh Goudar, Gautham R. Shenoy, Greg Kurz, Guerney Hunt, Gustavo Romero, Halil Pasic, Hari Bathini, Joakim Tjernlund, Jonathan Neuschafer, Jordan Niethe, Leonardo Bras, Lianbo Jiang, Madhavan Srinivasan, Mahesh Salgaonkar, Mahesh Salgaonkar, Masahiro Yamada, Maxiwell S. Garcia, Michael Anderson, Nathan Chancellor, Nathan Lynch, Naveen N. Rao, Nicholas Piggin, Oliver O'Halloran, Qian Cai, Ram Pai, Ravi Bangoria, Reza Arbab, Ryan Grimm, Sam Bobroff, Santosh Sivaraj, Segher Boessenkool, Sukadev Bhattiprolu, Thiago Bauermann, Thiago Jung Bauermann, Thomas Gleixner, Tom Lendacky, Vasant Hegde. 
-----BEGIN PGP SIGNATURE----- iQJHBAABCAAxFiEEJFGtCPCthwEv2Y/bUevqPMjhpYAFAl2EtEcTHG1wZUBlbGxl cm1hbi5pZC5hdQAKCRBR6+o8yOGlgPfsD/9uXyBXn3anI/H08+mk74k5gCsmMQpn D442CD/ByogZcccp23yBTlhawtCE03hcHnCLygn0Xgd8a4YvHts/RGHUe3fPHqlG bEyZ7jsLVz5ebNZQP7r4eGs2pSzCajwJy2N9HJ/C1ojf15rrfRxoVJtnyhE2wXpm DL+6o2K+nUCB3gTQ1Inr3DnWzoGOOUfNTOea2u+J+yfHwGRqOBYpevwqiwy5eelK aRjUJCqMTvrzra49MeFwjo0Nt3/Y8UNcwA+JlGdeR8bRuWhFrYmyBRiZEKPaujNO 5EAfghBBlB0KQCqvF/tRM/c0OftHqK59AMobP9T7u9oOaBXeF/FpZX/iXjzNDPsN j9Oo2tKLTu/YVEXqBFuREGP+znANr1Wo4CFyOG8SbvYz0HFjR6XbtRJsS+0e8GWl kqX5/ZhYz3lBnKSNe9jgWOrh/J0KCSFigBTEWJT3xsn4YE8x8kK2l9KPqAIldWEP sKb2UjGS7v0NKq+NvShH88Q9AeQUEIjTcg/9aDDQDe6FaRQ7KiF8bUxSdwSPi+Fn j0lnF6i+1ATWZKuCr85veVi7C5qoe/+MqalnmP7MxULyzgXLLxUgN0SzEYO6QofK LQK/VaH2XVr5+M5YAb7K4/NX5gbM3s1bKrCiUy4EyHNvgG7gricYdbz6HgAjKpR7 oP0rHfgmVYvF1g== =WlW+ -----END PGP SIGNATURE----- Merge tag 'powerpc-5.4-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux Pull powerpc updates from Michael Ellerman: "This is a bit late, partly due to me travelling, and partly due to a power outage knocking out some of my test systems *while* I was travelling. - Initial support for running on a system with an Ultravisor, which is software that runs below the hypervisor and protects guests against some attacks by the hypervisor. - Support for building the kernel to run as a "Secure Virtual Machine", ie. as a guest capable of running on a system with an Ultravisor. - Some changes to our DMA code on bare metal, to allow devices with medium sized DMA masks (> 32 && < 59 bits) to use more than 2GB of DMA space. - Support for firmware assisted crash dumps on bare metal (powernv). - Two series fixing bugs in and refactoring our PCI EEH code. - A large series refactoring our exception entry code to use gas macros, both to make it more readable and also enable some future optimisations. As well as many cleanups and other minor features & fixups. 
Thanks to: Adam Zerella, Alexey Kardashevskiy, Alistair Popple, Andrew Donnellan, Aneesh Kumar K.V, Anju T Sudhakar, Anshuman Khandual, Balbir Singh, Benjamin Herrenschmidt, Cédric Le Goater, Christophe JAILLET, Christophe Leroy, Christopher M. Riedl, Christoph Hellwig, Claudio Carvalho, Daniel Axtens, David Gibson, David Hildenbrand, Desnes A. Nunes do Rosario, Ganesh Goudar, Gautham R. Shenoy, Greg Kurz, Guerney Hunt, Gustavo Romero, Halil Pasic, Hari Bathini, Joakim Tjernlund, Jonathan Neuschafer, Jordan Niethe, Leonardo Bras, Lianbo Jiang, Madhavan Srinivasan, Mahesh Salgaonkar, Mahesh Salgaonkar, Masahiro Yamada, Maxiwell S. Garcia, Michael Anderson, Nathan Chancellor, Nathan Lynch, Naveen N. Rao, Nicholas Piggin, Oliver O'Halloran, Qian Cai, Ram Pai, Ravi Bangoria, Reza Arbab, Ryan Grimm, Sam Bobroff, Santosh Sivaraj, Segher Boessenkool, Sukadev Bhattiprolu, Thiago Bauermann, Thiago Jung Bauermann, Thomas Gleixner, Tom Lendacky, Vasant Hegde" * tag 'powerpc-5.4-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (264 commits) powerpc/mm/mce: Keep irqs disabled during lockless page table walk powerpc: Use ftrace_graph_ret_addr() when unwinding powerpc/ftrace: Enable HAVE_FUNCTION_GRAPH_RET_ADDR_PTR ftrace: Look up the address of return_to_handler() using helpers powerpc: dump kernel log before carrying out fadump or kdump docs: powerpc: Add missing documentation reference powerpc/xmon: Fix output of XIVE IPI powerpc/xmon: Improve output of XIVE interrupts powerpc/mm/radix: remove useless kernel messages powerpc/fadump: support holes in kernel boot memory area powerpc/fadump: remove RMA_START and RMA_END macros powerpc/fadump: update documentation about option to release opalcore powerpc/fadump: consider f/w load area powerpc/opalcore: provide an option to invalidate /sys/firmware/opal/core file powerpc/opalcore: export /sys/firmware/opal/core for analysing opal crashes powerpc/fadump: update documentation about CONFIG_PRESERVE_FA_DUMP 
powerpc/fadump: add support to preserve crash data on FADUMP disabled kernel powerpc/fadump: improve how crashed kernel's memory is reserved powerpc/fadump: consider reserved ranges while releasing memory powerpc/fadump: make crash memory ranges array allocation generic ...
2019-09-20 11:48:06 -07:00 · 2019-09-20 11:48:06 -07:00 · 45824fc0da
parent 8c2b418c3f d9101bfa6a
commit 45824fc0da
247 changed files with 9763 additions and 5475 deletions
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@ -562,3 +562,13 @@ Description:	Umwait control
 			  or C0.2 state. The time is an unsigned 32-bit number.
 			  Note that a value of zero means there is no limit.
 			  Low order two bits must be zero.
+
+What:		/sys/devices/system/cpu/svm
+Date:		August 2019
+Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
+		Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
+Description:	Secure Virtual Machine
+
+		If 1, it means the system is using the Protected Execution
+		Facility in POWER9 and newer processors. i.e., it is a Secure
+		Virtual Machine.
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@ -860,6 +860,10 @@
 	disable_radix	[PPC]
 			Disable RADIX MMU mode on POWER9

+	disable_tlbie	[PPC]
+			Disable TLBIE instruction. Currently does not work
+			with KVM, with HASH MMU, or with coherent accelerators.
+
 	disable_cpu_apicid= [X86,APIC,SMP]
 			Format: <int>
 			The number of initial APIC ID for the
@ -4641,6 +4645,11 @@
 			/sys/power/pm_test). Only available when CONFIG_PM_DEBUG
 			is set. Default value is 5.

+	svm=		[PPC]
+			Format: { on | off | y | n | 1 | 0 }
+			This parameter controls use of the Protected
+			Execution Facility on pSeries.
+
 	swapaccount=[0|1]
 			[KNL] Enable accounting of swap in memory resource
 			controller if no parameter or 1 is given or disable
@ -5326,3 +5335,22 @@
 			A hex value specifying bitmask with supplemental xhci
 			host controller quirks. Meaning of each bit can be
 			consulted in header drivers/usb/host/xhci.h.
+
+	xmon		[PPC]
+			Format: { early | on | rw | ro | off }
+			Controls if xmon debugger is enabled. Default is off.
+			Passing only "xmon" is equivalent to "xmon=early".
+			early	Call xmon as early as possible on boot; xmon
+				debugger is called from setup_arch().
+			on	xmon debugger hooks will be installed so xmon
+				is only called on a kernel crash. Default mode,
+				i.e. either "ro" or "rw" mode, is controlled
+				with CONFIG_XMON_DEFAULT_RO_MODE.
+			rw	xmon debugger hooks will be installed so xmon
+				is called only on a kernel crash, mode is write,
+				meaning SPR registers, memory, and other data
+				can be written using xmon commands.
+			ro 	same as "rw" option above but SPR registers,
+				memory, and other data can't be written using
+				xmon commands.
+			off	xmon is disabled.
--- a/Documentation/powerpc/elfnote.rst
+++ b/Documentation/powerpc/elfnote.rst
@ -0,0 +1,41 @@
+==========================
+ELF Note PowerPC Namespace
+==========================
+
+The PowerPC namespace in an ELF Note of the kernel binary is used to store
+capabilities and information which can be used by a bootloader or userland.
+
+Types and Descriptors
+---------------------
+
+The types to be used with the "PowerPC" namespace are defined in [#f1]_.
+
+	1) PPC_ELFNOTE_CAPABILITIES
+
+Define the capabilities supported/required by the kernel. This type uses a
+bitmap as "descriptor" field. Each bit is described below:
+
+- Ultravisor-capable bit (PowerNV only).
+
+.. code-block:: c
+
+	#define PPCCAP_ULTRAVISOR_BIT (1 << 0)
+
+Indicate that the powerpc kernel binary knows how to run in an
+ultravisor-enabled system.
+
+In an ultravisor-enabled system, some machine resources are now controlled
+by the ultravisor. If the kernel is not ultravisor-capable, but it ends up
+being run on a machine with ultravisor, the kernel will probably crash
+trying to access ultravisor resources. For instance, it may crash in early
+boot trying to set the partition table entry 0.
+
+In an ultravisor-enabled system, a bootloader could warn the user or prevent
+the kernel from being run if the PowerPC ultravisor capability doesn't exist
+or the Ultravisor-capable bit is not set.
+
+References
+----------
+
+.. [#f1] arch/powerpc/include/asm/elfnote.h
+
--- a/Documentation/powerpc/firmware-assisted-dump.rst
+++ b/Documentation/powerpc/firmware-assisted-dump.rst
@ -9,18 +9,18 @@ a crashed system, and to do so from a fully-reset system, and
 to minimize the total elapsed time until the system is back
 in production use.

- Firmware assisted dump (fadump) infrastructure is intended to replace
+- Firmware-Assisted Dump (FADump) infrastructure is intended to replace
  the existing phyp assisted dump.
 - Fadump uses the same firmware interfaces and memory reservation model
  as phyp assisted dump.
- Unlike phyp dump, fadump exports the memory dump through /proc/vmcore
+- Unlike phyp dump, FADump exports the memory dump through /proc/vmcore
  in the ELF format in the same way as kdump. This helps us reuse the
  kdump infrastructure for dump capture and filtering.
 - Unlike phyp dump, userspace tool does not need to refer any sysfs
  interface while reading /proc/vmcore.
- Unlike phyp dump, fadump allows user to release all the memory reserved
+- Unlike phyp dump, FADump allows user to release all the memory reserved
  for dump, with a single operation of echo 1 > /sys/kernel/fadump_release_mem.
- Once enabled through kernel boot parameter, fadump can be
+- Once enabled through kernel boot parameter, FADump can be
  started/stopped through /sys/kernel/fadump_registered interface (see
  sysfs files section below) and can be easily integrated with kdump
  service start/stop init scripts.
@ -34,7 +34,7 @@ dump offers several strong, practical advantages:
   in a clean, consistent state.
 -  Once the dump is copied out, the memory that held the dump
   is immediately available to the running kernel. And therefore,
-   unlike kdump, fadump doesn't need a 2nd reboot to get back
+   unlike kdump, FADump doesn't need a 2nd reboot to get back
   the system to the production configuration.

 The above can only be accomplished by coordination with,
@ -46,10 +46,9 @@ as follows:
   These registered sections of memory are reserved by the first
   kernel during early boot.

-  When a system crashes, the Power firmware will save
-   the low memory (boot memory of size larger of 5% of system RAM
-   or 256MB) of RAM to the previous registered region. It will
-   also save system registers, and hardware PTE's.
+-  When system crashes, the Power firmware will copy the registered
+   low memory regions (boot memory) from source to destination area.
+   It will also save hardware PTE's.

   NOTE:
         The term 'boot memory' means size of the low memory chunk
@ -61,9 +60,9 @@ as follows:
         the default calculated size. Use this option if default
         boot memory size is not sufficient for second kernel to
         boot successfully. For syntax of crashkernel= parameter,
-         refer to Documentation/admin-guide/kdump/kdump.rst. If any offset is
-         provided in crashkernel= parameter, it will be ignored
-         as fadump uses a predefined offset to reserve memory
+         refer to Documentation/admin-guide/kdump/kdump.rst. If any
+         offset is provided in crashkernel= parameter, it will be
+         ignored as FADump uses a predefined offset to reserve memory
         for boot memory dump preservation in case of a crash.

 -  After the low memory (boot memory) area has been saved, the
@ -71,13 +70,15 @@ as follows:
   *not* clear the RAM. It will then launch the bootloader, as
   normal.

-  The freshly booted kernel will notice that there is a new
-   node (ibm,dump-kernel) in the device tree, indicating that
+-  The freshly booted kernel will notice that there is a new node
+   (rtas/ibm,kernel-dump on pSeries or ibm,opal/dump/mpipl-boot
+   on OPAL platform) in the device tree, indicating that
   there is crash data available from a previous boot. During
   the early boot OS will reserve rest of the memory above
   boot memory size effectively booting with restricted memory
-   size. This will make sure that the second kernel will not
-   touch any of the dump memory area.
+   size. This will make sure that this kernel (also, referred
+   to as second kernel or capture kernel) will not touch any
+   of the dump memory area.

 -  User-space tools will read /proc/vmcore to obtain the contents
   of memory, which holds the previous crashed kernel dump in ELF
@ -94,8 +95,30 @@ as follows:
     # echo 1 > /sys/kernel/fadump_release_mem

 Please note that the firmware-assisted dump feature
-is only available on Power6 and above systems with recent
-firmware versions.
+is only available on POWER6 and above systems on pSeries
+(PowerVM) platform and POWER9 and above systems with OP940
+or later firmware versions on PowerNV (OPAL) platform.
+Note that, OPAL firmware exports ibm,opal/dump node when
+FADump is supported on PowerNV platform.
+
+On OPAL based machines, system first boots into an intermediate
+kernel (referred to as petitboot kernel) before booting into the
+capture kernel. This kernel would have minimal kernel and/or
+userspace support to process crash data. Such kernel needs to
+preserve previously crash'ed kernel's memory for the subsequent
+capture kernel boot to process this crash data. Kernel config
+option CONFIG_PRESERVE_FA_DUMP has to be enabled on such kernel
+to ensure that crash data is preserved to process later.
+
+-- On OPAL based machines (PowerNV), if the kernel is built with
+   CONFIG_OPAL_CORE=y, OPAL memory at the time of crash is also
+   exported as /sys/firmware/opal/core file. This sysfs file is
+   helpful in debugging OPAL crashes with GDB. The kernel memory
+   used for exporting this sysfs file can be released by echo'ing
+   '1' to /sys/kernel/fadump_release_opalcore node.
+
+   e.g.
+     # echo 1 > /sys/kernel/fadump_release_opalcore

 Implementation details:
 -----------------------
@ -110,72 +133,95 @@ that are run. If there is dump data, then the
 /sys/kernel/fadump_release_mem file is created, and the reserved
 memory is held.

-If there is no waiting dump data, then only the memory required
-to hold CPU state, HPTE region, boot memory dump and elfcore
-header, is usually reserved at an offset greater than boot memory
-size (see Fig. 1). This area is *not* released: this region will
-be kept permanently reserved, so that it can act as a receptacle
-for a copy of the boot memory content in addition to CPU state
-and HPTE region, in the case a crash does occur. Since this reserved
-memory area is used only after the system crash, there is no point in
-blocking this significant chunk of memory from production kernel.
-Hence, the implementation uses the Linux kernel's Contiguous Memory
-Allocator (CMA) for memory reservation if CMA is configured for kernel.
-With CMA reservation this memory will be available for applications to
-use it, while kernel is prevented from using it. With this fadump will
-still be able to capture all of the kernel memory and most of the user
-space memory except the user pages that were present in CMA region::
+If there is no waiting dump data, then only the memory required to
+hold CPU state, HPTE region, boot memory dump, FADump header and
+elfcore header, is usually reserved at an offset greater than boot
+memory size (see Fig. 1). This area is *not* released: this region
+will be kept permanently reserved, so that it can act as a receptacle
+for a copy of the boot memory content in addition to CPU state and
+HPTE region, in the case a crash does occur.
+
+Since this reserved memory area is used only after the system crash,
+there is no point in blocking this significant chunk of memory from
+production kernel. Hence, the implementation uses the Linux kernel's
+Contiguous Memory Allocator (CMA) for memory reservation if CMA is
+configured for kernel. With CMA reservation this memory will be
+available for applications to use it, while kernel is prevented from
+using it. With this FADump will still be able to capture all of the
+kernel memory and most of the user space memory except the user pages
+that were present in CMA region::

  o Memory Reservation during first kernel

-  Low memory                                         Top of memory
-  0      boot memory size                                       |
-  |           |                |<--Reserved dump area -->|      |
-  V           V                |   Permanent Reservation |      V
-  +-----------+----------/ /---+---+----+-----------+----+------+
-  |           |                |CPU|HPTE|  DUMP     |ELF |      |
-  +-----------+----------/ /---+---+----+-----------+----+------+
-        |                                           ^
-        |                                           |
-        \                                           /
-         -------------------------------------------
-          Boot memory content gets transferred to
-          reserved area by firmware at the time of
-          crash
+  Low memory                                                 Top of memory
+  0    boot memory size   |<--- Reserved dump area --->|       |
+  |           |           |    Permanent Reservation   |       |
+  V           V           |                            |       V
+  +-----------+-----/ /---+---+----+-------+-----+-----+----+--+
+  |           |           |///|////|  DUMP | HDR | ELF |////|  |
+  +-----------+-----/ /---+---+----+-------+-----+-----+----+--+
+        |                   ^    ^     ^      ^           ^
+        |                   |    |     |      |           |
+        \                  CPU  HPTE   /      |           |
+         ------------------------------       |           |
+      Boot memory content gets transferred    |           |
+      to reserved area by firmware at the     |           |
+      time of crash.                          |           |
+                                          FADump Header   |
+                                           (meta area)    |
+                                                          |
+                                                          |
+                      Metadata: This area holds a metadata structure whose
+                      address is registered with f/w and retrieved in the
+                      second kernel after crash, on platforms that support
+                      tags (OPAL). Having such structure with info needed
+                      to process the crashdump eases dump capture process.
+
                   Fig. 1

+
  o Memory Reservation during second kernel after crash

-  Low memory                                        Top of memory
-  0      boot memory size                                       |
-  |           |<------------- Reserved dump area ----------- -->|
-  V           V                                                 V
-  +-----------+----------/ /---+---+----+-----------+----+------+
-  |           |                |CPU|HPTE|  DUMP     |ELF |      |
-  +-----------+----------/ /---+---+----+-----------+----+------+
-        |                                              |
-        V                                              V
-   Used by second                                /proc/vmcore
+  Low memory                                              Top of memory
+  0      boot memory size                                      |
+  |           |<------------ Crash preserved area ------------>|
+  V           V           |<--- Reserved dump area --->|       |
+  +-----------+-----/ /---+---+----+-------+-----+-----+----+--+
+  |           |           |///|////|  DUMP | HDR | ELF |////|  |
+  +-----------+-----/ /---+---+----+-------+-----+-----+----+--+
+        |                                           |
+        V                                           V
+   Used by second                             /proc/vmcore
   kernel to boot
+
+        +---+
+        |///| -> Regions (CPU, HPTE & Metadata) marked like this in the above
+        +---+    figures are not always present. For example, OPAL platform
+                 does not have CPU & HPTE regions while Metadata region is
+                 not supported on pSeries currently.
+
                   Fig. 2

-Currently the dump will be copied from /proc/vmcore to a
-a new file upon user intervention. The dump data available through
-/proc/vmcore will be in ELF format. Hence the existing kdump
-infrastructure (kdump scripts) to save the dump works fine with
-minor modifications.
+
+Currently the dump will be copied from /proc/vmcore to a new file upon
+user intervention. The dump data available through /proc/vmcore will be
+in ELF format. Hence the existing kdump infrastructure (kdump scripts)
+to save the dump works fine with minor modifications. KDump scripts on
+major Distro releases have already been modified to work seamlessly (no
+user intervention in saving the dump) when FADump is used, instead of
+KDump, as dump mechanism.

 The tools to examine the dump will be same as the ones
 used for kdump.

-How to enable firmware-assisted dump (fadump):
+How to enable firmware-assisted dump (FADump):
 ----------------------------------------------

 1. Set config option CONFIG_FA_DUMP=y and build kernel.
 2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
-   By default, fadump reserved memory will be initialized as CMA area.
+   By default, FADump reserved memory will be initialized as CMA area.
   Alternatively, user can boot linux kernel with 'fadump=nocma' to
-   prevent fadump to use CMA.
+   prevent FADump from using CMA.
 3. Optionally, user can also set 'crashkernel=' kernel cmdline
   to specify size of the memory to reserve for boot memory dump
   preservation.
@ -201,29 +247,29 @@ the control files and debugfs file to display memory reserved region.
 Here is the list of files under kernel sysfs:

 /sys/kernel/fadump_enabled
-    This is used to display the fadump status.
+    This is used to display the FADump status.

-    - 0 = fadump is disabled
-    - 1 = fadump is enabled
+    - 0 = FADump is disabled
+    - 1 = FADump is enabled

    This interface can be used by kdump init scripts to identify if
-    fadump is enabled in the kernel and act accordingly.
+    FADump is enabled in the kernel and act accordingly.

 /sys/kernel/fadump_registered
-    This is used to display the fadump registration status as well
-    as to control (start/stop) the fadump registration.
+    This is used to display the FADump registration status as well
+    as to control (start/stop) the FADump registration.

-    - 0 = fadump is not registered.
-    - 1 = fadump is registered and ready to handle system crash.
+    - 0 = FADump is not registered.
+    - 1 = FADump is registered and ready to handle system crash.

-    To register fadump echo 1 > /sys/kernel/fadump_registered and
+    To register FADump echo 1 > /sys/kernel/fadump_registered and
    echo 0 > /sys/kernel/fadump_registered for un-register and stop the
-    fadump. Once the fadump is un-registered, the system crash will not
+    FADump. Once the FADump is un-registered, the system crash will not
    be handled and vmcore will not be captured. This interface can be
    easily integrated with kdump service start/stop.

 /sys/kernel/fadump_release_mem
-    This file is available only when fadump is active during
+    This file is available only when FADump is active during
    second kernel. This is used to release the reserved memory
    region that are held for saving crash dump. To release the
    reserved memory echo 1 to it::
@ -237,25 +283,38 @@ Here is the list of files under kernel sysfs:
    enhanced to use this interface to release the memory reserved for
    dump and continue without 2nd reboot.

+ /sys/kernel/fadump_release_opalcore
+
+    This file is available only on OPAL based machines when FADump is
+    active during capture kernel. This is used to release the memory
+    used by the kernel to export /sys/firmware/opal/core file. To
+    release this memory, echo '1' to it::
+
+    echo 1  > /sys/kernel/fadump_release_opalcore
+
 Here is the list of files under powerpc debugfs:
 (Assuming debugfs is mounted on /sys/kernel/debug directory.)

 /sys/kernel/debug/powerpc/fadump_region
-    This file shows the reserved memory regions if fadump is
+    This file shows the reserved memory regions if FADump is
    enabled otherwise this file is empty. The output format
    is::

      <region>: [<start>-<end>] <reserved-size> bytes, Dumped: <dump-size>

+    and for kernel DUMP region is::
+
+    DUMP: Src: <src-addr>, Dest: <dest-addr>, Size: <size>, Dumped: # bytes
+
    e.g.
-    Contents when fadump is registered during first kernel::
+    Contents when FADump is registered during first kernel::

      # cat /sys/kernel/debug/powerpc/fadump_region
      CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x0
      HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x0
      DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x0

-    Contents when fadump is active during second kernel::
+    Contents when FADump is active during second kernel::

      # cat /sys/kernel/debug/powerpc/fadump_region
      CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x40020
@ -263,6 +322,7 @@ Here is the list of files under powerpc debugfs:
      DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x10000000
          : [0x00000010000000-0x0000006ffaffff] 0x5ffb0000 bytes, Dumped: 0x5ffb0000

+
 NOTE:
      Please refer to Documentation/filesystems/debugfs.txt on
      how to mount the debugfs filesystem.
@ -273,7 +333,7 @@ TODO:
 - Need to come up with the better approach to find out more
   accurate boot memory size that is required for a kernel to
   boot successfully when booted with restricted memory.
- - The fadump implementation introduces a fadump crash info structure
+ - The FADump implementation introduces a FADump crash info structure
   in the scratch area before the ELF core header. The idea of introducing
   this structure is to pass some important crash info data to the second
   kernel which will help second kernel to populate ELF core header with
--- a/Documentation/powerpc/index.rst
+++ b/Documentation/powerpc/index.rst
@ -15,6 +15,7 @@ powerpc
    dawr-power9
    dscr
    eeh-pci-error-recovery
+    elfnote
    firmware-assisted-dump
    hvcs
    isa-versions
@ -25,6 +26,7 @@ powerpc
    qe_firmware
    syscall64-abi
    transactional_memory
+    ultravisor

 .. only::  subproject and html

--- a/Documentation/powerpc/ultravisor.rst
+++ b/Documentation/powerpc/ultravisor.rst
--- a/arch/Kconfig
+++ b/arch/Kconfig
@ -946,6 +946,9 @@ config RELR
 	  well as compatible NM and OBJCOPY utilities (llvm-nm and llvm-objcopy
 	  are compatible).

+config ARCH_HAS_MEM_ENCRYPT
+	bool
+
 source "kernel/gcov/Kconfig"

 source "scripts/gcc-plugins/Kconfig"
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@ -128,14 +128,15 @@ config PPC
 	select ARCH_HAS_HUGEPD			if HUGETLB_PAGE
 	select ARCH_HAS_MMIOWB			if PPC64
 	select ARCH_HAS_PHYS_TO_DMA
-	select ARCH_HAS_PMEM_API                if PPC64
+	select ARCH_HAS_PMEM_API
 	select ARCH_HAS_PTE_DEVMAP		if PPC_BOOK3S_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_MEMBARRIER_CALLBACKS
-	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC64
+	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64
 	select ARCH_HAS_STRICT_KERNEL_RWX	if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION)
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
-	select ARCH_HAS_UACCESS_FLUSHCACHE	if PPC64
+	select ARCH_HAS_UACCESS_FLUSHCACHE
+	select ARCH_HAS_UACCESS_MCSAFE		if PPC64
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_KEEP_MEMBLOCK
@ -183,6 +184,7 @@ config PPC
 	select HAVE_STACKPROTECTOR		if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13)
 	select HAVE_STACKPROTECTOR		if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2)
 	select HAVE_CONTEXT_TRACKING		if PPC64
+	select HAVE_COPY_THREAD_TLS
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DEBUG_STACKOVERFLOW
 	select HAVE_DYNAMIC_FTRACE
@ -568,7 +570,7 @@ config CRASH_DUMP

 config FA_DUMP
 	bool "Firmware-assisted dump"
-	depends on PPC64 && PPC_RTAS
+	depends on PPC64 && (PPC_RTAS || PPC_POWERNV)
 	select CRASH_CORE
 	select CRASH_DUMP
 	help
@ -579,7 +581,26 @@ config FA_DUMP
 	  is meant to be a kdump replacement offering robustness and
 	  speed not possible without system firmware assistance.

-	  If unsure, say "N"
+	  If unsure, say "y". Only special kernels like petitboot may
+	  need to say "N" here.
+
+config PRESERVE_FA_DUMP
+	bool "Preserve Firmware-assisted dump"
+	depends on PPC64 && PPC_POWERNV && !FA_DUMP
+	help
+	  On a kernel with FA_DUMP disabled, this option helps to preserve
+	  crash data from a previously crash'ed kernel. Useful when the next
+	  memory preserving kernel boot would process this crash data.
+	  Petitboot kernel is the typical usecase for this option.
+
+config OPAL_CORE
+	bool "Export OPAL memory as /sys/firmware/opal/core"
+	depends on PPC64 && PPC_POWERNV
+	help
+	  This option uses the MPIPL support in firmware to provide an
+	  ELF core of OPAL memory after a crash. The ELF core is exported
+	  as /sys/firmware/opal/core file which is helpful in debugging
+	  OPAL crashes using GDB.

 config IRQ_ALL_CPUS
 	bool "Distribute interrupts on all CPUs by default"
@ -1140,18 +1161,6 @@ config TASK_SIZE
 	default "0x80000000" if PPC_8xx
 	default "0xc0000000"

-config CONSISTENT_SIZE_BOOL
-	bool "Set custom consistent memory pool size"
-	depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE
-	help
-	  This option allows you to set the size of the
-	  consistent memory pool.  This pool of virtual memory
-	  is used to make consistent memory allocations.
-
-config CONSISTENT_SIZE
-	hex "Size of consistent memory pool" if CONSISTENT_SIZE_BOOL
-	default "0x00200000" if NOT_COHERENT_CACHE
-
 config PIN_TLB
 	bool "Pinned Kernel TLBs (860 ONLY)"
 	depends on ADVANCED_OPTIONS && PPC_8xx && \
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@ -110,7 +110,6 @@ ifeq ($(HAS_BIARCH),y)
 KBUILD_CFLAGS	+= -m$(BITS)
 KBUILD_AFLAGS	+= -m$(BITS) -Wl,-a$(BITS)
 KBUILD_LDFLAGS	+= -m elf$(BITS)$(LDEMULATION)
-KBUILD_ARFLAGS	+= --target=elf$(BITS)-$(GNUTARGET)
 endif

 cflags-$(CONFIG_STACKPROTECTOR)	+= -mstack-protector-guard=tls
--- a/arch/powerpc/boot/main.c
+++ b/arch/powerpc/boot/main.c
@ -146,6 +146,46 @@ static struct addr_range prep_initrd(struct addr_range vmlinux, void *chosen,
 	return (struct addr_range){(void *)initrd_addr, initrd_size};
 }

+#ifdef __powerpc64__
+static void prep_esm_blob(struct addr_range vmlinux, void *chosen)
+{
+	unsigned long esm_blob_addr, esm_blob_size;
+
+	/* Do we have an ESM (Enter Secure Mode) blob? */
+	if (_esm_blob_end <= _esm_blob_start)
+		return;
+
+	printf("Attached ESM blob at 0x%p-0x%p\n\r",
+	       _esm_blob_start, _esm_blob_end);
+	esm_blob_addr = (unsigned long)_esm_blob_start;
+	esm_blob_size = _esm_blob_end - _esm_blob_start;
+
+	/*
+	 * If the ESM blob is too low it will be clobbered when the
+	 * kernel relocates to its final location.  In this case,
+	 * allocate a safer place and move it.
+	 */
+	if (esm_blob_addr < vmlinux.size) {
+		void *old_addr = (void *)esm_blob_addr;
+
+		printf("Allocating 0x%lx bytes for esm_blob ...\n\r",
+		       esm_blob_size);
+		esm_blob_addr = (unsigned long)malloc(esm_blob_size);
+		if (!esm_blob_addr)
+			fatal("Can't allocate memory for ESM blob !\n\r");
+		printf("Relocating ESM blob 0x%lx <- 0x%p (0x%lx bytes)\n\r",
+		       esm_blob_addr, old_addr, esm_blob_size);
+		memmove((void *)esm_blob_addr, old_addr, esm_blob_size);
+	}
+
+	/* Tell the kernel ESM blob address via device tree. */
+	setprop_val(chosen, "linux,esm-blob-start", (u32)(esm_blob_addr));
+	setprop_val(chosen, "linux,esm-blob-end", (u32)(esm_blob_addr + esm_blob_size));
+}
+#else
+static inline void prep_esm_blob(struct addr_range vmlinux, void *chosen) { }
+#endif
+
 /* A buffer that may be edited by tools operating on a zImage binary so as to
 * edit the command line passed to vmlinux (by setting /chosen/bootargs).
 * The buffer is put in it's own section so that tools may locate it easier.
@ -214,6 +254,7 @@ void start(void)
 	vmlinux = prep_kernel();
 	initrd = prep_initrd(vmlinux, chosen,
 			     loader_info.initrd_addr, loader_info.initrd_size);
+	prep_esm_blob(vmlinux, chosen);
 	prep_cmdline(chosen);

 	printf("Finalizing device tree...");
--- a/arch/powerpc/boot/ops.h
+++ b/arch/powerpc/boot/ops.h
@ -251,6 +251,8 @@ extern char _initrd_start[];
 extern char _initrd_end[];
 extern char _dtb_start[];
 extern char _dtb_end[];
+extern char _esm_blob_start[];
+extern char _esm_blob_end[];

 static inline __attribute__((const))
 int __ilog2_u32(u32 n)
--- a/arch/powerpc/boot/wrapper
+++ b/arch/powerpc/boot/wrapper
@ -13,6 +13,7 @@
 # -i initrd	specify initrd file
 # -d devtree	specify device-tree blob
 # -s tree.dts	specify device-tree source file (needs dtc installed)
+# -e esm_blob   specify ESM blob for secure images
 # -c		cache $kernel.strip.gz (use if present & newer, else make)
 # -C prefix	specify command prefix for cross-building tools
 #		(strip, objcopy, ld)
@ -37,6 +38,7 @@ platform=of
 initrd=
 dtb=
 dts=
+esm_blob=
 cacheit=
 binary=
 compression=.gz
@ -60,9 +62,9 @@ tmpdir=.

 usage() {
    echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2
-    echo '       [-d devtree] [-s tree.dts] [-c] [-C cross-prefix]' >&2
-    echo '       [-D datadir] [-W workingdir] [-Z (gz|xz|none)]' >&2
-    echo '       [--no-compression] [vmlinux]' >&2
+    echo '       [-d devtree] [-s tree.dts] [-e esm_blob]' >&2
+    echo '       [-c] [-C cross-prefix] [-D datadir] [-W workingdir]' >&2
+    echo '       [-Z (gz|xz|none)] [--no-compression] [vmlinux]' >&2
    exit 1
 }

@ -105,6 +107,11 @@ while [ "$#" -gt 0 ]; do
 	[ "$#" -gt 0 ] || usage
 	dtb="$1"
 	;;
+    -e)
+	shift
+	[ "$#" -gt 0 ] || usage
+	esm_blob="$1"
+	;;
    -s)
 	shift
 	[ "$#" -gt 0 ] || usage
@ -218,9 +225,16 @@ objflags=-S
 tmp=$tmpdir/zImage.$$.o
 ksection=.kernel:vmlinux.strip
 isection=.kernel:initrd
+esection=.kernel:esm_blob
 link_address='0x400000'
 make_space=y

+
+if [ -n "$esm_blob" -a "$platform" != "pseries" ]; then
+    echo "ESM blob not support on non-pseries platforms" >&2
+    exit 1
+fi
+
 case "$platform" in
 of)
    platformo="$object/of.o $object/epapr.o"
@ -477,6 +491,10 @@ if [ -n "$dtb" ]; then
    fi
 fi

+if [ -n "$esm_blob" ]; then
+    addsec $tmp "$esm_blob" $esection
+fi
+
 if [ "$platform" != "miboot" ]; then
    if [ -n "$link_address" ] ; then
        text_start="-Ttext $link_address"
--- a/arch/powerpc/boot/zImage.lds.S
+++ b/arch/powerpc/boot/zImage.lds.S
@ -68,6 +68,14 @@ SECTIONS
    _initrd_end =  .;
  }

+  . = ALIGN(4096);
+  .kernel:esm_blob :
+  {
+    _esm_blob_start =  .;
+    *(.kernel:esm_blob)
+    _esm_blob_end =  .;
+  }
+
 #ifdef CONFIG_PPC64_BOOT_WRAPPER
  . = ALIGN(256);
  .got :
--- a/arch/powerpc/configs/pmac32_defconfig
+++ b/arch/powerpc/configs/pmac32_defconfig
@ -20,7 +20,6 @@ CONFIG_CPU_FREQ=y
 CONFIG_CPU_FREQ_GOV_POWERSAVE=y
 CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_CPU_FREQ_PMAC=y
-CONFIG_PPC601_SYNC_FIX=y
 CONFIG_GEN_RTC=y
 CONFIG_HIGHMEM=y
 CONFIG_BINFMT_MISC=m
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@ -38,7 +38,7 @@ CONFIG_MODULE_UNLOAD=y
 CONFIG_MODVERSIONS=y
 CONFIG_MODULE_SRCVERSION_ALL=y
 CONFIG_PARTITION_ADVANCED=y
-CONFIG_SCOM_DEBUGFS=y
+# CONFIG_SCOM_DEBUGFS is not set
 CONFIG_OPAL_PRD=y
 CONFIG_PPC_MEMTRACE=y
 # CONFIG_PPC_PSERIES is not set
--- a/arch/powerpc/configs/ppc40x_defconfig
+++ b/arch/powerpc/configs/ppc40x_defconfig
@ -84,4 +84,3 @@ CONFIG_CRYPTO_ECB=y
 CONFIG_CRYPTO_PCBC=y
 CONFIG_CRYPTO_MD5=y
 CONFIG_CRYPTO_DES=y
-CONFIG_PPC4xx_OCM=y
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@ -29,6 +29,7 @@ CONFIG_DTL=y
 CONFIG_SCANLOG=m
 CONFIG_PPC_SMLPAR=y
 CONFIG_IBMEBUS=y
+CONFIG_PPC_SVM=y
 CONFIG_PPC_MAPLE=y
 CONFIG_PPC_PASEMI=y
 CONFIG_PPC_PASEMI_IOMMU=y
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@ -42,6 +42,7 @@ CONFIG_DTL=y
 CONFIG_SCANLOG=m
 CONFIG_PPC_SMLPAR=y
 CONFIG_IBMEBUS=y
+CONFIG_PPC_SVM=y
 # CONFIG_PPC_PMAC is not set
 CONFIG_RTAS_FLASH=m
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
--- a/arch/powerpc/configs/skiroot_defconfig
+++ b/arch/powerpc/configs/skiroot_defconfig
@ -213,6 +213,7 @@ CONFIG_IPMI_WATCHDOG=y
 CONFIG_HW_RANDOM=y
 CONFIG_TCG_TPM=y
 CONFIG_TCG_TIS_I2C_NUVOTON=y
+# CONFIG_DEVPORT is not set
 CONFIG_I2C=y
 # CONFIG_I2C_COMPAT is not set
 CONFIG_I2C_CHARDEV=y
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@ -15,6 +15,7 @@
 #include <asm/epapr_hcalls.h>
 #include <asm/dcr.h>
 #include <asm/mmu_context.h>
+#include <asm/ultravisor-api.h>

 #include <uapi/asm/ucontext.h>

@ -34,6 +35,16 @@ extern struct static_key hcall_tracepoint_key;
 void __trace_hcall_entry(unsigned long opcode, unsigned long *args);
 void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf);

+/* Ultravisor */
+#if defined(CONFIG_PPC_POWERNV) || defined(CONFIG_PPC_SVM)
+long ucall_norets(unsigned long opcode, ...);
+#else
+static inline long ucall_norets(unsigned long opcode, ...)
+{
+	return U_NOT_AVAILABLE;
+}
+#endif
+
 /* OPAL */
 int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
 		    int64_t a4, int64_t a5, int64_t a6, int64_t a7,
@ -123,7 +134,8 @@ extern int __ucmpdi2(u64, u64);

 /* tracing */
 void _mcount(void);
-unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip);
+unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip,
+						unsigned long sp);

 void pnv_power9_force_smt4_catch(void);
 void pnv_power9_force_smt4_release(void);
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@ -148,23 +148,21 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
 */
 #include <asm/fixmap.h>

-#ifdef CONFIG_HIGHMEM
-#define KVIRT_TOP	PKMAP_BASE
-#else
-#define KVIRT_TOP	FIXADDR_START
-#endif
-
 /*
 * ioremap_bot starts at that address. Early ioremaps move down from there,
 * until mem_init() at which point this becomes the top of the vmalloc
 * and ioremap space
 */
-#ifdef CONFIG_NOT_COHERENT_CACHE
-#define IOREMAP_TOP	((KVIRT_TOP - CONFIG_CONSISTENT_SIZE) & PAGE_MASK)
+#ifdef CONFIG_HIGHMEM
+#define IOREMAP_TOP	PKMAP_BASE
 #else
-#define IOREMAP_TOP	KVIRT_TOP
+#define IOREMAP_TOP	FIXADDR_START
 #endif

+/* PPC32 shares vmalloc area with ioremap */
+#define IOREMAP_START	VMALLOC_START
+#define IOREMAP_END	VMALLOC_END
+
 /*
 * Just any arbitrary offset to the start of the vmalloc VM area: the
 * current 16MB value just means that there will be a 64MB "hole" after the
@ -201,8 +199,6 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
 #include <linux/sched.h>
 #include <linux/threads.h>

-extern unsigned long ioremap_bot;
-
 /* Bits to mask out from a PGD to get to the PUD page */
 #define PGD_MASKED_BITS		0

--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@ -206,7 +206,6 @@ extern int mmu_io_psize;
 void mmu_early_init_devtree(void);
 void hash__early_init_devtree(void);
 void radix__early_init_devtree(void);
-extern void radix_init_native(void);
 extern void hash__early_init_mmu(void);
 extern void radix__early_init_mmu(void);
 static inline void early_init_mmu(void)
@ -238,9 +237,6 @@ static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 					   first_memblock_size);
 }

-extern int (*register_process_table)(unsigned long base, unsigned long page_size,
-				     unsigned long tbl_size);
-
 #ifdef CONFIG_PPC_PSERIES
 extern void radix_init_pseries(void);
 #else
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@ -289,7 +289,6 @@ extern unsigned long __kernel_io_end;
 #define KERN_IO_END __kernel_io_end

 extern struct page *vmemmap;
-extern unsigned long ioremap_bot;
 extern unsigned long pci_io_base;
 #endif /* __ASSEMBLY__ */

@ -317,6 +316,7 @@ extern unsigned long pci_io_base;
 #define  PHB_IO_BASE	(ISA_IO_END)
 #define  PHB_IO_END	(KERN_IO_START + FULL_IO_SIZE)
 #define IOREMAP_BASE	(PHB_IO_END)
+#define IOREMAP_START	(ioremap_bot)
 #define IOREMAP_END	(KERN_IO_END)

 /* Advertise special mapping type for AGP */
@ -608,8 +608,10 @@ static inline bool pte_access_permitted(pte_t pte, bool write)
 */
 static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
 {
-	return __pte((((pte_basic_t)(pfn) << PAGE_SHIFT) & PTE_RPN_MASK) |
-		     pgprot_val(pgprot));
+	VM_BUG_ON(pfn >> (64 - PAGE_SHIFT));
+	VM_BUG_ON((pfn << PAGE_SHIFT) & ~PTE_RPN_MASK);
+
+	return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot));
 }

 static inline unsigned long pte_pfn(pte_t pte)
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@ -266,9 +266,6 @@ extern void radix__vmemmap_remove_mapping(unsigned long start,
 extern int radix__map_kernel_page(unsigned long ea, unsigned long pa,
 				 pgprot_t flags, unsigned int psz);

-extern int radix__ioremap_range(unsigned long ea, phys_addr_t pa,
-				unsigned long size, pgprot_t prot, int nid);
-
 static inline unsigned long radix__get_tree_size(void)
 {
 	unsigned long rts_field;
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@ -17,8 +17,8 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid,
 					unsigned long addr,
 					unsigned long page_size);
 extern void radix__flush_pwc_lpid(unsigned int lpid);
-extern void radix__flush_tlb_lpid(unsigned int lpid);
-extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
+extern void radix__flush_all_lpid(unsigned int lpid);
+extern void radix__flush_all_lpid_guest(unsigned int lpid);
 #else
 static inline void radix__tlbiel_all(unsigned int action) { WARN_ON(1); };
 static inline void radix__flush_tlb_lpid_page(unsigned int lpid,
@ -31,11 +31,7 @@ static inline void radix__flush_pwc_lpid(unsigned int lpid)
 {
 	WARN_ON(1);
 }
-static inline void radix__flush_tlb_lpid(unsigned int lpid)
-{
-	WARN_ON(1);
-}
-static inline void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
+static inline void radix__flush_all_lpid(unsigned int lpid)
 {
 	WARN_ON(1);
 }
@ -73,6 +69,4 @@ extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr);
 extern void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr);
 extern void radix__flush_tlb_all(void);

-extern void radix__local_flush_tlb_lpid(unsigned int lpid);
-
 #endif
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@ -162,4 +162,13 @@ static inline void flush_tlb_pgtable(struct mmu_gather *tlb, unsigned long addre

 	radix__flush_tlb_pwc(tlb, address);
 }
+
+extern bool tlbie_capable;
+extern bool tlbie_enabled;
+
+static inline bool cputlb_use_tlbie(void)
+{
+	return tlbie_enabled;
+}
+
 #endif /*  _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */
--- a/arch/powerpc/include/asm/book3s/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/pgtable.h
@ -26,5 +26,16 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 				     unsigned long size, pgprot_t vma_prot);
 #define __HAVE_PHYS_MEM_ACCESS_PROT

+/*
+ * This gets called at the end of handling a page fault, when
+ * the kernel has put a new PTE into the page table for the process.
+ * We use it to ensure coherency between the i-cache and d-cache
+ * for the page which has just been mapped in.
+ * On machines which use an MMU hash table, we use this to put a
+ * corresponding HPTE into the hash table ahead of time, instead of
+ * waiting for the inevitable extra hash-table miss exception.
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep);
+
 #endif /* __ASSEMBLY__ */
 #endif
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@ -5,14 +5,6 @@

 #include <asm/asm-compat.h>

-/*
- * Define an illegal instr to trap on the bug.
- * We don't use 0 because that marks the end of a function
- * in the ELF ABI.  That's "Boo Boo" in case you wonder...
- */
-#define BUG_OPCODE .long 0x00b00b00  /* For asm */
-#define BUG_ILLEGAL_INSTR "0x00b00b00" /* For BUG macro */
-
 #ifdef CONFIG_BUG

 #ifdef __ASSEMBLY__
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@ -145,12 +145,10 @@ static inline void cpu_feature_keys_init(void) { }

 /* Definitions for features that only exist on 32-bit chips */
 #ifdef CONFIG_PPC32
-#define CPU_FTR_601			ASM_CONST(0x00001000)
 #define CPU_FTR_L2CR			ASM_CONST(0x00002000)
 #define CPU_FTR_SPEC7450		ASM_CONST(0x00004000)
 #define CPU_FTR_TAU			ASM_CONST(0x00008000)
 #define CPU_FTR_CAN_DOZE		ASM_CONST(0x00010000)
-#define CPU_FTR_USE_RTC			ASM_CONST(0x00020000)
 #define CPU_FTR_L3CR			ASM_CONST(0x00040000)
 #define CPU_FTR_L3_DISABLE_NAP		ASM_CONST(0x00080000)
 #define CPU_FTR_NAP_DISABLE_L2_PR	ASM_CONST(0x00100000)
@ -160,14 +158,12 @@ static inline void cpu_feature_keys_init(void) { }
 #define CPU_FTR_NEED_COHERENT		ASM_CONST(0x01000000)
 #define CPU_FTR_NO_BTIC			ASM_CONST(0x02000000)
 #define CPU_FTR_PPC_LE			ASM_CONST(0x04000000)
-#define CPU_FTR_UNIFIED_ID_CACHE	ASM_CONST(0x08000000)
 #define CPU_FTR_SPE			ASM_CONST(0x10000000)
 #define CPU_FTR_NEED_PAIRED_STWCX	ASM_CONST(0x20000000)
 #define CPU_FTR_INDEXED_DCR		ASM_CONST(0x40000000)

 #else	/* CONFIG_PPC32 */
 /* Define these to 0 for the sake of tests in common code */
-#define CPU_FTR_601			(0)
 #define CPU_FTR_PPC_LE			(0)
 #endif

@ -294,8 +290,8 @@ static inline void cpu_feature_keys_init(void) { }
 #define CPU_FTR_MAYBE_CAN_NAP	0
 #endif

-#define CPU_FTRS_PPC601	(CPU_FTR_COMMON | CPU_FTR_601 | \
-	CPU_FTR_COHERENT_ICACHE | CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_USE_RTC)
+#define CPU_FTRS_PPC601	(CPU_FTR_COMMON | \
+	CPU_FTR_COHERENT_ICACHE)
 #define CPU_FTRS_603	(CPU_FTR_COMMON | CPU_FTR_MAYBE_CAN_DOZE | \
 	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_604	(CPU_FTR_COMMON | CPU_FTR_PPC_LE)
@ -386,7 +382,7 @@ static inline void cpu_feature_keys_init(void) { }
 #define CPU_FTRS_47X	(CPU_FTRS_440x6)
 #define CPU_FTRS_E200	(CPU_FTR_SPE_COMP | \
 	    CPU_FTR_NODSISRALIGN | CPU_FTR_COHERENT_ICACHE | \
-	    CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_NOEXECUTE | \
+	    CPU_FTR_NOEXECUTE | \
 	    CPU_FTR_DEBUG_LVL_EXC)
 #define CPU_FTRS_E500	(CPU_FTR_MAYBE_CAN_DOZE | \
 	    CPU_FTR_SPE_COMP | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_NODSISRALIGN | \
@ -498,7 +494,9 @@ static inline void cpu_feature_keys_init(void) { }
 #else
 enum {
 	CPU_FTRS_POSSIBLE =
-#ifdef CONFIG_PPC_BOOK3S_32
+#ifdef CONFIG_PPC_BOOK3S_601
+	    CPU_FTRS_PPC601 |
+#elif defined(CONFIG_PPC_BOOK3S_32)
 	    CPU_FTRS_PPC601 | CPU_FTRS_603 | CPU_FTRS_604 | CPU_FTRS_740_NOTAU |
 	    CPU_FTRS_740 | CPU_FTRS_750 | CPU_FTRS_750FX1 |
 	    CPU_FTRS_750FX2 | CPU_FTRS_750FX | CPU_FTRS_750GX |
@ -574,8 +572,10 @@ enum {
 #else
 enum {
 	CPU_FTRS_ALWAYS =
-#ifdef CONFIG_PPC_BOOK3S_32
-	    CPU_FTRS_PPC601 & CPU_FTRS_603 & CPU_FTRS_604 & CPU_FTRS_740_NOTAU &
+#ifdef CONFIG_PPC_BOOK3S_601
+	    CPU_FTRS_PPC601 &
+#elif defined(CONFIG_PPC_BOOK3S_32)
+	    CPU_FTRS_603 & CPU_FTRS_604 & CPU_FTRS_740_NOTAU &
 	    CPU_FTRS_740 & CPU_FTRS_750 & CPU_FTRS_750FX1 &
 	    CPU_FTRS_750FX2 & CPU_FTRS_750FX & CPU_FTRS_750GX &
 	    CPU_FTRS_7400_NOTAU & CPU_FTRS_7400 & CPU_FTRS_7450_20 &
--- a/arch/powerpc/include/asm/current.h
+++ b/arch/powerpc/include/asm/current.h
@ -16,7 +16,8 @@ static inline struct task_struct *get_current(void)
 {
 	struct task_struct *task;

-	__asm__ __volatile__("ld %0,%1(13)"
+	/* get_current can be cached by the compiler, so no volatile */
+	asm ("ld %0,%1(13)"
 	: "=r" (task)
 	: "i" (offsetof(struct paca_struct, __current)));

--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@ -88,6 +88,19 @@ struct eeh_pe {
 	struct list_head child_list;	/* List of PEs below this PE	*/
 	struct list_head child;		/* Memb. child_list/eeh_phb_pe	*/
 	struct list_head edevs;		/* List of eeh_dev in this PE	*/
+
+#ifdef CONFIG_STACKTRACE
+	/*
+	 * Saved stack trace. When we find a PE freeze in eeh_dev_check_failure
+	 * the stack trace is saved here so we can print it in the recovery
+	 * thread if it turns out to be due to a real problem rather than
+	 * a hot-remove.
+	 *
+	 * A max of 64 entries might be overkill, but it also might not be.
+	 */
+	unsigned long stack_trace[64];
+	int trace_entries;
+#endif /* CONFIG_STACKTRACE */
 };

 #define eeh_pe_for_each_dev(pe, edev, tmp) \
@ -121,6 +134,8 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe)
 struct eeh_dev {
 	int mode;			/* EEH mode			*/
 	int class_code;			/* Class code of the device	*/
+	int bdfn;			/* bdfn of device (for cfg ops) */
+	struct pci_controller *controller;
 	int pe_config_addr;		/* PE config address		*/
 	u32 config_space[16];		/* Saved PCI config space	*/
 	int pcix_cap;			/* Saved PCIx capability	*/
@ -136,6 +151,17 @@ struct eeh_dev {
 	struct pci_dev *physfn;		/* Associated SRIOV PF		*/
 };

+/* "fmt" must be a simple literal string */
+#define EEH_EDEV_PRINT(level, edev, fmt, ...) \
+	pr_##level("PCI %04x:%02x:%02x.%x#%04x: EEH: " fmt, \
+	(edev)->controller->global_number, PCI_BUSNO((edev)->bdfn), \
+	PCI_SLOT((edev)->bdfn), PCI_FUNC((edev)->bdfn), \
+	((edev)->pe ? (edev)->pe_config_addr : 0xffff), ##__VA_ARGS__)
+#define eeh_edev_dbg(edev, fmt, ...) EEH_EDEV_PRINT(debug, (edev), fmt, ##__VA_ARGS__)
+#define eeh_edev_info(edev, fmt, ...) EEH_EDEV_PRINT(info, (edev), fmt, ##__VA_ARGS__)
+#define eeh_edev_warn(edev, fmt, ...) EEH_EDEV_PRINT(warn, (edev), fmt, ##__VA_ARGS__)
+#define eeh_edev_err(edev, fmt, ...) EEH_EDEV_PRINT(err, (edev), fmt, ##__VA_ARGS__)
+
 static inline struct pci_dn *eeh_dev_to_pdn(struct eeh_dev *edev)
 {
 	return edev ? edev->pdn : NULL;
@ -247,7 +273,7 @@ static inline bool eeh_state_active(int state)
 	== (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
 }

-typedef void *(*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag);
+typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag);
 typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag);
 void eeh_set_pe_aux_size(int size);
 int eeh_phb_pe_create(struct pci_controller *phb);
@ -261,20 +287,20 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev);
 void eeh_pe_update_time_stamp(struct eeh_pe *pe);
 void *eeh_pe_traverse(struct eeh_pe *root,
 		      eeh_pe_traverse_func fn, void *flag);
-void *eeh_pe_dev_traverse(struct eeh_pe *root,
-			  eeh_edev_traverse_func fn, void *flag);
+void eeh_pe_dev_traverse(struct eeh_pe *root,
+			 eeh_edev_traverse_func fn, void *flag);
 void eeh_pe_restore_bars(struct eeh_pe *pe);
 const char *eeh_pe_loc_get(struct eeh_pe *pe);
 struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);

 struct eeh_dev *eeh_dev_init(struct pci_dn *pdn);
 void eeh_dev_phb_init_dynamic(struct pci_controller *phb);
-void eeh_probe_devices(void);
+void eeh_show_enabled(void);
 int __init eeh_ops_register(struct eeh_ops *ops);
 int __exit eeh_ops_unregister(const char *name);
 int eeh_check_failure(const volatile void __iomem *token);
 int eeh_dev_check_failure(struct eeh_dev *edev);
-void eeh_addr_cache_build(void);
+void eeh_addr_cache_init(void);
 void eeh_add_device_early(struct pci_dn *);
 void eeh_add_device_tree_early(struct pci_dn *);
 void eeh_add_device_late(struct pci_dev *);
@ -316,7 +342,7 @@ static inline bool eeh_enabled(void)
        return false;
 }

-static inline void eeh_probe_devices(void) { }
+static inline void eeh_show_enabled(void) { }

 static inline void *eeh_dev_init(struct pci_dn *pdn, void *data)
 {
@ -332,7 +358,7 @@ static inline int eeh_check_failure(const volatile void __iomem *token)

 #define eeh_dev_check_failure(x) (0)

-static inline void eeh_addr_cache_build(void) { }
+static inline void eeh_addr_cache_init(void) { }

 static inline void eeh_add_device_early(struct pci_dn *pdn) { }

--- a/arch/powerpc/include/asm/elfnote.h
+++ b/arch/powerpc/include/asm/elfnote.h
@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PowerPC ELF notes.
+ *
+ * Copyright 2019, IBM Corporation
+ */
+
+#ifndef __ASM_POWERPC_ELFNOTE_H__
+#define __ASM_POWERPC_ELFNOTE_H__
+
+/*
+ * These note types should live in a SHT_NOTE section and have
+ * "PowerPC" in the name field.
+ */
+
+/*
+ * The capabilities supported/required by this kernel (bitmap).
+ *
+ * This type uses a bitmap as "desc" field. Each bit is described
+ * in arch/powerpc/kernel/note.S
+ */
+#define PPC_ELFNOTE_CAPABILITIES 1
+
+#endif /* __ASM_POWERPC_ELFNOTE_H__ */
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@ -0,0 +1,169 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Firmware-Assisted Dump internal code.
+ *
+ * Copyright 2011, Mahesh Salgaonkar, IBM Corporation.
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#ifndef _ASM_POWERPC_FADUMP_INTERNAL_H
+#define _ASM_POWERPC_FADUMP_INTERNAL_H
+
+/* Maximum number of memory regions kernel supports */
+#define FADUMP_MAX_MEM_REGS			128
+
+#ifndef CONFIG_PRESERVE_FA_DUMP
+
+/* The upper limit percentage for user specified boot memory size (25%) */
+#define MAX_BOOT_MEM_RATIO			4
+
+#define memblock_num_regions(memblock_type)	(memblock.memblock_type.cnt)
+
+/* Alignment per CMA requirement. */
+#define FADUMP_CMA_ALIGNMENT	(PAGE_SIZE <<				\
+				 max_t(unsigned long, MAX_ORDER - 1,	\
+				 pageblock_order))
+
+/* FAD commands */
+#define FADUMP_REGISTER			1
+#define FADUMP_UNREGISTER		2
+#define FADUMP_INVALIDATE		3
+
+/*
+ * Copy the ascii values for first 8 characters from a string into u64
+ * variable at their respective indexes.
+ * e.g.
+ *  The string "FADMPINF" will be converted into 0x4641444d50494e46
+ */
+static inline u64 fadump_str_to_u64(const char *str)
+{
+	u64 val = 0;
+	int i;
+
+	for (i = 0; i < sizeof(val); i++)
+		val = (*str) ? (val << 8) | *str++ : val << 8;
+	return val;
+}
+
+#define FADUMP_CPU_UNKNOWN		(~((u32)0))
+
+#define FADUMP_CRASH_INFO_MAGIC		fadump_str_to_u64("FADMPINF")
+
+/* fadump crash info structure */
+struct fadump_crash_info_header {
+	u64		magic_number;
+	u64		elfcorehdr_addr;
+	u32		crashing_cpu;
+	struct pt_regs	regs;
+	struct cpumask	online_mask;
+};
+
+struct fadump_memory_range {
+	u64	base;
+	u64	size;
+};
+
+/* fadump memory ranges info */
+struct fadump_mrange_info {
+	char				name[16];
+	struct fadump_memory_range	*mem_ranges;
+	u32				mem_ranges_sz;
+	u32				mem_range_cnt;
+	u32				max_mem_ranges;
+};
+
+/* Platform specific callback functions */
+struct fadump_ops;
+
+/* Firmware-assisted dump configuration details. */
+struct fw_dump {
+	unsigned long	reserve_dump_area_start;
+	unsigned long	reserve_dump_area_size;
+	/* cmd line option during boot */
+	unsigned long	reserve_bootvar;
+
+	unsigned long	cpu_state_data_size;
+	u64		cpu_state_dest_vaddr;
+	u32		cpu_state_data_version;
+	u32		cpu_state_entry_size;
+
+	unsigned long	hpte_region_size;
+
+	unsigned long	boot_memory_size;
+	u64		boot_mem_dest_addr;
+	u64		boot_mem_addr[FADUMP_MAX_MEM_REGS];
+	u64		boot_mem_sz[FADUMP_MAX_MEM_REGS];
+	u64		boot_mem_top;
+	u64		boot_mem_regs_cnt;
+
+	unsigned long	fadumphdr_addr;
+	unsigned long	cpu_notes_buf_vaddr;
+	unsigned long	cpu_notes_buf_size;
+
+	/*
+	 * Maximum size supported by firmware to copy from source to
+	 * destination address per entry.
+	 */
+	u64		max_copy_size;
+	u64		kernel_metadata;
+
+	int		ibm_configure_kernel_dump;
+
+	unsigned long	fadump_enabled:1;
+	unsigned long	fadump_supported:1;
+	unsigned long	dump_active:1;
+	unsigned long	dump_registered:1;
+	unsigned long	nocma:1;
+
+	struct fadump_ops	*ops;
+};
+
+struct fadump_ops {
+	u64	(*fadump_init_mem_struct)(struct fw_dump *fadump_conf);
+	u64	(*fadump_get_metadata_size)(void);
+	int	(*fadump_setup_metadata)(struct fw_dump *fadump_conf);
+	u64	(*fadump_get_bootmem_min)(void);
+	int	(*fadump_register)(struct fw_dump *fadump_conf);
+	int	(*fadump_unregister)(struct fw_dump *fadump_conf);
+	int	(*fadump_invalidate)(struct fw_dump *fadump_conf);
+	void	(*fadump_cleanup)(struct fw_dump *fadump_conf);
+	int	(*fadump_process)(struct fw_dump *fadump_conf);
+	void	(*fadump_region_show)(struct fw_dump *fadump_conf,
+				      struct seq_file *m);
+	void	(*fadump_trigger)(struct fadump_crash_info_header *fdh,
+				  const char *msg);
+};
+
+/* Helper functions */
+s32 fadump_setup_cpu_notes_buf(u32 num_cpus);
+void fadump_free_cpu_notes_buf(void);
+u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs);
+void fadump_update_elfcore_header(char *bufp);
+bool is_fadump_boot_mem_contiguous(void);
+bool is_fadump_reserved_mem_contiguous(void);
+
+#else /* CONFIG_PRESERVE_FA_DUMP */
+
+/* Firmware-assisted dump configuration details. */
+struct fw_dump {
+	u64	boot_mem_top;
+	u64	dump_active;
+};
+
+#endif /* CONFIG_PRESERVE_FA_DUMP */
+
+#ifdef CONFIG_PPC_PSERIES
+extern void rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node);
+#else
+static inline void
+rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) { }
+#endif
+
+#ifdef CONFIG_PPC_POWERNV
+extern void opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node);
+#else
+static inline void
+opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) { }
+#endif
+
+#endif /* _ASM_POWERPC_FADUMP_INTERNAL_H */
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@ -6,196 +6,14 @@
 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
 */

-#ifndef __PPC64_FA_DUMP_H__
-#define __PPC64_FA_DUMP_H__
+#ifndef _ASM_POWERPC_FADUMP_H
+#define _ASM_POWERPC_FADUMP_H

 #ifdef CONFIG_FA_DUMP

-/*
- * The RMA region will be saved for later dumping when kernel crashes.
- * RMA is Real Mode Area, the first block of logical memory address owned
- * by logical partition, containing the storage that may be accessed with
- * translate off.
- */
-#define RMA_START	0x0
-#define RMA_END		(ppc64_rma_size)
-
-/*
- * On some Power systems where RMO is 128MB, it still requires minimum of
- * 256MB for kernel to boot successfully. When kdump infrastructure is
- * configured to save vmcore over network, we run into OOM issue while
- * loading modules related to network setup. Hence we need aditional 64M
- * of memory to avoid OOM issue.
- */
-#define MIN_BOOT_MEM	(((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \
-			+ (0x1UL << 26))
-
-/* The upper limit percentage for user specified boot memory size (25%) */
-#define MAX_BOOT_MEM_RATIO			4
-
-#define memblock_num_regions(memblock_type)	(memblock.memblock_type.cnt)
-
-/* Alignement per CMA requirement. */
-#define FADUMP_CMA_ALIGNMENT	(PAGE_SIZE <<				\
-			max_t(unsigned long, MAX_ORDER - 1, pageblock_order))
-
-/* Firmware provided dump sections */
-#define FADUMP_CPU_STATE_DATA	0x0001
-#define FADUMP_HPTE_REGION	0x0002
-#define FADUMP_REAL_MODE_REGION	0x0011
-
-/* Dump request flag */
-#define FADUMP_REQUEST_FLAG	0x00000001
-
-/* FAD commands */
-#define FADUMP_REGISTER		1
-#define FADUMP_UNREGISTER	2
-#define FADUMP_INVALIDATE	3
-
-/* Dump status flag */
-#define FADUMP_ERROR_FLAG	0x2000
-
-#define FADUMP_CPU_ID_MASK	((1UL << 32) - 1)
-
-#define CPU_UNKNOWN		(~((u32)0))
-
-/* Utility macros */
-#define SKIP_TO_NEXT_CPU(reg_entry)					\
-({									\
-	while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND"))	\
-		reg_entry++;						\
-	reg_entry++;							\
-})
-
 extern int crashing_cpu;

-/* Kernel Dump section info */
-struct fadump_section {
-	__be32	request_flag;
-	__be16	source_data_type;
-	__be16	error_flags;
-	__be64	source_address;
-	__be64	source_len;
-	__be64	bytes_dumped;
-	__be64	destination_address;
-};
-
-/* ibm,configure-kernel-dump header. */
-struct fadump_section_header {
-	__be32	dump_format_version;
-	__be16	dump_num_sections;
-	__be16	dump_status_flag;
-	__be32	offset_first_dump_section;
-
-	/* Fields for disk dump option. */
-	__be32	dd_block_size;
-	__be64	dd_block_offset;
-	__be64	dd_num_blocks;
-	__be32	dd_offset_disk_path;
-
-	/* Maximum time allowed to prevent an automatic dump-reboot. */
-	__be32	max_time_auto;
-};
-
-/*
- * Firmware Assisted dump memory structure. This structure is required for
- * registering future kernel dump with power firmware through rtas call.
- *
- * No disk dump option. Hence disk dump path string section is not included.
- */
-struct fadump_mem_struct {
-	struct fadump_section_header	header;
-
-	/* Kernel dump sections */
-	struct fadump_section		cpu_state_data;
-	struct fadump_section		hpte_region;
-	struct fadump_section		rmr_region;
-};
-
-/* Firmware-assisted dump configuration details. */
-struct fw_dump {
-	unsigned long	cpu_state_data_size;
-	unsigned long	hpte_region_size;
-	unsigned long	boot_memory_size;
-	unsigned long	reserve_dump_area_start;
-	unsigned long	reserve_dump_area_size;
-	/* cmd line option during boot */
-	unsigned long	reserve_bootvar;
-
-	unsigned long	fadumphdr_addr;
-	unsigned long	cpu_notes_buf;
-	unsigned long	cpu_notes_buf_size;
-
-	int		ibm_configure_kernel_dump;
-
-	unsigned long	fadump_enabled:1;
-	unsigned long	fadump_supported:1;
-	unsigned long	dump_active:1;
-	unsigned long	dump_registered:1;
-	unsigned long	nocma:1;
-};
-
-/*
- * Copy the ascii values for first 8 characters from a string into u64
- * variable at their respective indexes.
- * e.g.
- *  The string "FADMPINF" will be converted into 0x4641444d50494e46
- */
-static inline u64 str_to_u64(const char *str)
-{
-	u64 val = 0;
-	int i;
-
-	for (i = 0; i < sizeof(val); i++)
-		val = (*str) ? (val << 8) | *str++ : val << 8;
-	return val;
-}
-#define STR_TO_HEX(x)	str_to_u64(x)
-#define REG_ID(x)	str_to_u64(x)
-
-#define FADUMP_CRASH_INFO_MAGIC		STR_TO_HEX("FADMPINF")
-#define REGSAVE_AREA_MAGIC		STR_TO_HEX("REGSAVE")
-
-/* The firmware-assisted dump format.
- *
- * The register save area is an area in the partition's memory used to preserve
- * the register contents (CPU state data) for the active CPUs during a firmware
- * assisted dump. The dump format contains register save area header followed
- * by register entries. Each list of registers for a CPU starts with
- * "CPUSTRT" and ends with "CPUEND".
- */
-
-/* Register save area header. */
-struct fadump_reg_save_area_header {
-	__be64		magic_number;
-	__be32		version;
-	__be32		num_cpu_offset;
-};
-
-/* Register entry. */
-struct fadump_reg_entry {
-	__be64		reg_id;
-	__be64		reg_value;
-};
-
-/* fadump crash info structure */
-struct fadump_crash_info_header {
-	u64		magic_number;
-	u64		elfcorehdr_addr;
-	u32		crashing_cpu;
-	struct pt_regs	regs;
-	struct cpumask	online_mask;
-};
-
-struct fad_crash_memory_ranges {
-	unsigned long long	base;
-	unsigned long long	size;
-};
-
 extern int is_fadump_memory_area(u64 addr, ulong size);
-extern int early_init_dt_scan_fw_dump(unsigned long node,
-		const char *uname, int depth, void *data);
-extern int fadump_reserve_mem(void);
 extern int setup_fadump(void);
 extern int is_fadump_active(void);
 extern int should_fadump_crash(void);
@ -207,5 +25,11 @@ static inline int is_fadump_active(void) { return 0; }
 static inline int should_fadump_crash(void) { return 0; }
 static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
 static inline void fadump_cleanup(void) { }
+#endif /* !CONFIG_FA_DUMP */
+
+#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP)
+extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
+				      int depth, void *data);
+extern int fadump_reserve_mem(void);
 #endif
-#endif
+#endif /* _ASM_POWERPC_FADUMP_H */
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@ -50,6 +50,7 @@
 #define FW_FEATURE_DRC_INFO	ASM_CONST(0x0000000800000000)
 #define FW_FEATURE_BLOCK_REMOVE ASM_CONST(0x0000001000000000)
 #define FW_FEATURE_PAPR_SCM 	ASM_CONST(0x0000002000000000)
+#define FW_FEATURE_ULTRAVISOR	ASM_CONST(0x0000004000000000)

 #ifndef __ASSEMBLY__

@ -68,9 +69,9 @@ enum {
 		FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
 		FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
 		FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
-		FW_FEATURE_PAPR_SCM,
+		FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR,
 	FW_FEATURE_PSERIES_ALWAYS = 0,
-	FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL,
+	FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR,
 	FW_FEATURE_POWERNV_ALWAYS = 0,
 	FW_FEATURE_PS3_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
 	FW_FEATURE_PS3_ALWAYS = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@ -8,6 +8,8 @@
 #define MCOUNT_ADDR		((unsigned long)(_mcount))
 #define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */

+#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+
 #ifdef __ASSEMBLY__

 /* Based off of objdump optput from glibc */
--- a/arch/powerpc/include/asm/futex.h
+++ b/arch/powerpc/include/asm/futex.h
@ -60,8 +60,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,

 	pagefault_enable();

-	if (!ret)
-		*oval = oldval;
+	*oval = oldval;

 	prevent_write_to_user(uaddr, sizeof(*uaddr));
 	return ret;
--- a/arch/powerpc/include/asm/head-64.h
+++ b/arch/powerpc/include/asm/head-64.h
@ -169,47 +169,6 @@ name:

 #define ABS_ADDR(label) (label - fs_label + fs_start)

-#define EXC_REAL_BEGIN(name, start, size)			\
-	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start, size)
-
-#define EXC_REAL_END(name, start, size)				\
-	FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##name, start, size)
-
-#define EXC_VIRT_BEGIN(name, start, size)			\
-	FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size)
-
-#define EXC_VIRT_END(name, start, size)				\
-	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size)
-
-#define EXC_COMMON_BEGIN(name)					\
-	USE_TEXT_SECTION();					\
-	.balign IFETCH_ALIGN_BYTES;				\
-	.global name;						\
-	_ASM_NOKPROBE_SYMBOL(name);				\
-	DEFINE_FIXED_SYMBOL(name);				\
-name:
-
-#define TRAMP_REAL_BEGIN(name)					\
-	FIXED_SECTION_ENTRY_BEGIN(real_trampolines, name)
-
-#define TRAMP_VIRT_BEGIN(name)					\
-	FIXED_SECTION_ENTRY_BEGIN(virt_trampolines, name)
-
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-#define TRAMP_KVM_BEGIN(name)					\
-	TRAMP_VIRT_BEGIN(name)
-#else
-#define TRAMP_KVM_BEGIN(name)
-#endif
-
-#define EXC_REAL_NONE(start, size)				\
-	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start, size); \
-	FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, start, size)
-
-#define EXC_VIRT_NONE(start, size)				\
-	FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size); \
-	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size)
-
 #endif /* __ASSEMBLY__ */

 #endif	/* _ASM_POWERPC_HEAD_64_H */
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@ -31,9 +31,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
 	return 0;
 }

-void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
-			    pte_t pte);
-
 #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
 void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 			    unsigned long end, unsigned long floor,
--- a/arch/powerpc/include/asm/io-workarounds.h
+++ b/arch/powerpc/include/asm/io-workarounds.h
@ -8,6 +8,7 @@
 #ifndef _IO_WORKAROUNDS_H
 #define _IO_WORKAROUNDS_H

+#ifdef CONFIG_PPC_IO_WORKAROUNDS
 #include <linux/io.h>
 #include <asm/pci-bridge.h>

@ -32,4 +33,23 @@ extern int spiderpci_iowa_init(struct iowa_bus *, void *);
 #define SPIDER_PCI_DUMMY_READ		0x0810
 #define SPIDER_PCI_DUMMY_READ_BASE	0x0814

+#endif
+
+#if defined(CONFIG_PPC_IO_WORKAROUNDS) && defined(CONFIG_PPC_INDIRECT_MMIO)
+extern bool io_workaround_inited;
+
+static inline bool iowa_is_active(void)
+{
+	return unlikely(io_workaround_inited);
+}
+#else
+static inline bool iowa_is_active(void)
+{
+	return false;
+}
+#endif
+
+void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size,
+			   pgprot_t prot, void *caller);
+
 #endif /* _IO_WORKAROUNDS_H */
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@ -705,16 +705,9 @@ static inline void iosync(void)
 *   create hand-made mappings for use only by the PCI code and cannot
 *   currently be hooked. Must be page aligned.
 *
- * * __ioremap is the low level implementation used by ioremap and
- *   ioremap_prot and cannot be hooked (but can be used by a hook on one
- *   of the previous ones)
- *
 * * __ioremap_caller is the same as above but takes an explicit caller
 *   reference rather than using __builtin_return_address(0)
 *
- * * __iounmap, is the low level implementation used by iounmap and cannot
- *   be hooked (but can be used by a hook on iounmap)
- *
 */
 extern void __iomem *ioremap(phys_addr_t address, unsigned long size);
 extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size,
@ -729,13 +722,14 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size);

 extern void iounmap(volatile void __iomem *addr);

-extern void __iomem *__ioremap(phys_addr_t, unsigned long size,
-			       unsigned long flags);
+int early_ioremap_range(unsigned long ea, phys_addr_t pa,
+			unsigned long size, pgprot_t prot);
+void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
+			 pgprot_t prot, void *caller);
+
 extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size,
 				      pgprot_t prot, void *caller);

-extern void __iounmap(volatile void __iomem *addr);
-
 extern void __iomem * __ioremap_at(phys_addr_t pa, void *ea,
 				   unsigned long size, pgprot_t prot);
 extern void __iounmap_at(void *ea, unsigned long size);
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@ -48,15 +48,16 @@ struct iommu_table_ops {
 	 * returns old TCE and DMA direction mask.
 	 * @tce is a physical address.
 	 */
-	int (*exchange)(struct iommu_table *tbl,
+	int (*xchg_no_kill)(struct iommu_table *tbl,
 			long index,
 			unsigned long *hpa,
-			enum dma_data_direction *direction);
-	/* Real mode */
-	int (*exchange_rm)(struct iommu_table *tbl,
-			long index,
-			unsigned long *hpa,
-			enum dma_data_direction *direction);
+			enum dma_data_direction *direction,
+			bool realmode);
+
+	void (*tce_kill)(struct iommu_table *tbl,
+			unsigned long index,
+			unsigned long pages,
+			bool realmode);

 	__be64 *(*useraddrptr)(struct iommu_table *tbl, long index, bool alloc);
 #endif
@ -111,6 +112,8 @@ struct iommu_table {
 	struct iommu_table_ops *it_ops;
 	struct kref    it_kref;
 	int it_nid;
+	unsigned long it_reserved_start; /* Start of not-DMA-able (MMIO) area */
+	unsigned long it_reserved_end; /* End of that area; NOTE(review): inclusive or exclusive? confirm */
 };

 #define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
@ -149,8 +152,9 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
 /* Initializes an iommu_table based in values set in the passed-in
 * structure
 */
-extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
-					    int nid);
+extern struct iommu_table *iommu_init_table(struct iommu_table *tbl,
+		int nid, unsigned long res_start, unsigned long res_end);
+
 #define IOMMU_TABLE_GROUP_MAX_TABLES	2

 struct iommu_table_group;
@ -206,6 +210,12 @@ extern void iommu_del_device(struct device *dev);
 extern long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
 		unsigned long entry, unsigned long *hpa,
 		enum dma_data_direction *direction);
+extern long iommu_tce_xchg_no_kill(struct mm_struct *mm,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long *hpa,
+		enum dma_data_direction *direction);
+extern void iommu_tce_kill(struct iommu_table *tbl,
+		unsigned long entry, unsigned long pages);
 #else
 static inline void iommu_register_group(struct iommu_table_group *table_group,
 					int pci_domain_number,
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@ -297,6 +297,7 @@ struct kvm_arch {
 	cpumask_t cpu_in_guest;
 	u8 radix;
 	u8 fwnmi_enabled;
+	u8 secure_guest;
 	bool threads_indep;
 	bool nested_enable;
 	pgd_t *pgtable;
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@ -3,9 +3,6 @@
 #define _ASM_POWERPC_MACHDEP_H
 #ifdef __KERNEL__

-/*
- */
-
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/dma-mapping.h>
@ -31,10 +28,6 @@ struct pci_host_bridge;
 struct machdep_calls {
 	char		*name;
 #ifdef CONFIG_PPC64
-	void __iomem *	(*ioremap)(phys_addr_t addr, unsigned long size,
-				   pgprot_t prot, void *caller);
-	void		(*iounmap)(volatile void __iomem *token);
-
 #ifdef CONFIG_PM
 	void		(*iommu_save)(void);
 	void		(*iommu_restore)(void);
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@ -30,6 +30,10 @@ enum MCE_Disposition {
 enum MCE_Initiator {
 	MCE_INITIATOR_UNKNOWN = 0,
 	MCE_INITIATOR_CPU = 1,
+	MCE_INITIATOR_PCI = 2,
+	MCE_INITIATOR_ISA = 3,
+	MCE_INITIATOR_MEMORY= 4,
+	MCE_INITIATOR_POWERMGM = 5,
 };

 enum MCE_ErrorType {
@ -41,6 +45,8 @@ enum MCE_ErrorType {
 	MCE_ERROR_TYPE_USER = 5,
 	MCE_ERROR_TYPE_RA = 6,
 	MCE_ERROR_TYPE_LINK = 7,
+	MCE_ERROR_TYPE_DCACHE = 8,
+	MCE_ERROR_TYPE_ICACHE = 9,
 };

 enum MCE_ErrorClass {
@ -122,7 +128,8 @@ struct machine_check_event {
 			enum MCE_UeErrorType ue_error_type:8;
 			u8		effective_address_provided;
 			u8		physical_address_provided;
-			u8		reserved_1[5];
+			u8		ignore_event;
+			u8		reserved_1[4];
 			u64		effective_address;
 			u64		physical_address;
 			u8		reserved_2[8];
@ -193,6 +200,7 @@ struct mce_error_info {
 	enum MCE_Initiator	initiator:8;
 	enum MCE_ErrorClass	error_class:8;
 	bool			sync_error;
+	bool			ignore_event;
 };

 #define MAX_MC_EVT	100
--- a/arch/powerpc/include/asm/mem_encrypt.h
+++ b/arch/powerpc/include/asm/mem_encrypt.h
@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * SVM (Secure Virtual Machine) helper functions
+ *
+ * Copyright 2018 IBM Corporation
+ */
+
+#ifndef _ASM_POWERPC_MEM_ENCRYPT_H
+#define _ASM_POWERPC_MEM_ENCRYPT_H
+
+#include <asm/svm.h>
+
+static inline bool mem_encrypt_active(void)
+{
+	return is_secure_guest();
+}
+
+static inline bool force_dma_unencrypted(struct device *dev)
+{
+	return is_secure_guest();
+}
+
+int set_memory_encrypted(unsigned long addr, int numpages);
+int set_memory_decrypted(unsigned long addr, int numpages);
+
+#endif /* _ASM_POWERPC_MEM_ENCRYPT_H */
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@ -257,7 +257,7 @@ extern void radix__mmu_cleanup_all(void);
 /* Functions for creating and updating partition table on POWER9 */
 extern void mmu_partition_table_init(void);
 extern void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
-					  unsigned long dw1);
+					  unsigned long dw1, bool flush);
 #endif /* CONFIG_PPC64 */

 struct mm_struct;
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@ -11,8 +11,6 @@
 #include <asm/mmu.h>			/* For sub-arch specific PPC_PIN_SIZE */
 #include <asm/asm-405.h>

-extern unsigned long ioremap_bot;
-
 #ifdef CONFIG_44x
 extern int icache_44x_need_flush;
 #endif
@ -78,23 +76,21 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
 */
 #include <asm/fixmap.h>

-#ifdef CONFIG_HIGHMEM
-#define KVIRT_TOP	PKMAP_BASE
-#else
-#define KVIRT_TOP	FIXADDR_START
-#endif
-
 /*
 * ioremap_bot starts at that address. Early ioremaps move down from there,
 * until mem_init() at which point this becomes the top of the vmalloc
 * and ioremap space
 */
-#ifdef CONFIG_NOT_COHERENT_CACHE
-#define IOREMAP_TOP	((KVIRT_TOP - CONFIG_CONSISTENT_SIZE) & PAGE_MASK)
+#ifdef CONFIG_HIGHMEM
+#define IOREMAP_TOP	PKMAP_BASE
 #else
-#define IOREMAP_TOP	KVIRT_TOP
+#define IOREMAP_TOP	FIXADDR_START
 #endif

+/* PPC32 shares vmalloc area with ioremap */
+#define IOREMAP_START	VMALLOC_START
+#define IOREMAP_END	VMALLOC_END
+
 /*
 * Just any arbitrary offset to the start of the vmalloc VM area: the
 * current 16MB value just means that there will be a 64MB "hole" after the
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@ -53,6 +53,7 @@
 #define  PHB_IO_BASE	(ISA_IO_END)
 #define  PHB_IO_END	(KERN_IO_START + FULL_IO_SIZE)
 #define IOREMAP_BASE	(PHB_IO_END)
+#define IOREMAP_START	(ioremap_bot)
 #define IOREMAP_END	(KERN_VIRT_START + KERN_VIRT_SIZE)


--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@ -293,5 +293,18 @@ static inline int pgd_huge(pgd_t pgd)
 #define is_hugepd(hpd)		(hugepd_ok(hpd))
 #endif

+/*
+ * This gets called at the end of handling a page fault, when
+ * the kernel has put a new PTE into the page table for the process.
+ * We use it to ensure coherency between the i-cache and d-cache
+ * for the page which has just been mapped in.
+ */
+#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_HUGETLB_PAGE)
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep);
+#else
+static inline
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) {}
+#endif
+
 #endif /* __ASSEMBLY__ */
 #endif
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@ -208,7 +208,10 @@
 #define OPAL_HANDLE_HMI2			166
 #define	OPAL_NX_COPROC_INIT			167
 #define OPAL_XIVE_GET_VP_STATE			170
-#define OPAL_LAST				170
+#define OPAL_MPIPL_UPDATE			173
+#define OPAL_MPIPL_REGISTER_TAG			174
+#define OPAL_MPIPL_QUERY_TAG			175
+#define OPAL_LAST				175

 #define QUIESCE_HOLD			1 /* Spin all calls at entry */
 #define QUIESCE_REJECT			2 /* Fail all calls with OPAL_BUSY */
@ -453,6 +456,7 @@ enum opal_msg_type {
 	OPAL_MSG_DPO		= 5,
 	OPAL_MSG_PRD		= 6,
 	OPAL_MSG_OCC		= 7,
+	OPAL_MSG_PRD2		= 8,
 	OPAL_MSG_TYPE_MAX,
 };

@ -1059,6 +1063,7 @@ enum {
 	OPAL_REBOOT_NORMAL		= 0,
 	OPAL_REBOOT_PLATFORM_ERROR	= 1,
 	OPAL_REBOOT_FULL_IPL		= 2,
+	OPAL_REBOOT_MPIPL		= 3,
 };

 /* Argument to OPAL_PCI_TCE_KILL */
@ -1135,6 +1140,44 @@ enum {
 #define OPAL_PCI_P2P_LOAD		0x2
 #define OPAL_PCI_P2P_STORE		0x4

+/* MPIPL update operations */
+enum opal_mpipl_ops {
+	OPAL_MPIPL_ADD_RANGE			= 0,
+	OPAL_MPIPL_REMOVE_RANGE			= 1,
+	OPAL_MPIPL_REMOVE_ALL			= 2,
+	OPAL_MPIPL_FREE_PRESERVED_MEMORY	= 3,
+};
+
+/* Each tag identifies a metadata area. The kernel uses
+ * the tag to look up the corresponding metadata value.
+ */
+enum opal_mpipl_tags {
+	OPAL_MPIPL_TAG_CPU	= 0,
+	OPAL_MPIPL_TAG_OPAL	= 1,
+	OPAL_MPIPL_TAG_KERNEL	= 2,
+	OPAL_MPIPL_TAG_BOOT_MEM	= 3,
+};
+
+/* Preserved memory details */
+struct opal_mpipl_region {
+	__be64	src;
+	__be64	dest;
+	__be64	size;
+};
+
+/* Structure version */
+#define OPAL_MPIPL_VERSION		0x01
+
+struct opal_mpipl_fadump {
+	u8	version;
+	u8	reserved[7];
+	__be32	crashing_pir;	/* OPAL crashing CPU PIR */
+	__be32	cpu_data_version;
+	__be32	cpu_data_size;
+	__be32	region_cnt;
+	struct	opal_mpipl_region region[];
+} __packed;
+
 #endif /* __ASSEMBLY__ */

 #endif /* __OPAL_API_H */
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@ -39,6 +39,7 @@ int64_t opal_npu_spa_clear_cache(uint64_t phb_id, uint32_t bdfn,
 				uint64_t PE_handle);
 int64_t opal_npu_tl_set(uint64_t phb_id, uint32_t bdfn, long cap,
 			uint64_t rate_phys, uint32_t size);
+
 int64_t opal_console_write(int64_t term_number, __be64 *length,
 			   const uint8_t *buffer);
 int64_t opal_console_read(int64_t term_number, __be64 *length,
@ -272,7 +273,7 @@ int64_t opal_xive_get_vp_info(uint64_t vp,
 int64_t opal_xive_set_vp_info(uint64_t vp,
 			      uint64_t flags,
 			      uint64_t report_cl_pair);
-int64_t opal_xive_allocate_irq(uint32_t chip_id);
+int64_t opal_xive_allocate_irq_raw(uint32_t chip_id);
 int64_t opal_xive_free_irq(uint32_t girq);
 int64_t opal_xive_sync(uint32_t type, uint32_t id);
 int64_t opal_xive_dump(uint32_t type, uint32_t id);
@ -297,6 +298,10 @@ int opal_sensor_group_clear(u32 group_hndl, int token);
 int opal_sensor_group_enable(u32 group_hndl, int token, bool enable);
 int opal_nx_coproc_init(uint32_t chip_id, uint32_t ct);

+s64 opal_mpipl_update(enum opal_mpipl_ops op, u64 src, u64 dest, u64 size);
+s64 opal_mpipl_register_tag(enum opal_mpipl_tags tag, u64 addr);
+s64 opal_mpipl_query_tag(enum opal_mpipl_tags tag, u64 *addr);
+
 s64 opal_signal_system_reset(s32 cpu);
 s64 opal_quiesce(u64 shutdown_type, s32 cpu);

--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@ -215,9 +215,19 @@ static inline bool pfn_valid(unsigned long pfn)
 /*
 * gcc miscompiles (unsigned long)(&static_var) - PAGE_OFFSET
 * with -mcmodel=medium, so we use & and | instead of - and + on 64-bit.
+ * This also results in better code generation.
 */
-#define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) | PAGE_OFFSET))
-#define __pa(x) ((unsigned long)(x) & 0x0fffffffffffffffUL)
+#define __va(x)								\
+({									\
+	VIRTUAL_BUG_ON((unsigned long)(x) >= PAGE_OFFSET);		\
+	(void *)(unsigned long)((phys_addr_t)(x) | PAGE_OFFSET);	\
+})
+
+#define __pa(x)								\
+({									\
+	VIRTUAL_BUG_ON((unsigned long)(x) < PAGE_OFFSET);		\
+	(unsigned long)(x) & 0x0fffffffffffffffUL;			\
+})

 #else /* 32-bit, non book E */
 #define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) + PAGE_OFFSET - MEMORY_START))
--- a/arch/powerpc/include/asm/page_32.h
+++ b/arch/powerpc/include/asm/page_32.h
@ -40,6 +40,8 @@ typedef unsigned long long pte_basic_t;
 typedef unsigned long pte_basic_t;
 #endif

+#include <asm/bug.h>
+
 /*
 * Clear page using the dcbz instruction, which doesn't cause any
 * memory traffic (except to write out any cache lines which get
@ -49,6 +51,8 @@ static inline void clear_page(void *addr)
 {
 	unsigned int i;

+	WARN_ON((unsigned long)addr & (L1_CACHE_BYTES - 1));
+
 	for (i = 0; i < PAGE_SIZE / L1_CACHE_BYTES; i++, addr += L1_CACHE_BYTES)
 		dcbz(addr);
 }
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@ -183,6 +183,7 @@ struct iommu_table;
 struct pci_dn {
 	int     flags;
 #define PCI_DN_FLAG_IOV_VF	0x01
+#define PCI_DN_FLAG_DEAD	0x02    /* Device has been hot-removed */

 	int	busno;			/* pci bus number */
 	int	devfn;			/* pci device and function number */
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@ -68,6 +68,8 @@ extern pgd_t swapper_pg_dir[];

 extern void paging_init(void);

+extern unsigned long ioremap_bot;
+
 /*
 * kern_addr_valid is intended to indicate whether an address is a valid
 * kernel address.  Most 32-bit archs define it as always true (like this)
@ -77,18 +79,6 @@ extern void paging_init(void);

 #include <asm-generic/pgtable.h>

-
-/*
- * This gets called at the end of handling a page fault, when
- * the kernel has put a new PTE into the page table for the process.
- * We use it to ensure coherency between the i-cache and d-cache
- * for the page which has just been mapped in.
- * On machines which use an MMU hash table, we use this to put a
- * corresponding HPTE into the hash table ahead of time, instead of
- * waiting for the inevitable extra hash-table miss exception.
- */
-extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *);
-
 #ifndef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_large(pmd)		0
 #endif
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@ -340,6 +340,12 @@ static inline long plpar_set_ciabr(unsigned long ciabr)
 {
 	return 0;
 }
+
+static inline long plpar_pte_read_4(unsigned long flags, unsigned long ptex,
+				    unsigned long *ptes)
+{
+	return 0;
+}
 #endif /* CONFIG_PPC_PSERIES */

 #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@ -62,11 +62,6 @@ void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode);
 void eeh_sysfs_add_device(struct pci_dev *pdev);
 void eeh_sysfs_remove_device(struct pci_dev *pdev);

-static inline const char *eeh_pci_name(struct pci_dev *pdev) 
-{ 
-	return pdev ? pci_name(pdev) : "<null>";
-} 
-
 static inline const char *eeh_driver_name(struct pci_dev *pdev)
 {
 	return (pdev && pdev->driver) ? pdev->driver->name : "<null>";
@ -74,6 +69,8 @@ static inline const char *eeh_driver_name(struct pci_dev *pdev)

 #endif /* CONFIG_EEH */

+#define PCI_BUSNO(bdfn) (((bdfn) >> 8) & 0xff)	/* bus number: bits 15:8 of bdfn */
+
 #else /* CONFIG_PCI */
 static inline void init_pci_config_tokens(void) { }
 #endif /* !CONFIG_PCI */
--- a/arch/powerpc/include/asm/ppc4xx_ocm.h
+++ b/arch/powerpc/include/asm/ppc4xx_ocm.h
@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * PowerPC 4xx OCM memory allocation support
- *
- * (C) Copyright 2009, Applied Micro Circuits Corporation
- * Victor Gallardo (vgallardo@amcc.com)
- *
- * See file CREDITS for list of people who contributed to this
- * project.
- */
-
-#ifndef __ASM_POWERPC_PPC4XX_OCM_H__
-#define __ASM_POWERPC_PPC4XX_OCM_H__
-
-#define PPC4XX_OCM_NON_CACHED 0
-#define PPC4XX_OCM_CACHED     1
-
-#if defined(CONFIG_PPC4xx_OCM)
-
-void *ppc4xx_ocm_alloc(phys_addr_t *phys, int size, int align,
-		  int flags, const char *owner);
-void ppc4xx_ocm_free(const void *virt);
-
-#else
-
-#define ppc4xx_ocm_alloc(phys, size, align, flags, owner)	NULL
-#define ppc4xx_ocm_free(addr)	((void)0)
-
-#endif /* CONFIG_PPC4xx_OCM */
-
-#endif  /* __ASM_POWERPC_PPC4XX_OCM_H__ */
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@ -311,18 +311,48 @@ n:
 	addis	reg,reg,(name - 0b)@ha;		\
 	addi	reg,reg,(name - 0b)@l;

-#ifdef __powerpc64__
-#ifdef HAVE_AS_ATHIGH
+#if defined(__powerpc64__) && defined(HAVE_AS_ATHIGH)
 #define __AS_ATHIGH high
 #else
 #define __AS_ATHIGH h
 #endif
-#define LOAD_REG_IMMEDIATE(reg,expr)		\
-	lis     reg,(expr)@highest;		\
-	ori     reg,reg,(expr)@higher;	\
-	rldicr  reg,reg,32,31;		\
-	oris    reg,reg,(expr)@__AS_ATHIGH;	\
-	ori     reg,reg,(expr)@l;
+
+.macro __LOAD_REG_IMMEDIATE_32 r, x
+	.if (\x) >= 0x8000 || (\x) < -0x8000
+		lis \r, (\x)@__AS_ATHIGH
+		.if (\x) & 0xffff != 0
+			ori \r, \r, (\x)@l
+		.endif
+	.else
+		li \r, (\x)@l
+	.endif
+.endm
+
+.macro __LOAD_REG_IMMEDIATE r, x
+	.if (\x) >= 0x80000000 || (\x) < -0x80000000
+		__LOAD_REG_IMMEDIATE_32 \r, (\x) >> 32
+		sldi	\r, \r, 32
+		.if (\x) & 0xffff0000 != 0
+			oris \r, \r, (\x)@__AS_ATHIGH
+		.endif
+		.if (\x) & 0xffff != 0
+			ori \r, \r, (\x)@l
+		.endif
+	.else
+		__LOAD_REG_IMMEDIATE_32 \r, \x
+	.endif
+.endm
+
+#ifdef __powerpc64__
+
+#define LOAD_REG_IMMEDIATE(reg, expr) __LOAD_REG_IMMEDIATE reg, expr
+
+#define LOAD_REG_IMMEDIATE_SYM(reg, tmp, expr)	\
+	lis	tmp, (expr)@highest;		\
+	lis	reg, (expr)@__AS_ATHIGH;	\
+	ori	tmp, tmp, (expr)@higher;	\
+	ori	reg, reg, (expr)@l;		\
+	rldimi	reg, tmp, 32, 0

 #define LOAD_REG_ADDR(reg,name)			\
 	ld	reg,name@got(r2)
@ -335,11 +365,13 @@ n:

 #else /* 32-bit */

-#define LOAD_REG_IMMEDIATE(reg,expr)		\
+#define LOAD_REG_IMMEDIATE(reg, expr) __LOAD_REG_IMMEDIATE_32 reg, expr
+
+#define LOAD_REG_IMMEDIATE_SYM(reg,expr)		\
 	lis	reg,(expr)@ha;		\
 	addi	reg,reg,(expr)@l;

-#define LOAD_REG_ADDR(reg,name)		LOAD_REG_IMMEDIATE(reg, name)
+#define LOAD_REG_ADDR(reg,name)		LOAD_REG_IMMEDIATE_SYM(reg, name)

 #define LOAD_REG_ADDRBASE(reg, name)	lis	reg,name@ha
 #define ADDROFF(name)			name@l
@ -351,19 +383,9 @@ n:

 /* various errata or part fixups */
 #ifdef CONFIG_PPC601_SYNC_FIX
-#define SYNC				\
-BEGIN_FTR_SECTION			\
-	sync;				\
-	isync;				\
-END_FTR_SECTION_IFSET(CPU_FTR_601)
-#define SYNC_601			\
-BEGIN_FTR_SECTION			\
-	sync;				\
-END_FTR_SECTION_IFSET(CPU_FTR_601)
-#define ISYNC_601			\
-BEGIN_FTR_SECTION			\
-	isync;				\
-END_FTR_SECTION_IFSET(CPU_FTR_601)
+#define SYNC		sync; isync
+#define SYNC_601	sync
+#define ISYNC_601	isync
 #else
 #define	SYNC
 #define SYNC_601
@ -389,15 +411,11 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96)
 #define MFTBU(dest)			mfspr dest, SPRN_TBRU
 #endif

-#ifndef CONFIG_SMP
-#define TLBSYNC
-#else /* CONFIG_SMP */
 /* tlbsync is not implemented on 601 */
-#define TLBSYNC				\
-BEGIN_FTR_SECTION			\
-	tlbsync;			\
-	sync;				\
-END_FTR_SECTION_IFCLR(CPU_FTR_601)
+#if !defined(CONFIG_SMP) || defined(CONFIG_PPC_BOOK3S_601)
+#define TLBSYNC
+#else
+#define TLBSYNC		tlbsync; sync
 #endif

 #ifdef CONFIG_PPC64
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@ -203,7 +203,11 @@ do {									      \
 #endif /* __powerpc64__ */

 #define arch_has_single_step()	(1)
-#define arch_has_block_step()	(!cpu_has_feature(CPU_FTR_601))
+#ifndef CONFIG_PPC_BOOK3S_601
+#define arch_has_block_step()	(true)
+#else	/* 601-only build: block step is not available */
+#define arch_has_block_step()	(false)
+#endif	/* !CONFIG_PPC_BOOK3S_601 */
 #define ARCH_HAS_USER_SINGLE_STEP_REPORT

 /*
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@ -38,6 +38,7 @@
 #define MSR_TM_LG	32		/* Trans Mem Available */
 #define MSR_VEC_LG	25	        /* Enable AltiVec */
 #define MSR_VSX_LG	23		/* Enable VSX */
+#define MSR_S_LG	22		/* Secure state */
 #define MSR_POW_LG	18		/* Enable Power Management */
 #define MSR_WE_LG	18		/* Wait State Enable */
 #define MSR_TGPR_LG	17		/* TLB Update registers in use */
@ -71,11 +72,13 @@
 #define MSR_SF		__MASK(MSR_SF_LG)	/* Enable 64 bit mode */
 #define MSR_ISF		__MASK(MSR_ISF_LG)	/* Interrupt 64b mode valid on 630 */
 #define MSR_HV 		__MASK(MSR_HV_LG)	/* Hypervisor state */
+#define MSR_S		__MASK(MSR_S_LG)	/* Secure state */
 #else
 /* so tests for these bits fail on 32-bit */
 #define MSR_SF		0
 #define MSR_ISF		0
 #define MSR_HV		0
+#define MSR_S		0
 #endif

 /*
--- a/arch/powerpc/include/asm/scom.h
+++ b/arch/powerpc/include/asm/scom.h
@ -1,154 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright 2010 Benjamin Herrenschmidt, IBM Corp
- *                <benh@kernel.crashing.org>
- *     and        David Gibson, IBM Corporation.
- */
-
-#ifndef _ASM_POWERPC_SCOM_H
-#define _ASM_POWERPC_SCOM_H
-
-#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
-#ifdef CONFIG_PPC_SCOM
-
-/*
- * The SCOM bus is a sideband bus used for accessing various internal
- * registers of the processor or the chipset. The implementation details
- * differ between processors and platforms, and the access method as
- * well.
- *
- * This API allows to "map" ranges of SCOM register numbers associated
- * with a given SCOM controller. The later must be represented by a
- * device node, though some implementations might support NULL if there
- * is no possible ambiguity
- *
- * Then, scom_read/scom_write can be used to accesses registers inside
- * that range. The argument passed is a register number relative to
- * the beginning of the range mapped.
- */
-
-typedef void *scom_map_t;
-
-/* Value for an invalid SCOM map */
-#define SCOM_MAP_INVALID	(NULL)
-
-/* The scom_controller data structure is what the platform passes
- * to the core code in scom_init, it provides the actual implementation
- * of all the SCOM functions
- */
-struct scom_controller {
-	scom_map_t (*map)(struct device_node *ctrl_dev, u64 reg, u64 count);
-	void (*unmap)(scom_map_t map);
-
-	int (*read)(scom_map_t map, u64 reg, u64 *value);
-	int (*write)(scom_map_t map, u64 reg, u64 value);
-};
-
-extern const struct scom_controller *scom_controller;
-
-/**
- * scom_init - Initialize the SCOM backend, called by the platform
- * @controller: The platform SCOM controller
- */
-static inline void scom_init(const struct scom_controller *controller)
-{
-	scom_controller = controller;
-}
-
-/**
- * scom_map_ok - Test is a SCOM mapping is successful
- * @map: The result of scom_map to test
- */
-static inline int scom_map_ok(scom_map_t map)
-{
-	return map != SCOM_MAP_INVALID;
-}
-
-/**
- * scom_map - Map a block of SCOM registers
- * @ctrl_dev: Device node of the SCOM controller
- *            some implementations allow NULL here
- * @reg: first SCOM register to map
- * @count: Number of SCOM registers to map
- */
-
-static inline scom_map_t scom_map(struct device_node *ctrl_dev,
-				  u64 reg, u64 count)
-{
-	return scom_controller->map(ctrl_dev, reg, count);
-}
-
-/**
- * scom_find_parent - Find the SCOM controller for a device
- * @dev: OF node of the device
- *
- * This is not meant for general usage, but in combination with
- * scom_map() allows to map registers not represented by the
- * device own scom-reg property. Useful for applying HW workarounds
- * on things not properly represented in the device-tree for example.
- */
-struct device_node *scom_find_parent(struct device_node *dev);
-
-
-/**
- * scom_map_device - Map a device's block of SCOM registers
- * @dev: OF node of the device
- * @index: Register bank index (index in "scom-reg" property)
- *
- * This function will use the device-tree binding for SCOM which
- * is to follow "scom-parent" properties until it finds a node with
- * a "scom-controller" property to find the controller. It will then
- * use the "scom-reg" property which is made of reg/count pairs,
- * each of them having a size defined by the controller's #scom-cells
- * property
- */
-extern scom_map_t scom_map_device(struct device_node *dev, int index);
-
-
-/**
- * scom_unmap - Unmap a block of SCOM registers
- * @map: Result of scom_map is to be unmapped
- */
-static inline void scom_unmap(scom_map_t map)
-{
-	if (scom_map_ok(map))
-		scom_controller->unmap(map);
-}
-
-/**
- * scom_read - Read a SCOM register
- * @map: Result of scom_map
- * @reg: Register index within that map
- * @value: Updated with the value read
- *
- * Returns 0 (success) or a negative error code
- */
-static inline int scom_read(scom_map_t map, u64 reg, u64 *value)
-{
-	int rc;
-
-	rc = scom_controller->read(map, reg, value);
-	if (rc)
-		*value = 0xfffffffffffffffful;
-	return rc;
-}
-
-/**
- * scom_write - Write to a SCOM register
- * @map: Result of scom_map
- * @reg: Register index within that map
- * @value: Value to write
- *
- * Returns 0 (success) or a negative error code
- */
-static inline int scom_write(scom_map_t map, u64 reg, u64 value)
-{
-	return scom_controller->write(map, reg, value);
-}
-
-
-#endif /* CONFIG_PPC_SCOM */
-#endif /* __ASSEMBLY__ */
-#endif /* __KERNEL__ */
-#endif /* _ASM_POWERPC_SCOM_H */
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@ -61,17 +61,6 @@ static inline int overlaps_kernel_text(unsigned long start, unsigned long end)
 		(unsigned long)_stext < end;
 }

-static inline int overlaps_kvm_tmp(unsigned long start, unsigned long end)
-{
-#ifdef CONFIG_KVM_GUEST
-	extern char kvm_tmp[];
-	return start < (unsigned long)kvm_tmp &&
-		(unsigned long)&kvm_tmp[1024 * 1024] < end;
-#else
-	return 0;
-#endif
-}
-
 #ifdef PPC64_ELF_ABI_v1

 #define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1
--- a/arch/powerpc/include/asm/setjmp.h
+++ b/arch/powerpc/include/asm/setjmp.h
@ -7,7 +7,7 @@

 #define JMP_BUF_LEN    23

-extern long setjmp(long *);
-extern void longjmp(long *, long);
+extern long setjmp(long *) __attribute__((returns_twice));
+extern void longjmp(long *, long) __attribute__((noreturn));

 #endif /* _ASM_POWERPC_SETJMP_H */
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@ -101,15 +101,43 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)

 #if defined(CONFIG_PPC_SPLPAR)
 /* We only yield to the hypervisor if we are in shared processor mode */
-#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
-extern void __spin_yield(arch_spinlock_t *lock);
-extern void __rw_yield(arch_rwlock_t *lock);
+void splpar_spin_yield(arch_spinlock_t *lock);
+void splpar_rw_yield(arch_rwlock_t *lock);
 #else /* SPLPAR */
-#define __spin_yield(x)	barrier()
-#define __rw_yield(x)	barrier()
-#define SHARED_PROCESSOR	0
+static inline void splpar_spin_yield(arch_spinlock_t *lock) {}
+static inline void splpar_rw_yield(arch_rwlock_t *lock) {}
 #endif

+static inline bool is_shared_processor(void)
+{
+/*
+ * LPPACA is only available on Pseries so guard anything LPPACA related to
+ * allow other platforms (which include this common header) to compile.
+ */
+#ifdef CONFIG_PPC_PSERIES
+	return (IS_ENABLED(CONFIG_PPC_SPLPAR) &&
+		lppaca_shared_proc(local_paca->lppaca_ptr));
+#else
+	return false;
+#endif
+}
+
+static inline void spin_yield(arch_spinlock_t *lock)
+{
+	if (is_shared_processor())
+		splpar_spin_yield(lock);
+	else
+		barrier();
+}
+
+static inline void rw_yield(arch_rwlock_t *lock)
+{
+	if (is_shared_processor())
+		splpar_rw_yield(lock);
+	else
+		barrier();
+}
+
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
 	while (1) {
@ -117,8 +145,8 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
 			break;
 		do {
 			HMT_low();
-			if (SHARED_PROCESSOR)
-				__spin_yield(lock);
+			if (is_shared_processor())
+				splpar_spin_yield(lock);
 		} while (unlikely(lock->slock != 0));
 		HMT_medium();
 	}
@ -136,8 +164,8 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
 		local_irq_restore(flags);
 		do {
 			HMT_low();
-			if (SHARED_PROCESSOR)
-				__spin_yield(lock);
+			if (is_shared_processor())
+				splpar_spin_yield(lock);
 		} while (unlikely(lock->slock != 0));
 		HMT_medium();
 		local_irq_restore(flags_dis);
@ -226,8 +254,8 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
 			break;
 		do {
 			HMT_low();
-			if (SHARED_PROCESSOR)
-				__rw_yield(rw);
+			if (is_shared_processor())
+				splpar_rw_yield(rw);
 		} while (unlikely(rw->lock < 0));
 		HMT_medium();
 	}
@ -240,8 +268,8 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
 			break;
 		do {
 			HMT_low();
-			if (SHARED_PROCESSOR)
-				__rw_yield(rw);
+			if (is_shared_processor())
+				splpar_rw_yield(rw);
 		} while (unlikely(rw->lock != 0));
 		HMT_medium();
 	}
@ -281,9 +309,9 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 	rw->lock = 0;
 }

-#define arch_spin_relax(lock)	__spin_yield(lock)
-#define arch_read_relax(lock)	__rw_yield(lock)
-#define arch_write_relax(lock)	__rw_yield(lock)
+#define arch_spin_relax(lock)	spin_yield(lock)
+#define arch_read_relax(lock)	rw_yield(lock)
+#define arch_write_relax(lock)	rw_yield(lock)

 /* See include/linux/spinlock.h */
 #define smp_mb__after_spinlock()   smp_mb()
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@ -53,7 +53,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t n);
 #ifndef CONFIG_KASAN
 #define __HAVE_ARCH_MEMSET32
 #define __HAVE_ARCH_MEMSET64
+#define __HAVE_ARCH_MEMCPY_MCSAFE

+extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz);
 extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t);
 extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
 extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
--- a/arch/powerpc/include/asm/svm.h
+++ b/arch/powerpc/include/asm/svm.h
@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * SVM helper functions
+ *
+ * Copyright 2018 Anshuman Khandual, IBM Corporation.
+ */
+
+#ifndef _ASM_POWERPC_SVM_H
+#define _ASM_POWERPC_SVM_H
+
+#ifdef CONFIG_PPC_SVM
+
+static inline bool is_secure_guest(void)
+{
+	return mfmsr() & MSR_S;
+}
+
+void dtl_cache_ctor(void *addr);
+#define get_dtl_cache_ctor()	(is_secure_guest() ? dtl_cache_ctor : NULL)
+
+#else /* CONFIG_PPC_SVM */
+
+static inline bool is_secure_guest(void)
+{
+	return false;
+}
+
+#define get_dtl_cache_ctor() NULL
+
+#endif /* CONFIG_PPC_SVM */
+#endif /* _ASM_POWERPC_SVM_H */
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@ -41,11 +41,7 @@ struct div_result {

 /* Accessor functions for the timebase (RTC on 601) registers. */
 /* If one day CONFIG_POWER is added just define __USE_RTC as 1 */
-#ifdef CONFIG_PPC_BOOK3S_32
-#define __USE_RTC()	(cpu_has_feature(CPU_FTR_USE_RTC))
-#else
-#define __USE_RTC()	0
-#endif
+#define __USE_RTC()	(IS_ENABLED(CONFIG_PPC_BOOK3S_601))

 #ifdef CONFIG_PPC64

--- a/arch/powerpc/include/asm/timex.h
+++ b/arch/powerpc/include/asm/timex.h
@ -17,38 +17,10 @@ typedef unsigned long cycles_t;

 static inline cycles_t get_cycles(void)
 {
-#ifdef __powerpc64__
+	if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
+		return 0;	/* 601 uses the RTC, not the timebase */
+
 	return mftb();
-#else
-	cycles_t ret;
-
-	/*
-	 * For the "cycle" counter we use the timebase lower half.
-	 * Currently only used on SMP.
-	 */
-
-	ret = 0;
-
-	__asm__ __volatile__(
-#ifdef CONFIG_PPC_8xx
-		"97:	mftb %0\n"
-#else
-		"97:	mfspr %0, %2\n"
-#endif
-		"99:\n"
-		".section __ftr_fixup,\"a\"\n"
-		".align 2\n"
-		"98:\n"
-		"	.long %1\n"
-		"	.long 0\n"
-		"	.long 97b-98b\n"
-		"	.long 99b-98b\n"
-		"	.long 0\n"
-		"	.long 0\n"
-		".previous"
-		: "=r" (ret) : "i" (CPU_FTR_601), "i" (SPRN_TBRL));
-	return ret;
-#endif
 }

 #endif	/* __KERNEL__ */
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@ -387,6 +387,20 @@ static inline unsigned long raw_copy_to_user(void __user *to,
 	return ret;
 }

+static __always_inline unsigned long __must_check
+copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n)
+{
+	if (likely(check_copy_size(from, n, true))) {
+		if (access_ok(to, n)) {
+			allow_write_to_user(to, n);
+			n = memcpy_mcsafe((void *)to, from, n);
+			prevent_write_to_user(to, n);
+		}
+	}
+
+	return n;
+}
+
 extern unsigned long __clear_user(void __user *addr, unsigned long size);

 static inline unsigned long clear_user(void __user *addr, unsigned long size)
--- a/arch/powerpc/include/asm/ultravisor-api.h
+++ b/arch/powerpc/include/asm/ultravisor-api.h
@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Ultravisor API.
+ *
+ * Copyright 2019, IBM Corporation.
+ *
+ */
+#ifndef _ASM_POWERPC_ULTRAVISOR_API_H
+#define _ASM_POWERPC_ULTRAVISOR_API_H
+
+#include <asm/hvcall.h>
+
+/* Return codes */
+#define U_BUSY			H_BUSY
+#define U_FUNCTION		H_FUNCTION
+#define U_NOT_AVAILABLE		H_NOT_AVAILABLE
+#define U_P2			H_P2
+#define U_P3			H_P3
+#define U_P4			H_P4
+#define U_P5			H_P5
+#define U_PARAMETER		H_PARAMETER
+#define U_PERMISSION		H_PERMISSION
+#define U_SUCCESS		H_SUCCESS
+
+/* opcodes */
+#define UV_WRITE_PATE			0xF104
+#define UV_RETURN			0xF11C
+#define UV_ESM				0xF110
+#define UV_SHARE_PAGE			0xF130
+#define UV_UNSHARE_PAGE			0xF134
+#define UV_UNSHARE_ALL_PAGES		0xF140
+
+#endif /* _ASM_POWERPC_ULTRAVISOR_API_H */
--- a/arch/powerpc/include/asm/ultravisor.h
+++ b/arch/powerpc/include/asm/ultravisor.h
@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Ultravisor definitions
+ *
+ * Copyright 2019, IBM Corporation.
+ *
+ */
+#ifndef _ASM_POWERPC_ULTRAVISOR_H
+#define _ASM_POWERPC_ULTRAVISOR_H
+
+#include <asm/asm-prototypes.h>
+#include <asm/ultravisor-api.h>
+#include <asm/firmware.h>
+
+int early_init_dt_scan_ultravisor(unsigned long node, const char *uname,
+				  int depth, void *data);
+
+/*
+ * In ultravisor enabled systems, PTCR becomes ultravisor privileged only for
+ * writing and an attempt to write to it will cause a Hypervisor Emulation
+ * Assistance interrupt.
+ */
+static inline void set_ptcr_when_no_uv(u64 val)
+{
+	if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+		mtspr(SPRN_PTCR, val);
+}
+
+static inline int uv_register_pate(u64 lpid, u64 dw0, u64 dw1)
+{
+	return ucall_norets(UV_WRITE_PATE, lpid, dw0, dw1);
+}
+
+static inline int uv_share_page(u64 pfn, u64 npages)
+{
+	return ucall_norets(UV_SHARE_PAGE, pfn, npages);
+}
+
+static inline int uv_unshare_page(u64 pfn, u64 npages)
+{
+	return ucall_norets(UV_UNSHARE_PAGE, pfn, npages);
+}
+
+static inline int uv_unshare_all_pages(void)
+{
+	return ucall_norets(UV_UNSHARE_ALL_PAGES);
+}
+
+#endif	/* _ASM_POWERPC_ULTRAVISOR_H */
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@ -99,6 +99,7 @@ extern void xive_flush_interrupt(void);

 /* xmon hook */
 extern void xmon_xive_do_dump(int cpu);
+extern int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d);

 /* APIs used by KVM */
 extern u32 xive_native_default_eq_shift(void);
--- a/arch/powerpc/kernel/.gitignore
+++ b/arch/powerpc/kernel/.gitignore
@ -1 +1,2 @@
+prom_init_check
 vmlinux.lds
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@ -52,7 +52,7 @@ obj-y				:= cputable.o ptrace.o syscalls.o \
 				   of_platform.o prom_parse.o
 obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
 				   signal_64.o ptrace32.o \
-				   paca.o nvram_64.o firmware.o
+				   paca.o nvram_64.o firmware.o note.o
 obj-$(CONFIG_VDSO32)		+= vdso32/
 obj-$(CONFIG_PPC_WATCHDOG)	+= watchdog.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
@ -78,7 +78,9 @@ obj-$(CONFIG_EEH)              += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
 				  eeh_driver.o eeh_event.o eeh_sysfs.o
 obj-$(CONFIG_GENERIC_TBSYNC)	+= smp-tbsync.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
-obj-$(CONFIG_FA_DUMP)		+= fadump.o
+ifneq ($(CONFIG_FA_DUMP)$(CONFIG_PRESERVE_FA_DUMP),)
+obj-y				+= fadump.o
+endif
 ifdef CONFIG_PPC32
 obj-$(CONFIG_E500)		+= idle_e500.o
 endif
@ -155,6 +157,9 @@ endif

 obj-$(CONFIG_EPAPR_PARAVIRT)	+= epapr_paravirt.o epapr_hcalls.o
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvm_emul.o
+ifneq ($(CONFIG_PPC_POWERNV)$(CONFIG_PPC_SVM),)
+obj-y				+= ucall.o
+endif

 # Disable GCOV, KCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
@ -184,15 +189,13 @@ extra-$(CONFIG_ALTIVEC)		+= vector.o
 extra-$(CONFIG_PPC64)		+= entry_64.o
 extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE)	+= prom_init.o

-ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE
-$(obj)/built-in.a:		prom_init_check
+extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE)	+= prom_init_check

-quiet_cmd_prom_init_check = CALL    $<
-      cmd_prom_init_check = $(CONFIG_SHELL) $< "$(NM)" "$(obj)/prom_init.o"
+quiet_cmd_prom_init_check = PROMCHK $@
+      cmd_prom_init_check = $(CONFIG_SHELL) $< "$(NM)" $(obj)/prom_init.o; touch $@

-PHONY += prom_init_check
-prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o
-	$(call cmd,prom_init_check)
-endif
+$(obj)/prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o FORCE
+	$(call if_changed,prom_init_check)
+targets += prom_init_check

 clean-files := vmlinux.lds
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@ -506,6 +506,7 @@ int main(void)
 	OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v);
 	OFFSET(KVM_RADIX, kvm, arch.radix);
 	OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled);
+	OFFSET(KVM_SECURE_GUEST, kvm, arch.secure_guest);
 	OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr);
 	OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar);
 	OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@ -569,7 +569,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 #endif	/* CONFIG_PPC_BOOK3S_64 */

 #ifdef CONFIG_PPC32
-#ifdef CONFIG_PPC_BOOK3S_32
+#ifdef CONFIG_PPC_BOOK3S_601
 	{	/* 601 */
 		.pvr_mask		= 0xffff0000,
 		.pvr_value		= 0x00010000,
@ -583,6 +583,8 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.machine_check		= machine_check_generic,
 		.platform		= "ppc601",
 	},
+#endif /* CONFIG_PPC_BOOK3S_601 */
+#ifdef CONFIG_PPC_BOOK3S_6xx
 	{	/* 603 */
 		.pvr_mask		= 0xffff0000,
 		.pvr_value		= 0x00030000,
@ -1212,7 +1214,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.machine_check		= machine_check_generic,
 		.platform		= "ppc603",
 	},
-#endif /* CONFIG_PPC_BOOK3S_32 */
+#endif /* CONFIG_PPC_BOOK3S_6xx */
 #ifdef CONFIG_PPC_8xx
 	{	/* 8xx */
 		.pvr_mask		= 0xffff0000,
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@ -122,18 +122,17 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask)
 {
 	struct iommu_table *tbl = get_iommu_table_base(dev);

-	if (!tbl) {
-		dev_info(dev, "Warning: IOMMU dma not supported: mask 0x%08llx"
-			", table unavailable\n", mask);
-		return 0;
-	}
-
 	if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) {
 		dev->archdata.iommu_bypass = true;
 		dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
 		return 1;
 	}

+	if (!tbl) {
+		dev_err(dev, "Warning: IOMMU dma not supported: mask 0x%08llx, table unavailable\n", mask);
+		return 0;
+	}
+
 	if (tbl->it_offset > (mask >> tbl->it_page_shift)) {
 		dev_info(dev, "Warning: IOMMU offset too big for device mask\n");
 		dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n",
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@ -150,6 +150,16 @@ static int __init eeh_setup(char *str)
 }
 __setup("eeh=", eeh_setup);

+void eeh_show_enabled(void)
+{
+	if (eeh_has_flag(EEH_FORCE_DISABLED))
+		pr_info("EEH: Recovery disabled by kernel parameter.\n");
+	else if (eeh_has_flag(EEH_ENABLED))
+		pr_info("EEH: Capable adapter found: recovery enabled.\n");
+	else
+		pr_info("EEH: No capable adapters found: recovery disabled.\n");
+}
+
 /*
 * This routine captures assorted PCI configuration space data
 * for the indicated PCI device, and puts them into a buffer
@ -410,11 +420,9 @@ static int eeh_phb_check_failure(struct eeh_pe *pe)
 	eeh_pe_mark_isolated(phb_pe);
 	eeh_serialize_unlock(flags);

-	pr_err("EEH: PHB#%x failure detected, location: %s\n",
+	pr_debug("EEH: PHB#%x failure detected, location: %s\n",
 		phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe));
-	dump_stack();
 	eeh_send_failure_event(phb_pe);
-
 	return 1;
 out:
 	eeh_serialize_unlock(flags);
@ -441,7 +449,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	unsigned long flags;
 	struct device_node *dn;
 	struct pci_dev *dev;
-	struct eeh_pe *pe, *parent_pe, *phb_pe;
+	struct eeh_pe *pe, *parent_pe;
 	int rc = 0;
 	const char *location = NULL;

@ -460,8 +468,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	/* Access to IO BARs might get this far and still not want checking. */
 	if (!pe) {
 		eeh_stats.ignored_check++;
-		pr_debug("EEH: Ignored check for %s\n",
-			eeh_pci_name(dev));
+		eeh_edev_dbg(edev, "Ignored check\n");
 		return 0;
 	}

@ -501,12 +508,11 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 			if (dn)
 				location = of_get_property(dn, "ibm,loc-code",
 						NULL);
-			printk(KERN_ERR "EEH: %d reads ignored for recovering device at "
-				"location=%s driver=%s pci addr=%s\n",
+			eeh_edev_err(edev, "%d reads ignored for recovering device at location=%s driver=%s\n",
 				pe->check_count,
 				location ? location : "unknown",
-				eeh_driver_name(dev), eeh_pci_name(dev));
-			printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n",
+				eeh_driver_name(dev));
+			eeh_edev_err(edev, "Might be infinite loop in %s driver\n",
 				eeh_driver_name(dev));
 			dump_stack();
 		}
@ -573,13 +579,8 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	 * a stack trace will help the device-driver authors figure
 	 * out what happened.  So print that out.
 	 */
-	phb_pe = eeh_phb_pe_get(pe->phb);
-	pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
-	       pe->phb->global_number, pe->addr);
-	pr_err("EEH: PE location: %s, PHB location: %s\n",
-	       eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
-	dump_stack();
-
+	pr_debug("EEH: %s: Frozen PHB#%x-PE#%x detected\n",
+		__func__, pe->phb->global_number, pe->addr);
 	eeh_send_failure_event(pe);

 	return 1;
@ -697,7 +698,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function)
 	return rc;
 }

-static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev,
+static void eeh_disable_and_save_dev_state(struct eeh_dev *edev,
 					    void *userdata)
 {
 	struct pci_dev *pdev = eeh_dev_to_pci_dev(edev);
@ -708,7 +709,7 @@ static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev,
 	 * state for the specified device
 	 */
 	if (!pdev || pdev == dev)
-		return NULL;
+		return;

 	/* Ensure we have D0 power state */
 	pci_set_power_state(pdev, PCI_D0);
@ -721,18 +722,16 @@ static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev,
 	 * interrupt from the device
 	 */
 	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
-
-	return NULL;
 }

-static void *eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
+static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
 {
 	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
 	struct pci_dev *pdev = eeh_dev_to_pci_dev(edev);
 	struct pci_dev *dev = userdata;

 	if (!pdev)
-		return NULL;
+		return;

 	/* Apply customization from firmware */
 	if (pdn && eeh_ops->restore_config)
@ -741,8 +740,6 @@ static void *eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
 	/* The caller should restore state for the specified device */
 	if (pdev != dev)
 		pci_restore_state(pdev);
-
-	return NULL;
 }

 int eeh_restore_vf_config(struct pci_dn *pdn)
@ -868,7 +865,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
 * the indicated device and its children so that the bunch of the
 * devices could be reset properly.
 */
-static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
+static void eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
 {
 	struct pci_dev *dev;
 	unsigned int *freset = (unsigned int *)flag;
@ -876,8 +873,6 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
 	dev = eeh_dev_to_pci_dev(edev);
 	if (dev)
 		*freset |= dev->needs_freset;
-
-	return NULL;
 }

 static void eeh_pe_refreeze_passed(struct eeh_pe *root)
@ -1063,23 +1058,6 @@ static struct notifier_block eeh_reboot_nb = {
 	.notifier_call = eeh_reboot_notifier,
 };

-void eeh_probe_devices(void)
-{
-	struct pci_controller *hose, *tmp;
-	struct pci_dn *pdn;
-
-	/* Enable EEH for all adapters */
-	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
-		pdn = hose->pci_data;
-		traverse_pci_dn(pdn, eeh_ops->probe, NULL);
-	}
-	if (eeh_enabled())
-		pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
-	else
-		pr_info("EEH: No capable adapters found\n");
-
-}
-
 /**
 * eeh_init - EEH initialization
 *
@ -1120,6 +1098,8 @@ static int eeh_init(void)
 	list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
 		eeh_dev_phb_init_dynamic(hose);

+	eeh_addr_cache_init();
+
 	/* Initialize EEH event */
 	return eeh_event_init();
 }
@ -1190,15 +1170,14 @@ void eeh_add_device_late(struct pci_dev *dev)
 	struct pci_dn *pdn;
 	struct eeh_dev *edev;

-	if (!dev || !eeh_enabled())
+	if (!dev)
 		return;

-	pr_debug("EEH: Adding device %s\n", pci_name(dev));
-
 	pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
 	edev = pdn_to_eeh_dev(pdn);
+	eeh_edev_dbg(edev, "Adding device\n");
 	if (edev->pdev == dev) {
-		pr_debug("EEH: Already referenced !\n");
+		eeh_edev_dbg(edev, "Device already referenced!\n");
 		return;
 	}

@ -1246,6 +1225,8 @@ void eeh_add_device_tree_late(struct pci_bus *bus)
 {
 	struct pci_dev *dev;

+	if (eeh_has_flag(EEH_FORCE_DISABLED))
+		return;
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		eeh_add_device_late(dev);
 		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
@ -1299,10 +1280,10 @@ void eeh_remove_device(struct pci_dev *dev)
 	edev = pci_dev_to_eeh_dev(dev);

 	/* Unregister the device with the EEH/PCI address search system */
-	pr_debug("EEH: Removing device %s\n", pci_name(dev));
+	dev_dbg(&dev->dev, "EEH: Removing device\n");

 	if (!edev || !edev->pdev || !edev->pe) {
-		pr_debug("EEH: Not referenced !\n");
+		dev_dbg(&dev->dev, "EEH: Device not referenced!\n");
 		return;
 	}

@ -1890,6 +1871,198 @@ static const struct file_operations eeh_force_recover_fops = {
 	.llseek	= no_llseek,
 	.write	= eeh_force_recover_write,
 };
+
+static ssize_t eeh_debugfs_dev_usage(struct file *filp,
+				char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	static const char usage[] = "input format: <domain>:<bus>:<dev>.<fn>\n";
+
+	return simple_read_from_buffer(user_buf, count, ppos,
+				       usage, sizeof(usage) - 1);
+}
+
+static ssize_t eeh_dev_check_write(struct file *filp,
+				const char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	uint32_t domain, bus, dev, fn;
+	struct pci_dev *pdev;
+	struct eeh_dev *edev;
+	char buf[20];
+	int ret;
+
+	memset(buf, 0, sizeof(buf));
+	ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
+	if (ret <= 0)	/* negative errno, or nothing was copied */
+		return ret ? ret : -EFAULT;
+
+	ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn);
+	if (ret != 4) {
+		pr_err("%s: expected 4 args, got %d\n", __func__, ret);
+		return -EINVAL;
+	}
+
+	pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn);
+	if (!pdev)
+		return -ENODEV;
+
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev) {
+		pci_err(pdev, "No eeh_dev for this device!\n");
+		pci_dev_put(pdev);
+		return -ENODEV;
+	}
+
+	ret = eeh_dev_check_failure(edev);
+	pci_info(pdev, "eeh_dev_check_failure(%04x:%02x:%02x.%01x) = %d\n",
+			domain, bus, dev, fn, ret);
+
+	pci_dev_put(pdev);
+
+	return count;
+}
+
+static const struct file_operations eeh_dev_check_fops = {
+	.open	= simple_open,
+	.llseek	= no_llseek,
+	.write	= eeh_dev_check_write,
+	.read   = eeh_debugfs_dev_usage,
+};
+
+static int eeh_debugfs_break_device(struct pci_dev *pdev)
+{
+	struct resource *bar = NULL;
+	void __iomem *mapped;
+	u16 old, bit;
+	int i, pos;
+
+	/* Do we have an MMIO BAR to disable? */
+	for (i = 0; i <= PCI_STD_RESOURCE_END; i++) {
+		struct resource *r = &pdev->resource[i];
+
+		if (!r->flags || !r->start)
+			continue;
+		if (r->flags & IORESOURCE_IO)
+			continue;
+		if (r->flags & IORESOURCE_UNSET)
+			continue;
+
+		bar = r;
+		break;
+	}
+
+	if (!bar) {
+		pci_err(pdev, "Unable to find Memory BAR to cause EEH with\n");
+		return -ENXIO;
+	}
+
+	pci_err(pdev, "Going to break: %pR\n", bar);
+
+	if (pdev->is_virtfn) {
+#ifndef CONFIG_PCI_IOV
+		return -ENXIO;
+#else
+		/*
+		 * VFs don't have a per-function COMMAND register, so the best
+		 * we can do is clear the Memory Space Enable bit in the PF's
+		 * SRIOV control reg.
+		 *
+		 * Unfortunately, this requires that we have a PF (i.e doesn't
+		 * work for a passed-through VF) and it has the potential side
+		 * effect of also causing an EEH on every other VF under the
+		 * PF. Oh well.
+		 */
+		pdev = pdev->physfn;
+		if (!pdev)
+			return -ENXIO; /* passed through VFs have no PF */
+
+		pos  = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
+		pos += PCI_SRIOV_CTRL;
+		bit  = PCI_SRIOV_CTRL_MSE;
+#endif /* !CONFIG_PCI_IOV */
+	} else {
+		bit = PCI_COMMAND_MEMORY;
+		pos = PCI_COMMAND;
+	}
+
+	/*
+	 * Process here is:
+	 *
+	 * 1. Disable Memory space.
+	 *
+	 * 2. Perform an MMIO to the device. This should result in an error
+	 *    (CA / UR) being raised by the device which results in an EEH
+	 *    PE freeze. Using the in_8() accessor skips the EEH detection
+	 *    hook, so the EEH detection machinery won't be
+	 *    triggered here. This is to match the usual behaviour of EEH
+	 *    where the HW will asynchronously freeze a PE and it's up to
+	 *    the kernel to notice and deal with it.
+	 *
+	 * 3. Turn Memory space back on. This is more important for VFs
+	 *    since recovery will probably fail if we don't. For normal
+	 *    devices the COMMAND register is reset as a part of
+	 *    re-initialising the device.
+	 *
+	 * Breaking stuff is the point so who cares if it's racy ;)
+	 */
+	pci_read_config_word(pdev, pos, &old);
+
+	mapped = ioremap(bar->start, PAGE_SIZE);
+	if (!mapped) {
+		pci_err(pdev, "Unable to map MMIO BAR %pR\n", bar);
+		return -ENXIO;
+	}
+
+	pci_write_config_word(pdev, pos, old & ~bit);
+	in_8(mapped);
+	pci_write_config_word(pdev, pos, old);
+
+	iounmap(mapped);
+
+	return 0;
+}
+
+static ssize_t eeh_dev_break_write(struct file *filp,
+				const char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	uint32_t domain, bus, dev, fn;
+	struct pci_dev *pdev;
+	char buf[20];
+	int ret;
+
+	memset(buf, 0, sizeof(buf));
+	ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
+	if (ret <= 0)	/* negative errno, or nothing was copied */
+		return ret ? ret : -EFAULT;
+
+	ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn);
+	if (ret != 4) {
+		pr_err("%s: expected 4 args, got %d\n", __func__, ret);
+		return -EINVAL;
+	}
+
+	pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn);
+	if (!pdev)
+		return -ENODEV;
+
+	ret = eeh_debugfs_break_device(pdev);
+	pci_dev_put(pdev);
+
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+static const struct file_operations eeh_dev_break_fops = {
+	.open	= simple_open,
+	.llseek	= no_llseek,
+	.write	= eeh_dev_break_write,
+	.read   = eeh_debugfs_dev_usage,
+};
+
 #endif

 static int __init eeh_init_proc(void)
@ -1905,6 +2078,12 @@ static int __init eeh_init_proc(void)
 		debugfs_create_bool("eeh_disable_recovery", 0600,
 				powerpc_debugfs_root,
 				&eeh_debugfs_no_recover);
+		debugfs_create_file_unsafe("eeh_dev_check", 0600,
+				powerpc_debugfs_root, NULL,
+				&eeh_dev_check_fops);
+		debugfs_create_file_unsafe("eeh_dev_break", 0600,
+				powerpc_debugfs_root, NULL,
+				&eeh_dev_break_fops);
 		debugfs_create_file_unsafe("eeh_force_recover", 0600,
 				powerpc_debugfs_root, NULL,
 				&eeh_force_recover_fops);
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@ -148,8 +148,8 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo,
 	piar->pcidev = dev;
 	piar->flags = flags;

-	pr_debug("PIAR: insert range=[%pap:%pap] dev=%s\n",
-		 &alo, &ahi, pci_name(dev));
+	eeh_edev_dbg(piar->edev, "PIAR: insert range=[%pap:%pap]\n",
+		 &alo, &ahi);

 	rb_link_node(&piar->rb_node, parent, p);
 	rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
@ -229,8 +229,8 @@ restart:
 		piar = rb_entry(n, struct pci_io_addr_range, rb_node);

 		if (piar->pcidev == dev) {
-			pr_debug("PIAR: remove range=[%pap:%pap] dev=%s\n",
-				 &piar->addr_lo, &piar->addr_hi, pci_name(dev));
+			eeh_edev_dbg(piar->edev, "PIAR: remove range=[%pap:%pap]\n",
+				 &piar->addr_lo, &piar->addr_hi);
 			rb_erase(n, &pci_io_addr_cache_root.rb_root);
 			kfree(piar);
 			goto restart;
@ -258,37 +258,14 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
 }

 /**
- * eeh_addr_cache_build - Build a cache of I/O addresses
+ * eeh_addr_cache_init - Initialize a cache of I/O addresses
 *
- * Build a cache of pci i/o addresses.  This cache will be used to
+ * Initialize a cache of pci i/o addresses.  This cache will be used to
 * find the pci device that corresponds to a given address.
- * This routine scans all pci busses to build the cache.
- * Must be run late in boot process, after the pci controllers
- * have been scanned for devices (after all device resources are known).
 */
-void eeh_addr_cache_build(void)
+void eeh_addr_cache_init(void)
 {
-	struct pci_dn *pdn;
-	struct eeh_dev *edev;
-	struct pci_dev *dev = NULL;
-
 	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
-
-	for_each_pci_dev(dev) {
-		pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
-		if (!pdn)
-			continue;
-
-		edev = pdn_to_eeh_dev(pdn);
-		if (!edev)
-			continue;
-
-		dev->dev.archdata.edev = edev;
-		edev->pdev = dev;
-
-		eeh_addr_cache_insert_dev(dev);
-		eeh_sysfs_add_device(dev);
-	}
 }

 static int eeh_addr_cache_show(struct seq_file *s, void *v)
--- a/arch/powerpc/kernel/eeh_dev.c
+++ b/arch/powerpc/kernel/eeh_dev.c
@ -47,6 +47,8 @@ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn)
 	/* Associate EEH device with OF node */
 	pdn->edev = edev;
 	edev->pdn = pdn;
+	edev->bdfn = (pdn->busno << 8) | pdn->devfn;
+	edev->controller = pdn->phb;

 	return edev;
 }
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@ -27,6 +27,7 @@
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/pci_hotplug.h>
 #include <asm/eeh.h>
 #include <asm/eeh_event.h>
 #include <asm/ppc-pci.h>
@ -81,23 +82,6 @@ static const char *pci_ers_result_name(enum pci_ers_result result)
 	}
 };

-static __printf(2, 3) void eeh_edev_info(const struct eeh_dev *edev,
-					 const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-
-	va_start(args, fmt);
-
-	vaf.fmt = fmt;
-	vaf.va = &args;
-
-	printk(KERN_INFO "EEH: PE#%x (PCI %s): %pV\n", edev->pe_config_addr,
-	       edev->pdev ? dev_name(&edev->pdev->dev) : "none", &vaf);
-
-	va_end(args);
-}
-
 static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old,
 						enum pci_ers_result new)
 {
@ -113,8 +97,16 @@ static bool eeh_dev_removed(struct eeh_dev *edev)

 static bool eeh_edev_actionable(struct eeh_dev *edev)
 {
-	return (edev->pdev && !eeh_dev_removed(edev) &&
-		!eeh_pe_passed(edev->pe));
+	if (!edev->pdev)
+		return false;
+	if (edev->pdev->error_state == pci_channel_io_perm_failure)
+		return false;
+	if (eeh_dev_removed(edev))
+		return false;
+	if (eeh_pe_passed(edev->pe))
+		return false;
+
+	return true;
 }

 /**
@ -214,12 +206,12 @@ static void eeh_enable_irq(struct eeh_dev *edev)
 	}
 }

-static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
+static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
 {
 	struct pci_dev *pdev;

 	if (!edev)
-		return NULL;
+		return;

 	/*
 	 * We cannot access the config space on some adapters.
@ -229,14 +221,13 @@ static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
 	 * device is created.
 	 */
 	if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED))
-		return NULL;
+		return;

 	pdev = eeh_dev_to_pci_dev(edev);
 	if (!pdev)
-		return NULL;
+		return;

 	pci_save_state(pdev);
-	return NULL;
 }

 static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s)
@ -274,20 +265,27 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool enable)
 }

 typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *,
+					     struct pci_dev *,
 					     struct pci_driver *);
 static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
 			       enum pci_ers_result *result)
 {
+	struct pci_dev *pdev;
 	struct pci_driver *driver;
 	enum pci_ers_result new_result;

-	if (!edev->pdev) {
+	pci_lock_rescan_remove();
+	pdev = edev->pdev;
+	if (pdev)
+		get_device(&pdev->dev);
+	pci_unlock_rescan_remove();
+	if (!pdev) {
 		eeh_edev_info(edev, "no device");
 		return;
 	}
-	device_lock(&edev->pdev->dev);
+	device_lock(&pdev->dev);
 	if (eeh_edev_actionable(edev)) {
-		driver = eeh_pcid_get(edev->pdev);
+		driver = eeh_pcid_get(pdev);

 		if (!driver)
 			eeh_edev_info(edev, "no driver");
@ -296,7 +294,7 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
 		else if (edev->mode & EEH_DEV_NO_HANDLER)
 			eeh_edev_info(edev, "driver bound too late");
 		else {
-			new_result = fn(edev, driver);
+			new_result = fn(edev, pdev, driver);
 			eeh_edev_info(edev, "%s driver reports: '%s'",
 				      driver->name,
 				      pci_ers_result_name(new_result));
@ -305,12 +303,15 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
 							       new_result);
 		}
 		if (driver)
-			eeh_pcid_put(edev->pdev);
+			eeh_pcid_put(pdev);
 	} else {
-		eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!edev->pdev,
+		eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!pdev,
 			      !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe));
 	}
-	device_unlock(&edev->pdev->dev);
+	device_unlock(&pdev->dev);
+	if (edev->pdev != pdev)
+		eeh_edev_warn(edev, "Device changed during processing!\n");
+	put_device(&pdev->dev);
 }

 static void eeh_pe_report(const char *name, struct eeh_pe *root,
@ -337,20 +338,20 @@ static void eeh_pe_report(const char *name, struct eeh_pe *root,
 * Report an EEH error to each device driver.
 */
 static enum pci_ers_result eeh_report_error(struct eeh_dev *edev,
+					    struct pci_dev *pdev,
 					    struct pci_driver *driver)
 {
 	enum pci_ers_result rc;
-	struct pci_dev *dev = edev->pdev;

 	if (!driver->err_handler->error_detected)
 		return PCI_ERS_RESULT_NONE;

 	eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)",
 		      driver->name);
-	rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
+	rc = driver->err_handler->error_detected(pdev, pci_channel_io_frozen);

 	edev->in_error = true;
-	pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
+	pci_uevent_ers(pdev, PCI_ERS_RESULT_NONE);
 	return rc;
 }

@ -363,12 +364,13 @@ static enum pci_ers_result eeh_report_error(struct eeh_dev *edev,
 * are now enabled.
 */
 static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev,
+						   struct pci_dev *pdev,
 						   struct pci_driver *driver)
 {
 	if (!driver->err_handler->mmio_enabled)
 		return PCI_ERS_RESULT_NONE;
 	eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name);
-	return driver->err_handler->mmio_enabled(edev->pdev);
+	return driver->err_handler->mmio_enabled(pdev);
 }

 /**
@ -382,20 +384,21 @@ static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev,
 * driver can work again while the device is recovered.
 */
 static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev,
+					    struct pci_dev *pdev,
 					    struct pci_driver *driver)
 {
 	if (!driver->err_handler->slot_reset || !edev->in_error)
 		return PCI_ERS_RESULT_NONE;
 	eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name);
-	return driver->err_handler->slot_reset(edev->pdev);
+	return driver->err_handler->slot_reset(pdev);
 }

-static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
+static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
 {
 	struct pci_dev *pdev;

 	if (!edev)
-		return NULL;
+		return;

 	/*
 	 * The content in the config space isn't saved because
@ -407,15 +410,14 @@ static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
 		if (list_is_last(&edev->entry, &edev->pe->edevs))
 			eeh_pe_restore_bars(edev->pe);

-		return NULL;
+		return;
 	}

 	pdev = eeh_dev_to_pci_dev(edev);
 	if (!pdev)
-		return NULL;
+		return;

 	pci_restore_state(pdev);
-	return NULL;
 }

 /**
@ -428,13 +430,14 @@ static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
 * to make the recovered device work again.
 */
 static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev,
+					     struct pci_dev *pdev,
 					     struct pci_driver *driver)
 {
 	if (!driver->err_handler->resume || !edev->in_error)
 		return PCI_ERS_RESULT_NONE;

 	eeh_edev_info(edev, "Invoking %s->resume()", driver->name);
-	driver->err_handler->resume(edev->pdev);
+	driver->err_handler->resume(pdev);

 	pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED);
 #ifdef CONFIG_PCI_IOV
@ -453,6 +456,7 @@ static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev,
 * dead, and that no further recovery attempts will be made on it.
 */
 static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev,
+					      struct pci_dev *pdev,
 					      struct pci_driver *driver)
 {
 	enum pci_ers_result rc;
@ -462,10 +466,10 @@ static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev,

 	eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)",
 		      driver->name);
-	rc = driver->err_handler->error_detected(edev->pdev,
+	rc = driver->err_handler->error_detected(pdev,
 						 pci_channel_io_perm_failure);

-	pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_DISCONNECT);
+	pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT);
 	return rc;
 }

@ -473,12 +477,9 @@ static void *eeh_add_virt_device(struct eeh_dev *edev)
 {
 	struct pci_driver *driver;
 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
-	struct pci_dn *pdn = eeh_dev_to_pdn(edev);

 	if (!(edev->physfn)) {
-		pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n",
-			__func__, pdn->phb->global_number, pdn->busno,
-			PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+		eeh_edev_warn(edev, "Not for VF\n");
 		return NULL;
 	}

@ -492,12 +493,12 @@ static void *eeh_add_virt_device(struct eeh_dev *edev)
 	}

 #ifdef CONFIG_PCI_IOV
-	pci_iov_add_virtfn(edev->physfn, pdn->vf_index);
+	pci_iov_add_virtfn(edev->physfn, eeh_dev_to_pdn(edev)->vf_index);
 #endif
 	return NULL;
 }

-static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
+static void eeh_rmv_device(struct eeh_dev *edev, void *userdata)
 {
 	struct pci_driver *driver;
 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
@ -512,7 +513,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
 	 */
 	if (!eeh_edev_actionable(edev) ||
 	    (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
-		return NULL;
+		return;

 	if (rmv_data) {
 		driver = eeh_pcid_get(dev);
@ -521,7 +522,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
 			    driver->err_handler->error_detected &&
 			    driver->err_handler->slot_reset) {
 				eeh_pcid_put(dev);
-				return NULL;
+				return;
 			}
 			eeh_pcid_put(dev);
 		}
@ -554,8 +555,6 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
 		pci_stop_and_remove_bus_device(dev);
 		pci_unlock_rescan_remove();
 	}
-
-	return NULL;
 }

 static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata)
@ -744,6 +743,99 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
 */
 #define MAX_WAIT_FOR_RECOVERY 300

+
+/* Walks the PE tree after processing an event to remove any stale PEs.
+ *
+ * NB: This needs to be recursive to ensure the leaf PEs get removed
+ * before their parents do. Although this is possible to do iteratively
+ * we don't since this is easier to read and we need to guarantee
+ * the leaf nodes will be handled first.
+ */
+static void eeh_pe_cleanup(struct eeh_pe *pe)
+{
+	struct eeh_pe *child_pe, *tmp;
+
+	list_for_each_entry_safe(child_pe, tmp, &pe->child_list, child)
+		eeh_pe_cleanup(child_pe);
+
+	if (pe->state & EEH_PE_KEEP)
+		return;
+
+	if (!(pe->state & EEH_PE_INVALID))
+		return;
+
+	if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) {
+		list_del(&pe->child);
+		kfree(pe);
+	}
+}
+
+/**
+ * eeh_slot_presence_check - Check if a device is still present in a slot
+ * @pdev: pci_dev to check
+ *
+ * This function may return a false positive if we can't determine the slot's
+ * presence state. This might happen for PCIe slots if the PE containing
+ * the upstream bridge is also frozen, or the bridge is part of the same PE
+ * as the device.
+ *
+ * This shouldn't happen often, but you might see it if you hotplug a PCIe
+ * switch.
+ */
+static bool eeh_slot_presence_check(struct pci_dev *pdev)
+{
+	const struct hotplug_slot_ops *ops;
+	struct pci_slot *slot;
+	u8 state;
+	int rc;
+
+	if (!pdev)
+		return false;
+
+	if (pdev->error_state == pci_channel_io_perm_failure)
+		return false;
+
+	slot = pdev->slot;
+	if (!slot || !slot->hotplug)
+		return true;
+
+	ops = slot->hotplug->ops;
+	if (!ops || !ops->get_adapter_status)
+		return true;
+
+	/* set the attention indicator while we've got the slot ops */
+	if (ops->set_attention_status)
+		ops->set_attention_status(slot->hotplug, 1);
+
+	rc = ops->get_adapter_status(slot->hotplug, &state);
+	if (rc)
+		return true;
+
+	return !!state;
+}
+
+static void eeh_clear_slot_attention(struct pci_dev *pdev)
+{
+	const struct hotplug_slot_ops *ops;
+	struct pci_slot *slot;
+
+	if (!pdev)
+		return;
+
+	if (pdev->error_state == pci_channel_io_perm_failure)
+		return;
+
+	slot = pdev->slot;
+	if (!slot || !slot->hotplug)
+		return;
+
+	ops = slot->hotplug->ops;
+	if (!ops || !ops->set_attention_status)
+		return;
+
+	ops->set_attention_status(slot->hotplug, 0);
+}
+
 /**
 * eeh_handle_normal_event - Handle EEH events on a specific PE
 * @pe: EEH PE - which should not be used after we return, as it may
@ -774,6 +866,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
 	struct eeh_rmv_data rmv_data =
 		{LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
+	int devices = 0;

 	bus = eeh_pe_bus_get(pe);
 	if (!bus) {
@ -782,7 +875,59 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 		return;
 	}

-	eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+	/*
+	 * When devices are hot-removed we might get an EEH due to
+	 * a driver attempting to touch the MMIO space of a removed
+	 * device. In this case we don't have a device to recover
+	 * so suppress the event if we can't find any present devices.
+	 *
+	 * The hotplug driver should take care of tearing down the
+	 * device itself.
+	 */
+	eeh_for_each_pe(pe, tmp_pe)
+		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+			if (eeh_slot_presence_check(edev->pdev))
+				devices++;
+
+	if (!devices) {
+		pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n",
+			pe->phb->global_number, pe->addr);
+		goto out; /* nothing to recover */
+	}
+
+	/* Log the event */
+	if (pe->type & EEH_PE_PHB) {
+		pr_err("EEH: PHB#%x failure detected, location: %s\n",
+			pe->phb->global_number, eeh_pe_loc_get(pe));
+	} else {
+		struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb);
+
+		pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
+		       pe->phb->global_number, pe->addr);
+		pr_err("EEH: PE location: %s, PHB location: %s\n",
+		       eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
+	}
+
+#ifdef CONFIG_STACKTRACE
+	/*
+	 * Print the saved stack trace now that we've verified there's
+	 * something to recover.
+	 */
+	if (pe->trace_entries) {
+		void **ptrs = (void **) pe->stack_trace;
+		int i;
+
+		pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
+		       pe->phb->global_number, pe->addr);
+
+		/* FIXME: Use the same format as dump_stack() */
+		pr_err("EEH: Call Trace:\n");
+		for (i = 0; i < pe->trace_entries; i++)
+			pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]);
+
+		pe->trace_entries = 0;
+	}
+#endif /* CONFIG_STACKTRACE */

 	eeh_pe_update_time_stamp(pe);
 	pe->freeze_count++;
@ -793,6 +938,10 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 		result = PCI_ERS_RESULT_DISCONNECT;
 	}

+	eeh_for_each_pe(pe, tmp_pe)
+		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+			edev->mode &= ~EEH_DEV_NO_HANDLER;
+
 	/* Walk the various device drivers attached to this slot through
 	 * a reset sequence, giving each an opportunity to do what it needs
 	 * to accomplish the reset.  Each child gets a report of the
@ -969,6 +1118,19 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 			return;
 		}
 	}
+
+out:
+	/*
+	 * Clean up any PEs without devices. While marked as EEH_PE_RECOVERING
+	 * we don't want to modify the PE tree structure so we do it here.
+	 */
+	eeh_pe_cleanup(pe);
+
+	/* clear the slot attention LED for all recovered devices */
+	eeh_for_each_pe(pe, tmp_pe)
+		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+			eeh_clear_slot_attention(edev->pdev);
+
 	eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
 }

@ -981,7 +1143,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 */
 void eeh_handle_special_event(void)
 {
-	struct eeh_pe *pe, *phb_pe;
+	struct eeh_pe *pe, *phb_pe, *tmp_pe;
+	struct eeh_dev *edev, *tmp_edev;
 	struct pci_bus *bus;
 	struct pci_controller *hose;
 	unsigned long flags;
@ -1040,6 +1203,7 @@ void eeh_handle_special_event(void)
 		 */
 		if (rc == EEH_NEXT_ERR_FROZEN_PE ||
 		    rc == EEH_NEXT_ERR_FENCED_PHB) {
+			eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
 			eeh_handle_normal_event(pe);
 		} else {
 			pci_lock_rescan_remove();
@ -1050,6 +1214,10 @@ void eeh_handle_special_event(void)
 				    (phb_pe->state & EEH_PE_RECOVERING))
 					continue;

+				eeh_for_each_pe(pe, tmp_pe)
+					eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
+						edev->mode &= ~EEH_DEV_NO_HANDLER;
+
 				/* Notify all devices to be down */
 				eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
 				eeh_set_channel_state(pe, pci_channel_io_perm_failure);
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@ -40,7 +40,6 @@ static int eeh_event_handler(void * dummy)
 {
 	unsigned long flags;
 	struct eeh_event *event;
-	struct eeh_pe *pe;

 	while (!kthread_should_stop()) {
 		if (wait_for_completion_interruptible(&eeh_eventlist_event))
@ -59,19 +58,10 @@ static int eeh_event_handler(void * dummy)
 			continue;

 		/* We might have event without binding PE */
-		pe = event->pe;
-		if (pe) {
-			if (pe->type & EEH_PE_PHB)
-				pr_info("EEH: Detected error on PHB#%x\n",
-					 pe->phb->global_number);
-			else
-				pr_info("EEH: Detected PCI bus error on "
-					"PHB#%x-PE#%x\n",
-					pe->phb->global_number, pe->addr);
-			eeh_handle_normal_event(pe);
-		} else {
+		if (event->pe)
+			eeh_handle_normal_event(event->pe);
+		else
 			eeh_handle_special_event();
-		}

 		kfree(event);
 	}
@ -121,6 +111,24 @@ int __eeh_send_failure_event(struct eeh_pe *pe)
 	}
 	event->pe = pe;

+	/*
+	 * Mark the PE as recovering before inserting it in the queue.
+	 * This prevents the PE from being free()ed by a hotplug driver
+	 * while the PE is sitting in the event queue.
+	 */
+	if (pe) {
+#ifdef CONFIG_STACKTRACE
+		/*
+		 * Save the current stack trace so we can dump it from the
+		 * event handler thread.
+		 */
+		pe->trace_entries = stack_trace_save(pe->stack_trace,
+					 ARRAY_SIZE(pe->stack_trace), 0);
+#endif /* CONFIG_STACKTRACE */
+
+		eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+	}
+
 	/* We may or may not be called in an interrupt context */
 	spin_lock_irqsave(&eeh_eventlist_lock, flags);
 	list_add(&event->list, &eeh_eventlist);
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@ -231,29 +231,22 @@ void *eeh_pe_traverse(struct eeh_pe *root,
 * The function is used to traverse the devices of the specified
 * PE and its child PEs.
 */
-void *eeh_pe_dev_traverse(struct eeh_pe *root,
+void eeh_pe_dev_traverse(struct eeh_pe *root,
 			  eeh_edev_traverse_func fn, void *flag)
 {
 	struct eeh_pe *pe;
 	struct eeh_dev *edev, *tmp;
-	void *ret;

 	if (!root) {
 		pr_warn("%s: Invalid PE %p\n",
 			__func__, root);
-		return NULL;
+		return;
 	}

 	/* Traverse root PE */
-	eeh_for_each_pe(root, pe) {
-		eeh_pe_for_each_dev(pe, edev, tmp) {
-			ret = fn(edev, flag);
-			if (ret)
-				return ret;
-		}
-	}
-
-	return NULL;
+	eeh_for_each_pe(root, pe)
+		eeh_pe_for_each_dev(pe, edev, tmp)
+			fn(edev, flag);
 }

 /**
@ -379,8 +372,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)

 	/* Check if the PE number is valid */
 	if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) {
-		pr_err("%s: Invalid PE#0 for edev 0x%x on PHB#%x\n",
-		       __func__, config_addr, pdn->phb->global_number);
+		eeh_edev_err(edev, "PE#0 is invalid for this PHB!\n");
 		return -EINVAL;
 	}

@ -391,42 +383,34 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 	 * components.
 	 */
 	pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr);
-	if (pe && !(pe->type & EEH_PE_INVALID)) {
-		/* Mark the PE as type of PCI bus */
-		pe->type = EEH_PE_BUS;
-		edev->pe = pe;
+	if (pe) {
+		if (pe->type & EEH_PE_INVALID) {
+			list_add_tail(&edev->entry, &pe->edevs);
+			edev->pe = pe;
+			/*
+			 * We're running to here because of PCI hotplug caused by
+			 * EEH recovery. We need clear EEH_PE_INVALID until the top.
+			 */
+			parent = pe;
+			while (parent) {
+				if (!(parent->type & EEH_PE_INVALID))
+					break;
+				parent->type &= ~EEH_PE_INVALID;
+				parent = parent->parent;
+			}

-		/* Put the edev to PE */
-		list_add_tail(&edev->entry, &pe->edevs);
-		pr_debug("EEH: Add %04x:%02x:%02x.%01x to Bus PE#%x\n",
-			 pdn->phb->global_number,
-			 pdn->busno,
-			 PCI_SLOT(pdn->devfn),
-			 PCI_FUNC(pdn->devfn),
-			 pe->addr);
-		return 0;
-	} else if (pe && (pe->type & EEH_PE_INVALID)) {
-		list_add_tail(&edev->entry, &pe->edevs);
-		edev->pe = pe;
-		/*
-		 * We're running to here because of PCI hotplug caused by
-		 * EEH recovery. We need clear EEH_PE_INVALID until the top.
-		 */
-		parent = pe;
-		while (parent) {
-			if (!(parent->type & EEH_PE_INVALID))
-				break;
-			parent->type &= ~EEH_PE_INVALID;
-			parent = parent->parent;
+			eeh_edev_dbg(edev,
+				     "Added to device PE (parent: PE#%x)\n",
+				     pe->parent->addr);
+		} else {
+			/* Mark the PE as type of PCI bus */
+			pe->type = EEH_PE_BUS;
+			edev->pe = pe;
+
+			/* Put the edev to PE */
+			list_add_tail(&edev->entry, &pe->edevs);
+			eeh_edev_dbg(edev, "Added to bus PE\n");
 		}
-
-		pr_debug("EEH: Add %04x:%02x:%02x.%01x to Device "
-			 "PE#%x, Parent PE#%x\n",
-			 pdn->phb->global_number,
-			 pdn->busno,
-			 PCI_SLOT(pdn->devfn),
-			 PCI_FUNC(pdn->devfn),
-			 pe->addr, pe->parent->addr);
 		return 0;
 	}

@ -468,13 +452,8 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 	list_add_tail(&pe->child, &parent->child_list);
 	list_add_tail(&edev->entry, &pe->edevs);
 	edev->pe = pe;
-	pr_debug("EEH: Add %04x:%02x:%02x.%01x to "
-		 "Device PE#%x, Parent PE#%x\n",
-		 pdn->phb->global_number,
-		 pdn->busno,
-		 PCI_SLOT(pdn->devfn),
-		 PCI_FUNC(pdn->devfn),
-		 pe->addr, pe->parent->addr);
+	eeh_edev_dbg(edev, "Added to device PE (parent: PE#%x)\n",
+		     pe->parent->addr);

 	return 0;
 }
@ -491,16 +470,12 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
 {
 	struct eeh_pe *pe, *parent, *child;
+	bool keep, recover;
 	int cnt;
-	struct pci_dn *pdn = eeh_dev_to_pdn(edev);

 	pe = eeh_dev_to_pe(edev);
 	if (!pe) {
-		pr_debug("%s: No PE found for device %04x:%02x:%02x.%01x\n",
-			 __func__,  pdn->phb->global_number,
-			 pdn->busno,
-			 PCI_SLOT(pdn->devfn),
-			 PCI_FUNC(pdn->devfn));
+		eeh_edev_dbg(edev, "No PE found for device.\n");
 		return -EEXIST;
 	}

@ -516,10 +491,21 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
 	 */
 	while (1) {
 		parent = pe->parent;
+
+		/* PHB PEs should never be removed */
 		if (pe->type & EEH_PE_PHB)
 			break;

-		if (!(pe->state & EEH_PE_KEEP)) {
+		/*
+		 * XXX: KEEP is set while resetting a PE. I don't think it's
+		 * ever set without RECOVERING also being set. I could
+		 * be wrong though so catch that with a WARN.
+		 */
+		keep = !!(pe->state & EEH_PE_KEEP);
+		recover = !!(pe->state & EEH_PE_RECOVERING);
+		WARN_ON(keep && !recover);
+
+		if (!keep && !recover) {
 			if (list_empty(&pe->edevs) &&
 			    list_empty(&pe->child_list)) {
 				list_del(&pe->child);
@ -528,6 +514,15 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
 				break;
 			}
 		} else {
+			/*
+			 * Mark the PE as invalid. At the end of the recovery
+			 * process any invalid PEs will be garbage collected.
+			 *
+			 * We need to delay the free()ing of them since we can
+			 * remove edev's while traversing the PE tree which
+			 * might trigger the removal of a PE and we can't
+			 * deal with that (yet).
+			 */
 			if (list_empty(&pe->edevs)) {
 				cnt = 0;
 				list_for_each_entry(child, &pe->child_list, child) {
@ -623,13 +618,11 @@ void eeh_pe_mark_isolated(struct eeh_pe *root)
 }
 EXPORT_SYMBOL_GPL(eeh_pe_mark_isolated);

-static void *__eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag)
+static void __eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag)
 {
 	int mode = *((int *)flag);

 	edev->mode |= mode;
-
-	return NULL;
 }

 /**
@ -717,17 +710,13 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
 	if (!(edev->mode & (EEH_DEV_ROOT_PORT | EEH_DEV_DS_PORT)))
 		return;

-	pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n",
-		 __func__, pdn->phb->global_number,
-		 pdn->busno,
-		 PCI_SLOT(pdn->devfn),
-		 PCI_FUNC(pdn->devfn));
+	eeh_edev_dbg(edev, "Checking PCIe link...\n");

 	/* Check slot status */
 	cap = edev->pcie_cap;
 	eeh_ops->read_config(pdn, cap + PCI_EXP_SLTSTA, 2, &val);
 	if (!(val & PCI_EXP_SLTSTA_PDS)) {
-		pr_debug("  No card in the slot (0x%04x) !\n", val);
+		eeh_edev_dbg(edev, "No card in the slot (0x%04x) !\n", val);
 		return;
 	}

@ -736,7 +725,7 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
 	if (val & PCI_EXP_SLTCAP_PCP) {
 		eeh_ops->read_config(pdn, cap + PCI_EXP_SLTCTL, 2, &val);
 		if (val & PCI_EXP_SLTCTL_PCC) {
-			pr_debug("  In power-off state, power it on ...\n");
+			eeh_edev_dbg(edev, "In power-off state, power it on ...\n");
 			val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC);
 			val |= (0x0100 & PCI_EXP_SLTCTL_PIC);
 			eeh_ops->write_config(pdn, cap + PCI_EXP_SLTCTL, 2, val);
@ -752,7 +741,7 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
 	/* Check link */
 	eeh_ops->read_config(pdn, cap + PCI_EXP_LNKCAP, 4, &val);
 	if (!(val & PCI_EXP_LNKCAP_DLLLARC)) {
-		pr_debug("  No link reporting capability (0x%08x) \n", val);
+		eeh_edev_dbg(edev, "No link reporting capability (0x%08x) \n", val);
 		msleep(1000);
 		return;
 	}
@ -769,10 +758,10 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
 	}

 	if (val & PCI_EXP_LNKSTA_DLLLA)
-		pr_debug("  Link up (%s)\n",
+		eeh_edev_dbg(edev, "Link up (%s)\n",
 			 (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB");
 	else
-		pr_debug("  Link not ready (0x%04x)\n", val);
+		eeh_edev_dbg(edev, "Link not ready (0x%04x)\n", val);
 }

 #define BYTE_SWAP(OFF)	(8*((OFF)/4)+3-(OFF))
@ -852,7 +841,7 @@ static void eeh_restore_device_bars(struct eeh_dev *edev)
 * the expansion ROM base address, the latency timer, and etc.
 * from the saved values in the device node.
 */
-static void *eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag)
+static void eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag)
 {
 	struct pci_dn *pdn = eeh_dev_to_pdn(edev);

@ -864,8 +853,6 @@ static void *eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag)

 	if (eeh_ops->restore_config && pdn)
 		eeh_ops->restore_config(pdn);
-
-	return NULL;
 }

 /**
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@ -230,7 +230,7 @@ transfer_to_handler_cont:
 	 */
 	lis	r12,reenable_mmu@h
 	ori	r12,r12,reenable_mmu@l
-	LOAD_MSR_KERNEL(r0, MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r0, MSR_KERNEL)
 	mtspr	SPRN_SRR0,r12
 	mtspr	SPRN_SRR1,r0
 	SYNC
@ -304,7 +304,7 @@ stack_ovf:
 	addi	r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
 	lis	r9,StackOverflow@ha
 	addi	r9,r9,StackOverflow@l
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)
 #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
 	mtspr	SPRN_NRI, r0
 #endif
@ -324,7 +324,7 @@ trace_syscall_entry_irq_off:
 	bl	trace_hardirqs_on

 	/* Now enable for real */
-	LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE)
+	LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE)
 	mtmsr	r10

 	REST_GPR(0, r1)
@ -394,7 +394,7 @@ ret_from_syscall:
 #endif
 	mr	r6,r3
 	/* disable interrupts so current_thread_info()->flags can't change */
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL)	/* doesn't include MSR_EE */
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)	/* doesn't include MSR_EE */
 	/* Note: We don't bother telling lockdep about it */
 	SYNC
 	MTMSRD(r10)
@ -777,11 +777,19 @@ fast_exception_return:
 1:	lis	r3,exc_exit_restart_end@ha
 	addi	r3,r3,exc_exit_restart_end@l
 	cmplw	r12,r3
+#if CONFIG_PPC_BOOK3S_601
+	bge	2b
+#else
 	bge	3f
+#endif
 	lis	r4,exc_exit_restart@ha
 	addi	r4,r4,exc_exit_restart@l
 	cmplw	r12,r4
+#if CONFIG_PPC_BOOK3S_601
+	blt	2b
+#else
 	blt	3f
+#endif
 	lis	r3,fee_restarts@ha
 	tophys(r3,r3)
 	lwz	r5,fee_restarts@l(r3)
@ -800,9 +808,6 @@ fee_restarts:
 /* aargh, we don't know which trap this is */
 /* but the 601 doesn't implement the RI bit, so assume it's OK */
 3:
-BEGIN_FTR_SECTION
-	b	2b
-END_FTR_SECTION_IFSET(CPU_FTR_601)
 	li	r10,-1
 	stw	r10,_TRAP(r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
@ -824,7 +829,7 @@ ret_from_except:
 	 * can't change between when we test it and when we return
 	 * from the interrupt. */
 	/* Note: We don't bother telling lockdep about it */
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)
 	SYNC			/* Some chip revs have problems here... */
 	MTMSRD(r10)		/* disable interrupts */

@ -991,7 +996,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
 	 * can restart the exception exit path at the label
 	 * exc_exit_restart below.  -- paulus
 	 */
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL & ~MSR_RI)
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL & ~MSR_RI)
 	SYNC
 	MTMSRD(r10)		/* clear the RI bit */
 	.globl exc_exit_restart
@ -1066,7 +1071,7 @@ exc_exit_restart_end:
 	REST_NVGPRS(r1);						\
 	lwz	r3,_MSR(r1);						\
 	andi.	r3,r3,MSR_PR;						\
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL);				\
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL);				\
 	bne	user_exc_return;					\
 	lwz	r0,GPR0(r1);						\
 	lwz	r2,GPR2(r1);						\
@ -1236,7 +1241,7 @@ recheck:
 	 * neither. Those disable/enable cycles used to peek at
 	 * TI_FLAGS aren't advertised.
 	 */
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)
 	SYNC
 	MTMSRD(r10)		/* disable interrupts */
 	lwz	r9,TI_FLAGS(r2)
@ -1270,11 +1275,19 @@ nonrecoverable:
 	lis	r10,exc_exit_restart_end@ha
 	addi	r10,r10,exc_exit_restart_end@l
 	cmplw	r12,r10
+#ifdef CONFIG_PPC_BOOK3S_601
+	bgelr
+#else
 	bge	3f
+#endif
 	lis	r11,exc_exit_restart@ha
 	addi	r11,r11,exc_exit_restart@l
 	cmplw	r12,r11
+#ifdef CONFIG_PPC_BOOK3S_601
+	bltlr
+#else
 	blt	3f
+#endif
 	lis	r10,ee_restarts@ha
 	lwz	r12,ee_restarts@l(r10)
 	addi	r12,r12,1
@ -1283,9 +1296,6 @@ nonrecoverable:
 	blr
 3:	/* OK, we can't recover, kill this process */
 	/* but the 601 doesn't implement the RI bit, so assume it's OK */
-BEGIN_FTR_SECTION
-	blr
-END_FTR_SECTION_IFSET(CPU_FTR_601)
 	lwz	r3,_TRAP(r1)
 	andi.	r0,r3,1
 	beq	5f
@ -1329,7 +1339,7 @@ _GLOBAL(enter_rtas)
 	lwz	r4,RTASBASE(r4)
 	mfmsr	r9
 	stw	r9,8(r1)
-	LOAD_MSR_KERNEL(r0,MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r0,MSR_KERNEL)
 	SYNC			/* disable interrupts so SRR0/1 */
 	MTMSRD(r0)		/* don't get trashed */
 	li	r9,MSR_KERNEL & ~(MSR_IR|MSR_DR)
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@ -69,24 +69,20 @@ BEGIN_FTR_SECTION
 	bne	.Ltabort_syscall
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
-	andi.	r10,r12,MSR_PR
 	mr	r10,r1
-	addi	r1,r1,-INT_FRAME_SIZE
-	beq-	1f
 	ld	r1,PACAKSAVE(r13)
-1:	std	r10,0(r1)
+	std	r10,0(r1)
 	std	r11,_NIP(r1)
 	std	r12,_MSR(r1)
 	std	r0,GPR0(r1)
 	std	r10,GPR1(r1)
-	beq	2f			/* if from kernel mode */
 #ifdef CONFIG_PPC_FSL_BOOK3E
 START_BTB_FLUSH_SECTION
 	BTB_FLUSH(r10)
 END_BTB_FLUSH_SECTION
 #endif
 	ACCOUNT_CPU_USER_ENTRY(r13, r10, r11)
-2:	std	r2,GPR2(r1)
+	std	r2,GPR2(r1)
 	std	r3,GPR3(r1)
 	mfcr	r2
 	std	r4,GPR4(r1)
@ -122,14 +118,13 @@ END_BTB_FLUSH_SECTION

 #if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR)
 BEGIN_FW_FTR_SECTION
-	beq	33f
-	/* if from user, see if there are any DTL entries to process */
+	/* see if there are any DTL entries to process */
 	ld	r10,PACALPPACAPTR(r13)	/* get ptr to VPA */
 	ld	r11,PACA_DTL_RIDX(r13)	/* get log read index */
 	addi	r10,r10,LPPACA_DTLIDX
 	LDX_BE	r10,0,r10		/* get log write index */
-	cmpd	cr1,r11,r10
-	beq+	cr1,33f
+	cmpd	r11,r10
+	beq+	33f
 	bl	accumulate_stolen_time
 	REST_GPR(0,r1)
 	REST_4GPRS(3,r1)
@ -203,6 +198,7 @@ system_call:			/* label this so stack traces look sane */
 	mtctr   r12
 	bctrl			/* Call handler */

+	/* syscall_exit can exit to kernel mode, via ret_from_kernel_thread */
 .Lsyscall_exit:
 	std	r3,RESULT(r1)

@ -216,11 +212,6 @@ system_call:			/* label this so stack traces look sane */
 	ld	r12, PACA_THREAD_INFO(r13)

 	ld	r8,_MSR(r1)
-#ifdef CONFIG_PPC_BOOK3S
-	/* No MSR:RI on BookE */
-	andi.	r10,r8,MSR_RI
-	beq-	.Lunrecov_restore
-#endif

 /*
 * This is a few instructions into the actual syscall exit path (which actually
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@ -750,12 +750,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	ld	r15,PACATOC(r13)
 	ld	r14,interrupt_base_book3e@got(r15)
 	ld	r15,__end_interrupts@got(r15)
-#else
-	LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
-	LOAD_REG_IMMEDIATE(r15,__end_interrupts)
-#endif
 	cmpld	cr0,r10,r14
 	cmpld	cr1,r10,r15
+#else
+	LOAD_REG_IMMEDIATE_SYM(r14, r15, interrupt_base_book3e)
+	cmpld	cr0, r10, r14
+	LOAD_REG_IMMEDIATE_SYM(r14, r15, __end_interrupts)
+	cmpld	cr1, r10, r14
+#endif
 	blt+	cr0,1f
 	bge+	cr1,1f

@ -820,12 +822,14 @@ kernel_dbg_exc:
 	ld	r15,PACATOC(r13)
 	ld	r14,interrupt_base_book3e@got(r15)
 	ld	r15,__end_interrupts@got(r15)
-#else
-	LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
-	LOAD_REG_IMMEDIATE(r15,__end_interrupts)
-#endif
 	cmpld	cr0,r10,r14
 	cmpld	cr1,r10,r15
+#else
+	LOAD_REG_IMMEDIATE_SYM(r14, r15, interrupt_base_book3e)
+	cmpld	cr0, r10, r14
+	LOAD_REG_IMMEDIATE_SYM(r14, r15,__end_interrupts)
+	cmpld	cr1, r10, r14
+#endif
 	blt+	cr0,1f
 	bge+	cr1,1f

@ -1449,7 +1453,7 @@ a2_tlbinit_code_start:
 a2_tlbinit_after_linear_map:

 	/* Now we branch the new virtual address mapped by this entry */
-	LOAD_REG_IMMEDIATE(r3,1f)
+	LOAD_REG_IMMEDIATE_SYM(r3, r5, 1f)
 	mtctr	r3
 	bctr

--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@ -34,7 +34,16 @@

 #include "head_32.h"

-/* 601 only have IBAT; cr0.eq is set on 601 when using this macro */
+/* 601 only have IBAT */
+#ifdef CONFIG_PPC_BOOK3S_601
+#define LOAD_BAT(n, reg, RA, RB)	\
+	li	RA,0;			\
+	mtspr	SPRN_IBAT##n##U,RA;	\
+	lwz	RA,(n*16)+0(reg);	\
+	lwz	RB,(n*16)+4(reg);	\
+	mtspr	SPRN_IBAT##n##U,RA;	\
+	mtspr	SPRN_IBAT##n##L,RB
+#else
 #define LOAD_BAT(n, reg, RA, RB)	\
 	/* see the comment for clear_bats() -- Cort */ \
 	li	RA,0;			\
@ -44,12 +53,11 @@
 	lwz	RB,(n*16)+4(reg);	\
 	mtspr	SPRN_IBAT##n##U,RA;	\
 	mtspr	SPRN_IBAT##n##L,RB;	\
-	beq	1f;			\
 	lwz	RA,(n*16)+8(reg);	\
 	lwz	RB,(n*16)+12(reg);	\
 	mtspr	SPRN_DBAT##n##U,RA;	\
-	mtspr	SPRN_DBAT##n##L,RB;	\
-1:
+	mtspr	SPRN_DBAT##n##L,RB
+#endif

 	__HEAD
 	.stabs	"arch/powerpc/kernel/",N_SO,0,0,0f
@ -557,9 +565,9 @@ DataStoreTLBMiss:
 	cmplw	0,r1,r3
 	mfspr	r2, SPRN_SPRG_PGDIR
 #ifdef CONFIG_SWAP
-	li	r1, _PAGE_RW | _PAGE_PRESENT | _PAGE_ACCESSED
+	li	r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED
 #else
-	li	r1, _PAGE_RW | _PAGE_PRESENT
+	li	r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT
 #endif
 	bge-	112f
 	lis	r2, (swapper_pg_dir - PAGE_OFFSET)@ha	/* if kernel address, use */
@ -820,9 +828,6 @@ load_up_mmu:

 /* Load the BAT registers with the values set up by MMU_init.
   MMU_init takes care of whether we're on a 601 or not. */
-	mfpvr	r3
-	srwi	r3,r3,16
-	cmpwi	r3,1
 	lis	r3,BATS@ha
 	addi	r3,r3,BATS@l
 	tophys(r3,r3)
@ -897,9 +902,11 @@ start_here:
 	bl	machine_init
 	bl	__save_cpu_setup
 	bl	MMU_init
+#ifdef CONFIG_KASAN
 BEGIN_MMU_FTR_SECTION
 	bl	MMU_init_hw_patch
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
+#endif

 /*
 * Go back to running unmapped so we can load up new values
@ -996,11 +1003,8 @@ EXPORT_SYMBOL(switch_mmu_context)
 */
 clear_bats:
 	li	r10,0
-	mfspr	r9,SPRN_PVR
-	rlwinm	r9,r9,16,16,31		/* r9 = 1 for 601, 4 for 604 */
-	cmpwi	r9, 1
-	beq	1f

+#ifndef CONFIG_PPC_BOOK3S_601
 	mtspr	SPRN_DBAT0U,r10
 	mtspr	SPRN_DBAT0L,r10
 	mtspr	SPRN_DBAT1U,r10
@ -1009,7 +1013,7 @@ clear_bats:
 	mtspr	SPRN_DBAT2L,r10
 	mtspr	SPRN_DBAT3U,r10
 	mtspr	SPRN_DBAT3L,r10
-1:
+#endif
 	mtspr	SPRN_IBAT0U,r10
 	mtspr	SPRN_IBAT0L,r10
 	mtspr	SPRN_IBAT1U,r10
@ -1104,10 +1108,7 @@ mmu_off:
 */
 initial_bats:
 	lis	r11,PAGE_OFFSET@h
-	mfspr	r9,SPRN_PVR
-	rlwinm	r9,r9,16,16,31		/* r9 = 1 for 601, 4 for 604 */
-	cmpwi	0,r9,1
-	bne	4f
+#ifdef CONFIG_PPC_BOOK3S_601
 	ori	r11,r11,4		/* set up BAT registers for 601 */
 	li	r8,0x7f			/* valid, block length = 8MB */
 	mtspr	SPRN_IBAT0U,r11		/* N.B. 601 has valid bit in */
@ -1120,10 +1121,8 @@ initial_bats:
 	addis	r8,r8,0x800000@h
 	mtspr	SPRN_IBAT2U,r11
 	mtspr	SPRN_IBAT2L,r8
-	isync
-	blr
-
-4:	tophys(r8,r11)
+#else
+	tophys(r8,r11)
 #ifdef CONFIG_SMP
 	ori	r8,r8,0x12		/* R/W access, M=1 */
 #else
@ -1135,10 +1134,10 @@ initial_bats:
 	mtspr	SPRN_DBAT0U,r11		/* bit in upper BAT register */
 	mtspr	SPRN_IBAT0L,r8
 	mtspr	SPRN_IBAT0U,r11
+#endif
 	isync
 	blr

-
 #ifdef CONFIG_BOOTX_TEXT
 setup_disp_bat:
 	/*
@ -1153,15 +1152,13 @@ setup_disp_bat:
 	beqlr
 	lwz	r11,0(r8)
 	lwz	r8,4(r8)
-	mfspr	r9,SPRN_PVR
-	rlwinm	r9,r9,16,16,31		/* r9 = 1 for 601, 4 for 604 */
-	cmpwi	0,r9,1
-	beq	1f
+#ifndef CONFIG_PPC_BOOK3S_601
 	mtspr	SPRN_DBAT3L,r8
 	mtspr	SPRN_DBAT3U,r11
-	blr
-1:	mtspr	SPRN_IBAT3L,r8
+#else
+	mtspr	SPRN_IBAT3L,r8
 	mtspr	SPRN_IBAT3U,r11
+#endif
 	blr
 #endif /* CONFIG_BOOTX_TEXT */

--- a/arch/powerpc/kernel/head_32.h
+++ b/arch/powerpc/kernel/head_32.h
@ -4,19 +4,6 @@

 #include <asm/ptrace.h>	/* for STACK_FRAME_REGS_MARKER */

-/*
- * MSR_KERNEL is > 0x8000 on 4xx/Book-E since it include MSR_CE.
- */
-.macro __LOAD_MSR_KERNEL r, x
-.if \x >= 0x8000
-	lis \r, (\x)@h
-	ori \r, \r, (\x)@l
-.else
-	li \r, (\x)
-.endif
-.endm
-#define LOAD_MSR_KERNEL(r, x) __LOAD_MSR_KERNEL r, x
-
 /*
 * Exception entry code.  This code runs with address translation
 * turned off, i.e. using physical addresses.
@ -92,7 +79,7 @@
 #ifdef CONFIG_40x
 	rlwinm	r9,r9,0,14,12		/* clear MSR_WE (necessary?) */
 #else
-	LOAD_MSR_KERNEL(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */
+	LOAD_REG_IMMEDIATE(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */
 	MTMSRD(r10)			/* (except for mach check in rtas) */
 #endif
 	lis	r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */
@ -140,10 +127,10 @@
 	 * otherwise we might risk taking an interrupt before we tell lockdep
 	 * they are enabled.
 	 */
-	LOAD_MSR_KERNEL(r10, MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r10, MSR_KERNEL)
 	rlwimi	r10, r9, 0, MSR_EE
 #else
-	LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE)
+	LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE)
 #endif
 #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
 	mtspr	SPRN_NRI, r0
@ -187,7 +174,7 @@ label:
 #define EXC_XFER_TEMPLATE(hdlr, trap, msr, tfer, ret)		\
 	li	r10,trap;					\
 	stw	r10,_TRAP(r11);					\
-	LOAD_MSR_KERNEL(r10, msr);				\
+	LOAD_REG_IMMEDIATE(r10, msr);				\
 	bl	tfer;						\
 	.long	hdlr;						\
 	.long	ret
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@ -182,7 +182,8 @@ __secondary_hold:
 	isync
 	bctr
 #else
-	BUG_OPCODE
+0:	trap
+	EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0
 #endif
 CLOSE_FIXED_SECTION(first_256B)

@ -635,7 +636,7 @@ __after_prom_start:
 	sub	r5,r5,r11
 #else
 	/* just copy interrupts */
-	LOAD_REG_IMMEDIATE(r5, FIXED_SYMBOL_ABS_ADDR(__end_interrupts))
+	LOAD_REG_IMMEDIATE_SYM(r5, r11, FIXED_SYMBOL_ABS_ADDR(__end_interrupts))
 #endif
 	b	5f
 3:
@ -998,7 +999,8 @@ start_here_common:
 	bl	start_kernel

 	/* Not reached */
-	BUG_OPCODE
+0:	trap	/* label 0 is the address the bug entry below records via 0b */
+	EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0

 /*
 * We put a few things here that have to be page-aligned.
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@ -15,6 +15,7 @@
 */

 #include <linux/init.h>
+#include <linux/magic.h>
 #include <asm/processor.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
@ -574,8 +575,6 @@ InstructionBreakpoint:
 * by decoding the registers used by the dcbx instruction and adding them.
 * DAR is set to the calculated address.
 */
- /* define if you don't want to use self modifying code */
-#define NO_SELF_MODIFYING_CODE
 FixupDAR:/* Entry point for dcbx workaround. */
 	mtspr	SPRN_M_TW, r10
 	/* fetch instruction from memory. */
@ -639,27 +638,6 @@ FixupDAR:/* Entry point for dcbx workaround. */
 	rlwinm	r10, r10,0,7,5	/* Clear store bit for buggy dcbst insn */
 	mtspr	SPRN_DSISR, r10
 142:	/* continue, it was a dcbx, dcbi instruction. */
-#ifndef NO_SELF_MODIFYING_CODE
-	andis.	r10,r11,0x1f	/* test if reg RA is r0 */
-	li	r10,modified_instr@l
-	dcbtst	r0,r10		/* touch for store */
-	rlwinm	r11,r11,0,0,20	/* Zero lower 10 bits */
-	oris	r11,r11,640	/* Transform instr. to a "add r10,RA,RB" */
-	ori	r11,r11,532
-	stw	r11,0(r10)	/* store add/and instruction */
-	dcbf	0,r10		/* flush new instr. to memory. */
-	icbi	0,r10		/* invalidate instr. cache line */
-	mfspr	r11, SPRN_SPRG_SCRATCH1	/* restore r11 */
-	mfspr	r10, SPRN_SPRG_SCRATCH0	/* restore r10 */
-	isync			/* Wait until new instr is loaded from memory */
-modified_instr:
-	.space	4		/* this is where the add instr. is stored */
-	bne+	143f
-	subf	r10,r0,r10	/* r10=r10-r0, only if reg RA is r0 */
-143:	mtdar	r10		/* store faulting EA in DAR */
-	mfspr	r10,SPRN_M_TW
-	b	DARFixed	/* Go back to normal TLB handling */
-#else
 	mfctr	r10
 	mtdar	r10			/* save ctr reg in DAR */
 	rlwinm	r10, r11, 24, 24, 28	/* offset into jump table for reg RB */
@ -723,7 +701,6 @@ modified_instr:
 	add	r10, r10, r11	/* add it */
 	mfctr	r11		/* restore r11 */
 	b	151b
-#endif

 /*
 * This is where the main kernel code starts.
@ -741,6 +718,9 @@ start_here:
 	/* stack */
 	lis	r1,init_thread_union@ha
 	addi	r1,r1,init_thread_union@l
+	lis	r0, STACK_END_MAGIC@h
+	ori	r0, r0, STACK_END_MAGIC@l
+	stw	r0, 0(r1)
 	li	r0,0
 	stwu	r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1)

--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@ -195,18 +195,63 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs)
 	tsk->thread.last_hit_ubp = NULL;
 }

+/*
+ * Tell whether @instr is a LARX or STCX instruction.
+ *
+ * Returns true only when analyse_instr() returns 0 for @instr and the
+ * type it decoded is LARX or STCX; returns false otherwise.
+ */
+static bool is_larx_stcx_instr(struct pt_regs *regs, unsigned int instr)
+{
+	struct instruction_op decoded;
+	int op_type;
+
+	if (analyse_instr(&decoded, regs, instr))
+		return false;
+
+	op_type = GETTYPE(decoded.type);
+	return op_type == LARX || op_type == STCX;
+}
+
 /*
 * Handle debug exception notifications.
 */
+/*
+ * Get past the instruction at regs->nip that triggered breakpoint @bp.
+ *
+ * Returns true only when emulate_step() reports success. Returns false
+ * when:
+ *  - the instruction cannot be read, or emulate_step() fails: a WARN is
+ *    raised and @bp is disabled;
+ *  - the instruction is a larx/stcx: a rate-limited message is printed
+ *    and @bp is disabled;
+ *  - @regs is in user mode: @bp is stored in thread.last_hit_ubp and
+ *    MSR_SE is set so the instruction is single-stepped instead.
+ *
+ * @addr is only used in the diagnostic messages.
+ */
+static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp,
+			     unsigned long addr)
+{
+	unsigned int insn = 0;
+
+	if (__get_user_inatomic(insn, (unsigned int *)regs->nip))
+		goto warn_and_disable;
+
+	/* larx/stcx is checked before the user-mode test, so it applies to both */
+	if (is_larx_stcx_instr(regs, insn)) {
+		printk_ratelimited("Breakpoint hit on instruction that can't be emulated."
+				   " Breakpoint at 0x%lx will be disabled.\n", addr);
+		goto disable_bp;
+	}
+
+	/* User-space instructions are single-stepped rather than emulated */
+	if (user_mode(regs)) {
+		current->thread.last_hit_ubp = bp;
+		regs->msr |= MSR_SE;
+		return false;
+	}
+
+	if (emulate_step(regs, insn))
+		return true;
+
+warn_and_disable:
+	/*
+	 * The hw-breakpoint could not be handled reliably. Warn the user
+	 * and fall through to disabling it.
+	 */
+	WARN(1, "Unable to handle hardware breakpoint. Breakpoint at "
+		"0x%lx will be disabled.", addr);
+
+disable_bp:
+	perf_event_disable_inatomic(bp);
+	return false;
+}
+
 int hw_breakpoint_handler(struct die_args *args)
 {
 	int rc = NOTIFY_STOP;
 	struct perf_event *bp;
 	struct pt_regs *regs = args->regs;
-#ifndef CONFIG_PPC_8xx
-	int stepped = 1;
-	unsigned int instr;
-#endif
 	struct arch_hw_breakpoint *info;
 	unsigned long dar = regs->dar;

@ -251,31 +296,9 @@ int hw_breakpoint_handler(struct die_args *args)
 	      (dar - bp->attr.bp_addr < bp->attr.bp_len)))
 		info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;

-#ifndef CONFIG_PPC_8xx
-	/* Do not emulate user-space instructions, instead single-step them */
-	if (user_mode(regs)) {
-		current->thread.last_hit_ubp = bp;
-		regs->msr |= MSR_SE;
+	if (!IS_ENABLED(CONFIG_PPC_8xx) && !stepping_handler(regs, bp, info->address))
 		goto out;
-	}

-	stepped = 0;
-	instr = 0;
-	if (!__get_user_inatomic(instr, (unsigned int *) regs->nip))
-		stepped = emulate_step(regs, instr);
-
-	/*
-	 * emulate_step() could not execute it. We've failed in reliably
-	 * handling the hw-breakpoint. Unregister it and throw a warning
-	 * message to let the user know about it.
-	 */
-	if (!stepped) {
-		WARN(1, "Unable to handle hardware breakpoint. Breakpoint at "
-			"0x%lx will be disabled.", info->address);
-		perf_event_disable_inatomic(bp);
-		goto out;
-	}
-#endif
 	/*
 	 * As a policy, the callback is invoked in a 'trigger-after-execute'
 	 * fashion
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@ -149,8 +149,8 @@ static const struct ppc_pci_io iowa_pci_io = {
 };

 #ifdef CONFIG_PPC_INDIRECT_MMIO
-static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size,
-				  pgprot_t prot, void *caller)
+void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size,
+			   pgprot_t prot, void *caller)
 {
 	struct iowa_bus *bus;
 	void __iomem *res = __ioremap_caller(addr, size, prot, caller);
@ -163,20 +163,17 @@ static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size,
 	}
 	return res;
 }
-#else /* CONFIG_PPC_INDIRECT_MMIO */
-#define iowa_ioremap NULL
 #endif /* !CONFIG_PPC_INDIRECT_MMIO */

+bool io_workaround_inited;
+
 /* Enable IO workaround */
 static void io_workaround_init(void)
 {
-	static int io_workaround_inited;
-
 	if (io_workaround_inited)
 		return;
 	ppc_pci_io = iowa_pci_io;
-	ppc_md.ioremap = iowa_ioremap;
-	io_workaround_inited = 1;
+	io_workaround_inited = true;
 }

 /* Register new bus to support workaround */
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@ -633,11 +633,54 @@ static void iommu_table_clear(struct iommu_table *tbl)
 #endif
 }

+/*
+ * Mark the entries of @tbl that the allocator must never hand out.
+ *
+ * Entry 0 is reserved whenever the table starts at offset 0. In
+ * addition the range [@res_start, @res_end) is reserved, after being
+ * clamped to the window [it_offset, it_offset + it_size) covered by the
+ * table. The clamped range is remembered in it_reserved_start and
+ * it_reserved_end so that iommu_table_release_pages() can undo it. When
+ * nothing of the requested range lies inside the table an empty range
+ * is stored, so neither this function nor the release path ever
+ * touches bits outside it_map.
+ */
+static void iommu_table_reserve_pages(struct iommu_table *tbl,
+		unsigned long res_start, unsigned long res_end)
+{
+	unsigned long i;
+
+	WARN_ON_ONCE(res_end < res_start);
+	/*
+	 * Reserve page 0 so it will not be used for any mappings.
+	 * This avoids buggy drivers that consider page 0 to be invalid
+	 * to crash the machine or even lose data.
+	 */
+	if (tbl->it_offset == 0)
+		set_bit(0, tbl->it_map);
+
+	/* Clamp the requested range to the window this table covers */
+	if (res_start < tbl->it_offset)
+		res_start = tbl->it_offset;
+
+	if (res_end > (tbl->it_offset + tbl->it_size))
+		res_end = tbl->it_offset + tbl->it_size;
+
+	/* Nothing of the range falls inside the table: record an empty range */
+	if (res_start >= res_end) {
+		tbl->it_reserved_start = tbl->it_offset;
+		tbl->it_reserved_end = tbl->it_offset;
+		return;
+	}
+
+	tbl->it_reserved_start = res_start;
+	tbl->it_reserved_end = res_end;
+
+	/* it_map is indexed relative to it_offset */
+	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
+		set_bit(i - tbl->it_offset, tbl->it_map);
+}
+
+/*
+ * Undo iommu_table_reserve_pages(): clear the entry-0 reservation (when
+ * the table starts at offset 0) and every bit in the range recorded in
+ * it_reserved_start..it_reserved_end.
+ */
+static void iommu_table_release_pages(struct iommu_table *tbl)
+{
+	int entry = tbl->it_reserved_start;
+
+	/*
+	 * Entry 0 may have been reserved by us; drop it so that callers
+	 * checking the bitmap for emptiness do not trip over it.
+	 */
+	if (tbl->it_offset == 0)
+		clear_bit(0, tbl->it_map);
+
+	/* it_map is indexed relative to it_offset */
+	while (entry < tbl->it_reserved_end) {
+		clear_bit(entry - tbl->it_offset, tbl->it_map);
+		++entry;
+	}
+}
+
 /*
 * Build a iommu_table structure.  This contains a bit map which
 * is used to manage allocation of the tce space.
 */
-struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
+struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
+		unsigned long res_start, unsigned long res_end)
 {
 	unsigned long sz;
 	static int welcomed = 0;
@ -656,13 +699,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	tbl->it_map = page_address(page);
 	memset(tbl->it_map, 0, sz);

-	/*
-	 * Reserve page 0 so it will not be used for any mappings.
-	 * This avoids buggy drivers that consider page 0 to be invalid
-	 * to crash the machine or even lose data.
-	 */
-	if (tbl->it_offset == 0)
-		set_bit(0, tbl->it_map);
+	iommu_table_reserve_pages(tbl, res_start, res_end);

 	/* We only split the IOMMU table if we have 1GB or more of space */
 	if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
@ -714,12 +751,7 @@ static void iommu_table_free(struct kref *kref)
 		return;
 	}

-	/*
-	 * In case we have reserved the first bit, we should not emit
-	 * the warning below.
-	 */
-	if (tbl->it_offset == 0)
-		clear_bit(0, tbl->it_map);
+	iommu_table_release_pages(tbl);

 	/* verify that table contains no entries */
 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
@ -981,29 +1013,32 @@ int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa)
 }
 EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);

-long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
+/*
+ * Exchange the TCE at @entry through the table's xchg_no_kill()
+ * callback.
+ *
+ * When the callback succeeds and the direction left in *@direction is
+ * DMA_FROM_DEVICE or DMA_BIDIRECTIONAL, the page behind *@hpa is marked
+ * dirty unless mm_iommu_is_devmem() reports it as device memory.
+ *
+ * Returns the value returned by the callback.
+ */
+long iommu_tce_xchg_no_kill(struct mm_struct *mm,
+		struct iommu_table *tbl,
 		unsigned long entry, unsigned long *hpa,
 		enum dma_data_direction *direction)
 {
 	long ret;
 	unsigned long size = 0;

-	ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
-
+	ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, false);
 	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
 			(*direction == DMA_BIDIRECTIONAL)) &&
 			!mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift,
 					&size))
 		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));

-	/* if (unlikely(ret))
-		pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
-			__func__, hwaddr, entry << tbl->it_page_shift,
-				hwaddr, ret); */
-
 	return ret;
 }
-EXPORT_SYMBOL_GPL(iommu_tce_xchg);
+EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill);
+
+/*
+ * Invoke the table's tce_kill() callback for @pages entries starting at
+ * @entry. Does nothing when the table provides no such callback.
+ */
+void iommu_tce_kill(struct iommu_table *tbl,
+		unsigned long entry, unsigned long pages)
+{
+	if (!tbl->it_ops->tce_kill)
+		return;
+
+	tbl->it_ops->tce_kill(tbl, entry, pages, false);
+}
+EXPORT_SYMBOL_GPL(iommu_tce_kill);

 int iommu_take_ownership(struct iommu_table *tbl)
 {
@ -1017,22 +1052,21 @@ int iommu_take_ownership(struct iommu_table *tbl)
 	 * requires exchange() callback defined so if it is not
 	 * implemented, we disallow taking ownership over the table.
 	 */
-	if (!tbl->it_ops->exchange)
+	if (!tbl->it_ops->xchg_no_kill)
 		return -EINVAL;

 	spin_lock_irqsave(&tbl->large_pool.lock, flags);
 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_lock(&tbl->pools[i].lock);

-	if (tbl->it_offset == 0)
-		clear_bit(0, tbl->it_map);
+	iommu_table_release_pages(tbl);

 	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
 		pr_err("iommu_tce: it_map is not empty");
 		ret = -EBUSY;
-		/* Restore bit#0 set by iommu_init_table() */
-		if (tbl->it_offset == 0)
-			set_bit(0, tbl->it_map);
+		/* Undo iommu_table_release_pages, i.e. restore bit#0, etc */
+		iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
+				tbl->it_reserved_end);
 	} else {
 		memset(tbl->it_map, 0xff, sz);
 	}
@ -1055,9 +1089,8 @@ void iommu_release_ownership(struct iommu_table *tbl)

 	memset(tbl->it_map, 0, sz);

-	/* Restore bit#0 set by iommu_init_table() */
-	if (tbl->it_offset == 0)
-		set_bit(0, tbl->it_map);
+	iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
+			tbl->it_reserved_end);

 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_unlock(&tbl->pools[i].lock);
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@ -64,16 +64,17 @@
 #define KVM_INST_MTSRIN		0x7c0001e4

 static bool kvm_patching_worked = true;
-char kvm_tmp[1024 * 1024];
+extern char kvm_tmp[];
+extern char kvm_tmp_end[];
 static int kvm_tmp_index;

-static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
+static void __init kvm_patch_ins(u32 *inst, u32 new_inst)
 {
 	*inst = new_inst;
 	flush_icache_range((ulong)inst, (ulong)inst + 4);
 }

-static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt)
+static void __init kvm_patch_ins_ll(u32 *inst, long addr, u32 rt)
 {
 #ifdef CONFIG_64BIT
 	kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc));
@ -82,7 +83,7 @@ static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt)
 #endif
 }

-static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt)
+static void __init kvm_patch_ins_ld(u32 *inst, long addr, u32 rt)
 {
 #ifdef CONFIG_64BIT
 	kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc));
@ -91,12 +92,12 @@ static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt)
 #endif
 }

-static void kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt)
+static void __init kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt)
 {
 	kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000ffff));
 }

-static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt)
+static void __init kvm_patch_ins_std(u32 *inst, long addr, u32 rt)
 {
 #ifdef CONFIG_64BIT
 	kvm_patch_ins(inst, KVM_INST_STD | rt | (addr & 0x0000fffc));
@ -105,17 +106,17 @@ static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt)
 #endif
 }

-static void kvm_patch_ins_stw(u32 *inst, long addr, u32 rt)
+static void __init kvm_patch_ins_stw(u32 *inst, long addr, u32 rt)
 {
 	kvm_patch_ins(inst, KVM_INST_STW | rt | (addr & 0x0000fffc));
 }

-static void kvm_patch_ins_nop(u32 *inst)
+static void __init kvm_patch_ins_nop(u32 *inst)
 {
 	kvm_patch_ins(inst, KVM_INST_NOP);
 }

-static void kvm_patch_ins_b(u32 *inst, int addr)
+static void __init kvm_patch_ins_b(u32 *inst, int addr)
 {
 #if defined(CONFIG_RELOCATABLE) && defined(CONFIG_PPC_BOOK3S)
 	/* On relocatable kernels interrupts handlers and our code
@ -128,11 +129,11 @@ static void kvm_patch_ins_b(u32 *inst, int addr)
 	kvm_patch_ins(inst, KVM_INST_B | (addr & KVM_INST_B_MASK));
 }

-static u32 *kvm_alloc(int len)
+static u32 * __init kvm_alloc(int len)
 {
 	u32 *p;

-	if ((kvm_tmp_index + len) > ARRAY_SIZE(kvm_tmp)) {
+	if ((kvm_tmp_index + len) > (kvm_tmp_end - kvm_tmp)) {
 		printk(KERN_ERR "KVM: No more space (%d + %d)\n",
 				kvm_tmp_index, len);
 		kvm_patching_worked = false;
@ -151,7 +152,7 @@ extern u32 kvm_emulate_mtmsrd_orig_ins_offs;
 extern u32 kvm_emulate_mtmsrd_len;
 extern u32 kvm_emulate_mtmsrd[];

-static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt)
+static void __init kvm_patch_ins_mtmsrd(u32 *inst, u32 rt)
 {
 	u32 *p;
 	int distance_start;
@ -204,7 +205,7 @@ extern u32 kvm_emulate_mtmsr_orig_ins_offs;
 extern u32 kvm_emulate_mtmsr_len;
 extern u32 kvm_emulate_mtmsr[];

-static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
+static void __init kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
 {
 	u32 *p;
 	int distance_start;
@ -265,7 +266,7 @@ extern u32 kvm_emulate_wrtee_orig_ins_offs;
 extern u32 kvm_emulate_wrtee_len;
 extern u32 kvm_emulate_wrtee[];

-static void kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one)
+static void __init kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one)
 {
 	u32 *p;
 	int distance_start;
@ -322,7 +323,7 @@ extern u32 kvm_emulate_wrteei_0_branch_offs;
 extern u32 kvm_emulate_wrteei_0_len;
 extern u32 kvm_emulate_wrteei_0[];

-static void kvm_patch_ins_wrteei_0(u32 *inst)
+static void __init kvm_patch_ins_wrteei_0(u32 *inst)
 {
 	u32 *p;
 	int distance_start;
@ -363,7 +364,7 @@ extern u32 kvm_emulate_mtsrin_orig_ins_offs;
 extern u32 kvm_emulate_mtsrin_len;
 extern u32 kvm_emulate_mtsrin[];

-static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb)
+static void __init kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb)
 {
 	u32 *p;
 	int distance_start;
@ -399,7 +400,7 @@ static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb)

 #endif

-static void kvm_map_magic_page(void *data)
+static void __init kvm_map_magic_page(void *data)
 {
 	u32 *features = data;

@ -414,7 +415,7 @@ static void kvm_map_magic_page(void *data)
 	*features = out[0];
 }

-static void kvm_check_ins(u32 *inst, u32 features)
+static void __init kvm_check_ins(u32 *inst, u32 features)
 {
 	u32 _inst = *inst;
 	u32 inst_no_rt = _inst & ~KVM_MASK_RT;
@ -658,7 +659,7 @@ static void kvm_check_ins(u32 *inst, u32 features)
 extern u32 kvm_template_start[];
 extern u32 kvm_template_end[];

-static void kvm_use_magic_page(void)
+static void __init kvm_use_magic_page(void)
 {
 	u32 *p;
 	u32 *start, *end;
@ -699,25 +700,13 @@ static void kvm_use_magic_page(void)
 			 kvm_patching_worked ? "worked" : "failed");
 }

-static __init void kvm_free_tmp(void)
-{
-	/*
-	 * Inform kmemleak about the hole in the .bss section since the
-	 * corresponding pages will be unmapped with DEBUG_PAGEALLOC=y.
-	 */
-	kmemleak_free_part(&kvm_tmp[kvm_tmp_index],
-			   ARRAY_SIZE(kvm_tmp) - kvm_tmp_index);
-	free_reserved_area(&kvm_tmp[kvm_tmp_index],
-			   &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL);
-}
-
 static int __init kvm_guest_init(void)
 {
 	if (!kvm_para_available())
-		goto free_tmp;
+		return 0;

 	if (!epapr_paravirt_enabled)
-		goto free_tmp;
+		return 0;

 	if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
 		kvm_use_magic_page();
@ -727,9 +716,6 @@ static int __init kvm_guest_init(void)
 	powersave_nap = 1;
 #endif

-free_tmp:
-	kvm_free_tmp();
-
 	return 0;
 }

--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@ -192,6 +192,8 @@ kvm_emulate_mtmsr_orig_ins_offs:
 kvm_emulate_mtmsr_len:
 	.long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4

+#ifdef CONFIG_BOOKE
+
 /* also used for wrteei 1 */
 .global kvm_emulate_wrtee
 kvm_emulate_wrtee:
@ -285,6 +287,10 @@ kvm_emulate_wrteei_0_branch_offs:
 kvm_emulate_wrteei_0_len:
 	.long (kvm_emulate_wrteei_0_end - kvm_emulate_wrteei_0) / 4

+#endif /* CONFIG_BOOKE */
+
+#ifdef CONFIG_PPC_BOOK3S_32
+
 .global kvm_emulate_mtsrin
 kvm_emulate_mtsrin:

@ -334,5 +340,15 @@ kvm_emulate_mtsrin_orig_ins_offs:
 kvm_emulate_mtsrin_len:
 	.long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4

+#endif /* CONFIG_PPC_BOOK3S_32 */
+
+	.balign 4
+	.global kvm_tmp
+kvm_tmp:
+	.space	(64 * 1024)
+
+.global kvm_tmp_end
+kvm_tmp_end:
+
 .global kvm_template_end
 kvm_template_end:
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@ -29,6 +29,8 @@
 #include <asm/smp.h>
 #include <asm/hw_breakpoint.h>
 #include <asm/asm-prototypes.h>
+#include <asm/svm.h>
+#include <asm/ultravisor.h>

 int default_machine_kexec_prepare(struct kimage *image)
 {
@ -327,6 +329,13 @@ void default_machine_kexec(struct kimage *image)
 #ifdef CONFIG_PPC_PSERIES
 	kexec_paca.lppaca_ptr = NULL;
 #endif
+
+	if (is_secure_guest() && !(image->preserve_context ||
+				   image->type == KEXEC_TYPE_CRASH)) {
+		uv_unshare_all_pages();
+		printk("kexec: Unshared all shared pages.\n");
+	}
+
 	paca_ptrs[kexec_paca.paca_index] = &kexec_paca;

 	setup_paca(&kexec_paca);
--- a/Show More
+++ b/Show More