qemu-e2k/hw/ppc/spapr_rtas.c
Laurent Vivier aef92d87c5 pseries: fix kvmppc_set_fwnmi()
QEMU issues the ioctl(KVM_CAP_PPC_FWNMI) on the first vCPU.

If the first vCPU is currently running, the vCPU mutex is held
and the ioctl() cannot be done and waits until the mutex is released.
This never happens and the VM is stuck.

To avoid this deadlock, issue the ioctl on the same vCPU doing the
RTAS call.

The problem can be reproduced by booting a guest with several vCPUs
(the probability of hitting the problem is (n - 1) / n, where n is the
number of CPUs), and then triggering a kernel crash with
"echo c >/proc/sysrq-trigger".

On the reboot, the kernel hangs after:

...
[    0.000000] -----------------------------------------------------
[    0.000000] ppc64_pft_size    = 0x0
[    0.000000] phys_mem_size     = 0x48000000
[    0.000000] dcache_bsize      = 0x80
[    0.000000] icache_bsize      = 0x80
[    0.000000] cpu_features      = 0x0001c06f8f4f91a7
[    0.000000]   possible        = 0x0003fbffcf5fb1a7
[    0.000000]   always          = 0x00000003800081a1
[    0.000000] cpu_user_features = 0xdc0065c2 0xaee00000
[    0.000000] mmu_features      = 0x3c006041
[    0.000000] firmware_features = 0x00000085455a445f
[    0.000000] physical_start    = 0x8000000
[    0.000000] -----------------------------------------------------
[    0.000000] numa:   NODE_DATA [mem 0x47f33c80-0x47f3ffff]

Fixes: ec010c0066 ("ppc/spapr: KVM FWNMI should not be enabled until guest requests it")
Cc: npiggin@gmail.com
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20200724083533.281700-1-lvivier@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2020-07-27 11:09:25 +10:00

634 lines
20 KiB
C

/*
* QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
*
* Hypercall based emulated RTAS
*
* Copyright (c) 2010-2011 David Gibson, IBM Corporation.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/
#include "qemu/osdep.h"
#include "cpu.h"
#include "qemu/log.h"
#include "qemu/error-report.h"
#include "sysemu/sysemu.h"
#include "sysemu/device_tree.h"
#include "sysemu/cpus.h"
#include "sysemu/hw_accel.h"
#include "sysemu/runstate.h"
#include "kvm_ppc.h"
#include "hw/ppc/spapr.h"
#include "hw/ppc/spapr_vio.h"
#include "hw/ppc/spapr_rtas.h"
#include "hw/ppc/spapr_cpu_core.h"
#include "hw/ppc/ppc.h"
#include "hw/boards.h"
#include <libfdt.h>
#include "hw/ppc/spapr_drc.h"
#include "qemu/cutils.h"
#include "trace.h"
#include "hw/ppc/fdt.h"
#include "target/ppc/mmu-hash64.h"
#include "target/ppc/mmu-book3s-v3.h"
#include "migration/blocker.h"
/*
 * RTAS "display-character" handler.
 *
 * Loads argument 0, truncates it to a single byte and writes that byte to
 * the device returned by vty_lookup(spapr, 0).
 *
 * Return slot 0 receives RTAS_OUT_HW_ERROR when the lookup yields no
 * device, RTAS_OUT_SUCCESS otherwise.  nargs/nret are not validated here.
 */
static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                   uint32_t token, uint32_t nargs,
                                   target_ulong args,
                                   uint32_t nret, target_ulong rets)
{
    uint8_t ch = rtas_ld(args, 0);
    SpaprVioDevice *vty = vty_lookup(spapr, 0);

    if (vty == NULL) {
        rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
        return;
    }

    vty_putchars(vty, &ch, sizeof(ch));
    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
}
/*
 * RTAS "power-off" handler.
 *
 * Expects exactly 2 arguments and 1 return slot; anything else is answered
 * with RTAS_OUT_PARAM_ERROR.  On a well-formed call a guest-initiated
 * shutdown is requested, the current CPU is stopped and RTAS_OUT_SUCCESS is
 * stored in return slot 0.
 */
static void rtas_power_off(PowerPCCPU *cpu, SpaprMachineState *spapr,
                           uint32_t token, uint32_t nargs, target_ulong args,
                           uint32_t nret, target_ulong rets)
{
    if (nargs == 2 && nret == 1) {
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
        cpu_stop_current();
        rtas_st(rets, 0, RTAS_OUT_SUCCESS);
    } else {
        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
    }
}
/*
 * RTAS "system-reboot" handler.
 *
 * Expects no arguments and 1 return slot; anything else is answered with
 * RTAS_OUT_PARAM_ERROR.  On a well-formed call a guest-initiated reset is
 * requested and RTAS_OUT_SUCCESS is stored in return slot 0.
 */
static void rtas_system_reboot(PowerPCCPU *cpu, SpaprMachineState *spapr,
                               uint32_t token, uint32_t nargs,
                               target_ulong args,
                               uint32_t nret, target_ulong rets)
{
    if (nargs == 0 && nret == 1) {
        qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
        rtas_st(rets, 0, RTAS_OUT_SUCCESS);
    } else {
        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
    }
}
/*
 * RTAS "query-cpu-stopped-state" handler.
 *
 * Argument 0 is the id handed to spapr_find_cpu().  Requires 1 argument
 * and 2 return slots.
 *
 * Return slot 0: RTAS_OUT_PARAM_ERROR for a malformed call or an unknown
 *                CPU id, RTAS_OUT_SUCCESS otherwise.
 * Return slot 1: 0 when the CPU is halted, 2 when it is not (only written
 *                on success).
 */
static void rtas_query_cpu_stopped_state(PowerPCCPU *cpu_,
                                         SpaprMachineState *spapr,
                                         uint32_t token, uint32_t nargs,
                                         target_ulong args,
                                         uint32_t nret, target_ulong rets)
{
    target_ulong id;
    PowerPCCPU *target;

    if (nargs != 1 || nret != 2) {
        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
        return;
    }

    id = rtas_ld(args, 0);
    target = spapr_find_cpu(id);
    if (target == NULL) {
        /* No CPU matches the requested id */
        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
        return;
    }

    rtas_st(rets, 1, CPU(target)->halted ? 0 : 2);
    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
}
/*
 * RTAS "start-cpu" handler.
 *
 * Arguments (loaded with rtas_ld): 0 = id of the CPU to start, 1 = start
 * value and 2 = r3 value, both forwarded to spapr_cpu_set_entry_state().
 * Requires 3 arguments and 1 return slot.
 *
 * Return slot 0:
 *   RTAS_OUT_PARAM_ERROR - malformed call, or spapr_find_cpu() found no CPU
 *   RTAS_OUT_HW_ERROR    - the target CPU is not currently halted
 *   RTAS_OUT_SUCCESS     - the target CPU was set up and kicked
 *
 * callcpu is the CPU issuing the call; its endianness hook result, radix
 * mode and timebase offset are used to configure the new CPU.
 */
static void rtas_start_cpu(PowerPCCPU *callcpu, SpaprMachineState *spapr,
uint32_t token, uint32_t nargs,
target_ulong args,
uint32_t nret, target_ulong rets)
{
target_ulong id, start, r3;
PowerPCCPU *newcpu;
CPUPPCState *env;
PowerPCCPUClass *pcc;
target_ulong lpcr;
if (nargs != 3 || nret != 1) {
rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
return;
}
id = rtas_ld(args, 0);
start = rtas_ld(args, 1);
r3 = rtas_ld(args, 2);
newcpu = spapr_find_cpu(id);
if (!newcpu) {
/* Didn't find a matching cpu */
rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
return;
}
env = &newcpu->env;
pcc = POWERPC_CPU_GET_CLASS(newcpu);
/* Only a CPU that is currently halted may be started */
if (!CPU(newcpu)->halted) {
rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
return;
}
/* Synchronize the target's register state before modifying it below */
cpu_synchronize_state(CPU(newcpu));
/* Initial MSR for the new CPU: only the SF and ME bits are set */
env->msr = (1ULL << MSR_SF) | (1ULL << MSR_ME);
/*
 * Build the new CPU's LPCR starting from its current value; LPCR_ILE is
 * set when the interrupts_big_endian() hook returns false for the
 * calling CPU.
 * NOTE(review): the previous comment here said "Enable Power-saving mode
 * Exit Cause exceptions for the new CPU", but nothing in this function
 * sets such bits - confirm whether that is handled elsewhere.
 */
lpcr = env->spr[SPR_LPCR];
if (!pcc->interrupts_big_endian(callcpu)) {
lpcr |= LPCR_ILE;
}
if (env->mmu_model == POWERPC_MMU_3_00) {
/*
* New cpus are expected to start in the same radix/hash mode
* as the existing CPUs
*/
if (ppc64_v3_radix(callcpu)) {
lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
} else {
lpcr &= ~(LPCR_UPRT | LPCR_GTSE | LPCR_HR);
}
/* Clear PSSCR_EC on the new CPU (rtas_stop_self sets it) */
env->spr[SPR_PSSCR] &= ~PSSCR_EC;
}
ppc_store_lpcr(newcpu, lpcr);
/*
* Set the timebase offset of the new CPU to that of the invoking
* CPU. This helps hotplugged CPU to have the correct timebase
* offset.
*/
newcpu->env.tb_env->tb_offset = callcpu->env.tb_env->tb_offset;
/*
 * Hand the guest-supplied start and r3 values to the entry-state helper.
 * NOTE(review): the meaning of the two literal 0 arguments is defined by
 * spapr_cpu_set_entry_state(), which is not visible here.
 */
spapr_cpu_set_entry_state(newcpu, start, 0, r3, 0);
qemu_cpu_kick(CPU(newcpu));
rtas_st(rets, 0, RTAS_OUT_SUCCESS);
}
/*
 * RTAS "stop-self" handler: halt the calling CPU.
 *
 * No arguments are read and no return slot is written.
 */
static void rtas_stop_self(PowerPCCPU *cpu, SpaprMachineState *spapr,
                           uint32_t token, uint32_t nargs,
                           target_ulong args,
                           uint32_t nret, target_ulong rets)
{
    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
    CPUState *cs = CPU(cpu);
    target_ulong lpcr;

    /*
     * Mask the Power-saving mode Exit Cause exception bits out of LPCR:
     * leaving them enabled could deliver an interrupt to a dying CPU and
     * crash the guest.  PSSCR_EC is set for the same reason.
     */
    lpcr = cpu->env.spr[SPR_LPCR] & ~pcc->lpcr_pm;
    ppc_store_lpcr(cpu, lpcr);
    cpu->env.spr[SPR_PSSCR] |= PSSCR_EC;

    cs->halted = 1;
    kvmppc_set_reg_ppc_online(cpu, 0);
    qemu_cpu_kick(cs);
}
/*
 * RTAS "ibm,suspend-me" handler.
 *
 * Requires no arguments and 1 return slot (else RTAS_OUT_PARAM_ERROR).
 * Every CPU other than the caller must be halted with MSR[EE] clear;
 * if any is not, H_MULTI_THREADS_ACTIVE is stored in return slot 0 and no
 * suspend is requested.  Otherwise a system suspend is requested and
 * RTAS_OUT_SUCCESS is returned.
 */
static void rtas_ibm_suspend_me(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                uint32_t token, uint32_t nargs,
                                target_ulong args,
                                uint32_t nret, target_ulong rets)
{
    CPUState *cs;

    if (nargs != 0 || nret != 1) {
        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
        return;
    }

    CPU_FOREACH(cs) {
        PowerPCCPU *other = POWERPC_CPU(cs);

        if (other != cpu) {
            /* See h_join */
            if (!(cs->halted && !(other->env.msr & (1ULL << MSR_EE)))) {
                rtas_st(rets, 0, H_MULTI_THREADS_ACTIVE);
                return;
            }
        }
    }

    qemu_system_suspend_request();
    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
}
/*
 * Store a system parameter value into a guest buffer.
 *
 * @addr:   guest address of the buffer (converted with ppc64_phys_to_real)
 * @len:    size of the guest buffer in bytes
 * @val:    host pointer to the value
 * @vallen: length of the value in bytes
 *
 * The buffer receives a 16-bit big-endian copy of @vallen followed by at
 * most (@len - 2) bytes of @val.
 *
 * Returns RTAS_OUT_SYSPARM_PARAM_ERROR when the buffer cannot even hold
 * the 2-byte length field, RTAS_OUT_SUCCESS otherwise.
 */
static inline int sysparm_st(target_ulong addr, target_ulong len,
                             const void *val, uint16_t vallen)
{
    hwaddr phys = ppc64_phys_to_real(addr);
    target_ulong room;

    if (len < 2) {
        return RTAS_OUT_SYSPARM_PARAM_ERROR;
    }

    /* Space left for the payload once the length field is accounted for */
    room = len - 2;

    stw_be_phys(&address_space_memory, phys, vallen);
    cpu_physical_memory_write(phys + 2, val, MIN(room, vallen));
    return RTAS_OUT_SUCCESS;
}
/*
 * RTAS "ibm,get-system-parameter" handler.
 *
 * Arguments: 0 = parameter token, 1 = guest buffer address, 2 = buffer
 * length.  The selected parameter is written to the buffer via
 * sysparm_st() and its status is stored in return slot 0; unknown
 * parameters yield RTAS_OUT_NOT_SUPPORTED.  nargs/nret are not validated.
 */
static void rtas_ibm_get_system_parameter(PowerPCCPU *cpu,
                                          SpaprMachineState *spapr,
                                          uint32_t token, uint32_t nargs,
                                          target_ulong args,
                                          uint32_t nret, target_ulong rets)
{
    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
    MachineState *ms = MACHINE(spapr);
    target_ulong parameter = rtas_ld(args, 0);
    target_ulong buffer = rtas_ld(args, 1);
    target_ulong length = rtas_ld(args, 2);
    target_ulong ret;

    if (parameter == RTAS_SYSPARM_SPLPAR_CHARACTERISTICS) {
        char *param_val = g_strdup_printf("MaxEntCap=%d,"
                                          "DesMem=%" PRIu64 ","
                                          "DesProcs=%d,"
                                          "MaxPlatProcs=%d",
                                          ms->smp.max_cpus,
                                          ms->ram_size / MiB,
                                          ms->smp.cpus,
                                          ms->smp.max_cpus);

        if (pcc->n_host_threads > 0) {
            /*
             * Append the HostThrs property.  It is not part of PAPR, but
             * some guests expect it so they can scale information that
             * depends on the number of physical host threads per core.
             */
            char *hostthr_val = g_strdup_printf(",HostThrs=%d",
                                                pcc->n_host_threads);
            char *joined = g_strconcat(param_val, hostthr_val, NULL);

            g_free(hostthr_val);
            g_free(param_val);
            param_val = joined;
        }

        /* The terminating NUL is part of the stored value */
        ret = sysparm_st(buffer, length, param_val, strlen(param_val) + 1);
        g_free(param_val);
    } else if (parameter == RTAS_SYSPARM_DIAGNOSTICS_RUN_MODE) {
        uint8_t param_val = DIAGNOSTICS_RUN_MODE_DISABLED;

        ret = sysparm_st(buffer, length, &param_val, sizeof(param_val));
    } else if (parameter == RTAS_SYSPARM_UUID) {
        /* A zero-length value is stored when no UUID has been set */
        ret = sysparm_st(buffer, length, (unsigned char *)&qemu_uuid,
                         (qemu_uuid_set ? 16 : 0));
    } else {
        ret = RTAS_OUT_NOT_SUPPORTED;
    }

    rtas_st(rets, 0, ret);
}
/*
 * RTAS "ibm,set-system-parameter" handler.
 *
 * No parameter can actually be set: the parameters that the get call
 * knows about are answered with RTAS_OUT_NOT_AUTHORIZED, every other
 * token with RTAS_OUT_NOT_SUPPORTED.
 */
static void rtas_ibm_set_system_parameter(PowerPCCPU *cpu,
                                          SpaprMachineState *spapr,
                                          uint32_t token, uint32_t nargs,
                                          target_ulong args,
                                          uint32_t nret, target_ulong rets)
{
    target_ulong parameter = rtas_ld(args, 0);
    target_ulong ret;

    if (parameter == RTAS_SYSPARM_SPLPAR_CHARACTERISTICS ||
        parameter == RTAS_SYSPARM_DIAGNOSTICS_RUN_MODE ||
        parameter == RTAS_SYSPARM_UUID) {
        ret = RTAS_OUT_NOT_AUTHORIZED;
    } else {
        ret = RTAS_OUT_NOT_SUPPORTED;
    }

    rtas_st(rets, 0, ret);
}
/*
 * RTAS "ibm,os-term" handler.
 *
 * Argument 0 is the guest address of a message.  Up to 511 bytes are read
 * from it, NUL-terminated and reported, then the guest is flagged as
 * panicked and RTAS_OUT_SUCCESS is stored in return slot 0.
 */
static void rtas_ibm_os_term(PowerPCCPU *cpu,
                             SpaprMachineState *spapr,
                             uint32_t token, uint32_t nargs,
                             target_ulong args,
                             uint32_t nret, target_ulong rets)
{
    char msg[512];
    const size_t last = sizeof(msg) - 1;
    target_ulong msgaddr = rtas_ld(args, 0);

    /* Leave the final byte free for the terminator */
    cpu_physical_memory_read(msgaddr, msg, last);
    msg[last] = 0;

    error_report("OS terminated: %s", msg);
    qemu_system_guest_panicked(NULL);

    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
}
/*
 * RTAS "set-power-level" handler.
 *
 * Requires 2 arguments and 2 return slots (else RTAS_OUT_PARAM_ERROR).
 * Only power domain -1 is accepted; any other domain is answered with
 * RTAS_OUT_NOT_SUPPORTED.  On success return slot 1 holds the level 100.
 */
static void rtas_set_power_level(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                 uint32_t token, uint32_t nargs,
                                 target_ulong args, uint32_t nret,
                                 target_ulong rets)
{
    int32_t power_domain;

    if (nargs != 2 || nret != 2) {
        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
        return;
    }

    /*
     * Only a single, "live insert" power domain is used for
     * hotplugged/dlpar'd resources, so the power is always live/full (100).
     */
    power_domain = rtas_ld(args, 0);
    if (power_domain == -1) {
        rtas_st(rets, 0, RTAS_OUT_SUCCESS);
        rtas_st(rets, 1, 100);
    } else {
        rtas_st(rets, 0, RTAS_OUT_NOT_SUPPORTED);
    }
}
/*
 * RTAS "get-power-level" handler.
 *
 * Requires 1 argument and 2 return slots (else RTAS_OUT_PARAM_ERROR).
 * Only power domain -1 is accepted; any other domain is answered with
 * RTAS_OUT_NOT_SUPPORTED.  On success return slot 1 holds the level 100.
 */
static void rtas_get_power_level(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                 uint32_t token, uint32_t nargs,
                                 target_ulong args, uint32_t nret,
                                 target_ulong rets)
{
    int32_t power_domain;

    if (nargs != 1 || nret != 2) {
        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
        return;
    }

    /*
     * Only a single, "live insert" power domain is used for
     * hotplugged/dlpar'd resources, so the power is always live/full (100).
     */
    power_domain = rtas_ld(args, 0);
    if (power_domain == -1) {
        rtas_st(rets, 0, RTAS_OUT_SUCCESS);
        rtas_st(rets, 1, 100);
    } else {
        rtas_st(rets, 0, RTAS_OUT_NOT_SUPPORTED);
    }
}
/*
 * RTAS "ibm,nmi-register" handler.
 *
 * Arguments: 0 = system reset handler address, 1 = machine check handler
 * address.
 *
 * Return slot 0:
 *   RTAS_OUT_NOT_SUPPORTED - the FWNMI capability is off, no RTAS address
 *                            could be found, or (with KVM) enabling FWNMI
 *                            on the calling vCPU failed
 *   RTAS_OUT_PARAM_ERROR   - an address is outside the first 32M or the RMA
 *   RTAS_OUT_SUCCESS       - both addresses were recorded in @spapr
 */
static void rtas_ibm_nmi_register(PowerPCCPU *cpu,
                                  SpaprMachineState *spapr,
                                  uint32_t token, uint32_t nargs,
                                  target_ulong args,
                                  uint32_t nret, target_ulong rets)
{
    target_ulong sreset_addr, mce_addr;

    if (spapr_get_cap(spapr, SPAPR_CAP_FWNMI) == SPAPR_CAP_OFF ||
        !spapr_get_rtas_addr()) {
        rtas_st(rets, 0, RTAS_OUT_NOT_SUPPORTED);
        return;
    }

    sreset_addr = rtas_ld(args, 0);
    mce_addr = rtas_ld(args, 1);

    /* PAPR requires these are in the first 32M of memory and within RMA */
    if (!(sreset_addr < 32 * MiB && sreset_addr < spapr->rma_size &&
          mce_addr < 32 * MiB && mce_addr < spapr->rma_size)) {
        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
        return;
    }

    /* Under KVM, FWNMI is enabled on the vCPU that issued this call */
    if (kvm_enabled() && kvmppc_set_fwnmi(cpu) < 0) {
        rtas_st(rets, 0, RTAS_OUT_NOT_SUPPORTED);
        return;
    }

    spapr->fwnmi_system_reset_addr = sreset_addr;
    spapr->fwnmi_machine_check_addr = mce_addr;

    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
}
/*
 * RTAS "ibm,nmi-interlock" handler.
 *
 * Return slot 0:
 *   RTAS_OUT_NOT_SUPPORTED - the FWNMI capability is off
 *   RTAS_OUT_PARAM_ERROR   - no machine check handler has been registered
 *                            (fwnmi_machine_check_addr is still -1)
 *   RTAS_OUT_SUCCESS       - otherwise, including when the caller is not
 *                            the vCPU recorded in
 *                            fwnmi_machine_check_interlock (see below)
 *
 * When the caller is the recorded vCPU, the interlock is reset to -1, the
 * interlock condition variable is signalled and the FWNMI migration
 * blocker is removed.
 */
static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
SpaprMachineState *spapr,
uint32_t token, uint32_t nargs,
target_ulong args,
uint32_t nret, target_ulong rets)
{
if (spapr_get_cap(spapr, SPAPR_CAP_FWNMI) == SPAPR_CAP_OFF) {
rtas_st(rets, 0, RTAS_OUT_NOT_SUPPORTED);
return;
}
if (spapr->fwnmi_machine_check_addr == -1) {
qemu_log_mask(LOG_GUEST_ERROR,
"FWNMI: ibm,nmi-interlock RTAS called with FWNMI not registered.\n");
/* NMI register not called */
rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
return;
}
if (spapr->fwnmi_machine_check_interlock != cpu->vcpu_id) {
/*
* The vCPU that hit the NMI should invoke "ibm,nmi-interlock"
* This should be PARAM_ERROR, but Linux calls "ibm,nmi-interlock"
* for system reset interrupts, despite them not being interlocked.
* PowerVM silently ignores this and returns success here. Returning
* failure causes Linux to print the error "FWNMI: nmi-interlock
* failed: -3", although no other apparent ill effects, this is a
* regression for the user when enabling FWNMI. So for now, match
* PowerVM. When most Linux clients are fixed, this could be
* changed.
*/
rtas_st(rets, 0, RTAS_OUT_SUCCESS);
return;
}
/*
* vCPU issuing "ibm,nmi-interlock" is done with NMI handling,
* hence unset fwnmi_machine_check_interlock.
*/
spapr->fwnmi_machine_check_interlock = -1;
/* Signal the interlock condition now that the interlock is released */
qemu_cond_signal(&spapr->fwnmi_machine_check_interlock_cond);
rtas_st(rets, 0, RTAS_OUT_SUCCESS);
/* Remove the FWNMI migration blocker now that the interlock is released */
migrate_del_blocker(spapr->fwnmi_migration_blocker);
}
/*
 * Table of RTAS calls, indexed by (token - RTAS_TOKEN_BASE).
 *
 * Being a static object, every slot starts out with a NULL name and a
 * NULL fn; slots are filled in by spapr_rtas_register().
 */
static struct rtas_call {
const char *name; /* RTAS call name, or NULL when the slot is unused */
spapr_rtas_fn fn; /* handler invoked by spapr_rtas_call() */
} rtas_table[RTAS_TOKEN_MAX - RTAS_TOKEN_BASE];
/*
 * Dispatch an RTAS call to the handler registered for @token.
 *
 * Returns H_SUCCESS when a handler ran (the handler itself stores the RTAS
 * status in the return buffer).  For an unknown token, RTAS_OUT_PARAM_ERROR
 * is stored in return slot 0 and H_PARAMETER is returned.
 */
target_ulong spapr_rtas_call(PowerPCCPU *cpu, SpaprMachineState *spapr,
                             uint32_t token, uint32_t nargs, target_ulong args,
                             uint32_t nret, target_ulong rets)
{
    if (token >= RTAS_TOKEN_BASE && token < RTAS_TOKEN_MAX) {
        spapr_rtas_fn handler = rtas_table[token - RTAS_TOKEN_BASE].fn;

        if (handler) {
            handler(cpu, spapr, token, nargs, args, nret, rets);
            return H_SUCCESS;
        }
    }

    /*
     * HACK: Some Linux early debug code uses RTAS display-character,
     * but assumes the token value is 0xa (which it is on some real
     * machines) without looking it up in the device tree.  This
     * special case makes this work.
     */
    if (token == 0xa) {
        rtas_display_character(cpu, spapr, 0xa, nargs, args, nret, rets);
        return H_SUCCESS;
    }

    hcall_dprintf("Unknown RTAS token 0x%x\n", token);
    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
    return H_PARAMETER;
}
/*
 * Invoke an RTAS call by name on behalf of qtest.
 *
 * @cmd:   name of the RTAS call, as passed to spapr_rtas_register()
 * @nargs: number of arguments
 * @args:  guest address of the argument buffer
 * @nret:  number of return slots
 * @rets:  guest address of the return buffer
 *
 * The handler runs on first_cpu.
 *
 * Returns H_SUCCESS when a registered call named @cmd was found and run,
 * H_PARAMETER otherwise.
 */
uint64_t qtest_rtas_call(char *cmd, uint32_t nargs, uint64_t args,
                         uint32_t nret, uint64_t rets)
{
    int token;

    for (token = 0; token < RTAS_TOKEN_MAX - RTAS_TOKEN_BASE; token++) {
        struct rtas_call *call = &rtas_table[token];

        /*
         * Slots that were never registered (or were registered with a NULL
         * name) must be skipped: strcmp() on a NULL pointer is undefined
         * behaviour, and there is no handler to call.
         */
        if (!call->name || !call->fn) {
            continue;
        }

        if (strcmp(cmd, call->name) == 0) {
            SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
            PowerPCCPU *cpu = POWERPC_CPU(first_cpu);

            call->fn(cpu, spapr, token + RTAS_TOKEN_BASE,
                     nargs, args, nret, rets);
            return H_SUCCESS;
        }
    }
    return H_PARAMETER;
}
/*
 * Register (or clear) the RTAS call for @token.
 *
 * @token: must lie in [RTAS_TOKEN_BASE, RTAS_TOKEN_MAX)
 * @name:  call name; a non-NULL name may only be installed into a slot
 *         whose name is currently NULL
 * @fn:    handler for the call
 *
 * Both conditions are enforced with assert().
 */
void spapr_rtas_register(int token, const char *name, spapr_rtas_fn fn)
{
    struct rtas_call *slot;

    assert((token >= RTAS_TOKEN_BASE) && (token < RTAS_TOKEN_MAX));

    /* The table is indexed relative to RTAS_TOKEN_BASE */
    token -= RTAS_TOKEN_BASE;

    assert(!name || !rtas_table[token].name);

    slot = &rtas_table[token];
    slot->name = name;
    slot->fn = fn;
}
/*
 * Add one cell property per named RTAS call to device tree node @rtas of
 * @fdt: the property name is the call name, its value the call's token.
 * Table slots without a name are skipped.
 */
void spapr_dt_rtas_tokens(void *fdt, int rtas)
{
    int i;

    for (i = 0; i < RTAS_TOKEN_MAX - RTAS_TOKEN_BASE; i++) {
        struct rtas_call *call = rtas_table + i;

        if (call->name) {
            _FDT(fdt_setprop_cell(fdt, rtas, call->name, i + RTAS_TOKEN_BASE));
        }
    }
}
hwaddr spapr_get_rtas_addr(void)
{
SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
int rtas_node;
const fdt32_t *rtas_data;
void *fdt = spapr->fdt_blob;
/* fetch rtas addr from fdt */
rtas_node = fdt_path_offset(fdt, "/rtas");
if (rtas_node < 0) {
return 0;
}
rtas_data = fdt_getprop(fdt, rtas_node, "linux,rtas-base", NULL);
if (!rtas_data) {
return 0;
}
/*
* We assume that the OS called RTAS instantiate-rtas, but some other
* OS might call RTAS instantiate-rtas-64 instead. This fine as of now
* as SLOF only supports 32-bit variant.
*/
return (hwaddr)fdt32_to_cpu(*rtas_data);
}
static void core_rtas_register_types(void)
{
spapr_rtas_register(RTAS_DISPLAY_CHARACTER, "display-character",
rtas_display_character);
spapr_rtas_register(RTAS_POWER_OFF, "power-off", rtas_power_off);
spapr_rtas_register(RTAS_SYSTEM_REBOOT, "system-reboot",
rtas_system_reboot);
spapr_rtas_register(RTAS_QUERY_CPU_STOPPED_STATE, "query-cpu-stopped-state",
rtas_query_cpu_stopped_state);
spapr_rtas_register(RTAS_START_CPU, "start-cpu", rtas_start_cpu);
spapr_rtas_register(RTAS_STOP_SELF, "stop-self", rtas_stop_self);
spapr_rtas_register(RTAS_IBM_SUSPEND_ME, "ibm,suspend-me",
rtas_ibm_suspend_me);
spapr_rtas_register(RTAS_IBM_GET_SYSTEM_PARAMETER,
"ibm,get-system-parameter",
rtas_ibm_get_system_parameter);
spapr_rtas_register(RTAS_IBM_SET_SYSTEM_PARAMETER,
"ibm,set-system-parameter",
rtas_ibm_set_system_parameter);
spapr_rtas_register(RTAS_IBM_OS_TERM, "ibm,os-term",
rtas_ibm_os_term);
spapr_rtas_register(RTAS_SET_POWER_LEVEL, "set-power-level",
rtas_set_power_level);
spapr_rtas_register(RTAS_GET_POWER_LEVEL, "get-power-level",
rtas_get_power_level);
spapr_rtas_register(RTAS_IBM_NMI_REGISTER, "ibm,nmi-register",
rtas_ibm_nmi_register);
spapr_rtas_register(RTAS_IBM_NMI_INTERLOCK, "ibm,nmi-interlock",
rtas_ibm_nmi_interlock);
}
type_init(core_rtas_register_types)