diff --git a/docs/specs/ppc-spapr-numa.rst b/docs/specs/ppc-spapr-numa.rst index e762038022..5fca2bdd8e 100644 --- a/docs/specs/ppc-spapr-numa.rst +++ b/docs/specs/ppc-spapr-numa.rst @@ -158,9 +158,235 @@ kernel tree). This results in the following distances: * resources four NUMA levels apart: 160 -Consequences for QEMU NUMA tuning +pseries NUMA mechanics +====================== + +Starting in QEMU 5.2, the pseries machine considers user input when setting NUMA +topology of the guest. The overall design is: + +* ibm,associativity-reference-points is set to {0x4, 0x3, 0x2, 0x1}, allowing + for 4 distinct NUMA distance values based on the NUMA levels + +* ibm,max-associativity-domains supports multiple associativity domains in all + NUMA levels, granting user flexibility + +* ibm,associativity for all resources varies with user input + +These changes are only effective for pseries-5.2 and newer machines that are +created with more than one NUMA node (disconsidering NUMA nodes created by +the machine itself, e.g. NVLink 2 GPUs). The now legacy support has been +around for such a long time, with users seeing NUMA distances 10 and 40 +(and 80 if using NVLink2 GPUs), and there is no need to disrupt the +existing experience of those guests. + +To bring the user experience x86 users have when tuning up NUMA, we had +to operate under the current pseries Linux kernel logic described in +`How the pseries Linux guest calculates NUMA distances`_. The result +is that we needed to translate NUMA distance user input to pseries +Linux kernel input. + +Translating user distance to kernel distance +-------------------------------------------- + +User input for NUMA distance can vary from 10 to 254. We need to translate +that to the values that the Linux kernel operates on (10, 20, 40, 80, 160). +This is how it is being done: + +* user distance 11 to 30 will be interpreted as 20 +* user distance 31 to 60 will be interpreted as 40 +* user distance 61 to 120 will be interpreted as 80 +* user distance 121 and beyond will be interpreted as 160 +* user distance 10 stays 10 + +The reasoning behind this aproximation is to avoid any round up to the local +distance (10), keeping it exclusive to the 4th NUMA level (which is still +exclusive to the node_id). All other ranges were chosen under the developer +discretion of what would be (somewhat) sensible considering the user input. +Any other strategy can be used here, but in the end the reality is that we'll +have to accept that a large array of values will be translated to the same +NUMA topology in the guest, e.g. this user input: + +:: + + 0 1 2 + 0 10 31 120 + 1 31 10 30 + 2 120 30 10 + +And this other user input: + +:: + + 0 1 2 + 0 10 60 61 + 1 60 10 11 + 2 61 11 10 + +Will both be translated to the same values internally: + +:: + + 0 1 2 + 0 10 40 80 + 1 40 10 20 + 2 80 20 10 + +Users are encouraged to use only the kernel values in the NUMA definition to +avoid being taken by surprise with that the guest is actually seeing in the +topology. There are enough potential surprises that are inherent to the +associativity domain assignment process, discussed below. + + +How associativity domains are assigned +-------------------------------------- + +LOPAPR allows more than one associativity array (or 'string') per allocated +resource. This would be used to represent that the resource has multiple +connections with the board, and then the operational system, when deciding +NUMA distancing, should consider the associativity information that provides +the shortest distance. + +The spapr implementation does not support multiple associativity arrays per +resource, neither does the pseries Linux kernel. We'll have to represent the +NUMA topology using one associativity per resource, which means that choices +and compromises are going to be made. + +Consider the following NUMA topology entered by user input: + +:: + + 0 1 2 3 + 0 10 40 20 40 + 1 40 10 80 40 + 2 20 80 10 20 + 3 40 40 20 10 + +All the associativity arrays are initialized with NUMA id in all associativity +domains: + +* node 0: 0 0 0 0 +* node 1: 1 1 1 1 +* node 2: 2 2 2 2 +* node 3: 3 3 3 3 + + +Honoring just the relative distances of node 0 to every other node, we find the +NUMA level matches (considering the reference points {0x4, 0x3, 0x2, 0x1}) for +each distance: + +* distance from 0 to 1 is 40 (no match at 0x4 and 0x3, will match + at 0x2) +* distance from 0 to 2 is 20 (no match at 0x4, will match at 0x3) +* distance from 0 to 3 is 40 (no match at 0x4 and 0x3, will match + at 0x2) + +We'll copy the associativity domains of node 0 to all other nodes, based on +the NUMA level matches. Between 0 and 1, a match in 0x2, we'll also copy +the domains 0x2 and 0x1 from 0 to 1 as well. This will give us: + +* node 0: 0 0 0 0 +* node 1: 0 0 1 1 + +Doing the same to node 2 and node 3, these are the associativity arrays +after considering all matches with node 0: + +* node 0: 0 0 0 0 +* node 1: 0 0 1 1 +* node 2: 0 0 0 2 +* node 3: 0 0 3 3 + +The distances related to node 0 are accounted for. For node 1, and keeping +in mind that we don't need to revisit node 0 again, the distance from +node 1 to 2 is 80, matching at 0x1, and distance from 1 to 3 is 40, +match in 0x2. Repeating the same logic of copying all domains up to +the NUMA level match: + +* node 0: 0 0 0 0 +* node 1: 1 0 1 1 +* node 2: 1 0 0 2 +* node 3: 1 0 3 3 + +In the last step we will analyze just nodes 2 and 3. The desired distance +between 2 and 3 is 20, i.e. a match in 0x3: + +* node 0: 0 0 0 0 +* node 1: 1 0 1 1 +* node 2: 1 0 0 2 +* node 3: 1 0 0 3 + + +The kernel will read these arrays and will calculate the following NUMA topology for +the guest: + +:: + + 0 1 2 3 + 0 10 40 20 20 + 1 40 10 40 40 + 2 20 40 10 20 + 3 20 40 20 10 + +Note that this is not what the user wanted - the desired distance between +0 and 3 is 40, we calculated it as 20. This is what the current logic and +implementation constraints of the kernel and QEMU will provide inside the +LOPAPR specification. + +Users are welcome to use this knowledge and experiment with the input to get +the NUMA topology they want, or as closer as they want. The important thing +is to keep expectations up to par with what we are capable of provide at this +moment: an approximation. + +Limitations of the implementation --------------------------------- +As mentioned above, the pSeries NUMA distance logic is, in fact, a way to approximate +user choice. The Linux kernel, and PAPR itself, does not provide QEMU with the ways +to fully map user input to actual NUMA distance the guest will use. These limitations +creates two notable limitations in our support: + +* Asymmetrical topologies aren't supported. We only support NUMA topologies where + the distance from node A to B is always the same as B to A. We do not support + any A-B pair where the distance back and forth is asymmetric. For example, the + following topology isn't supported and the pSeries guest will not boot with this + user input: + +:: + + 0 1 + 0 10 40 + 1 20 10 + + +* 'non-transitive' topologies will be poorly translated to the guest. This is the + kind of topology where the distance from a node A to B is X, B to C is X, but + the distance A to C is not X. E.g.: + +:: + + 0 1 2 3 + 0 10 20 20 40 + 1 20 10 80 40 + 2 20 80 10 20 + 3 40 40 20 10 + + In the example above, distance 0 to 2 is 20, 2 to 3 is 20, but 0 to 3 is 40. + The kernel will always match with the shortest associativity domain possible, + and we're attempting to retain the previous established relations between the + nodes. This means that a distance equal to 20 between nodes 0 and 2 and the + same distance 20 between nodes 2 and 3 will cause the distance between 0 and 3 + to also be 20. + + +Legacy (5.1 and older) pseries NUMA mechanics +============================================= + +In short, we can summarize the NUMA distances seem in pseries Linux guests, using +QEMU up to 5.1, as follows: + +* local distance, i.e. the distance of the resource to its own NUMA node: 10 +* if it's a NVLink GPU device, distance: 80 +* every other resource, distance: 40 + The way the pseries Linux guest calculates NUMA distances has a direct effect on what QEMU users can expect when doing NUMA tuning. As of QEMU 5.1, this is the default ibm,associativity-reference-points being used in the pseries @@ -180,12 +406,5 @@ as far as NUMA distance goes: to the same third NUMA level, having distance = 40 * for NVLink GPUs, distance = 80 from everything else -In short, we can summarize the NUMA distances seem in pseries Linux guests, using -QEMU up to 5.1, as follows: - -* local distance, i.e. the distance of the resource to its own NUMA node: 10 -* if it's a NVLink GPU device, distance: 80 -* every other resource, distance: 40 - This also means that user input in QEMU command line does not change the NUMA distancing inside the guest for the pseries machine. diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c index 6670967e26..d9e52873ea 100644 --- a/hw/ppc/pnv.c +++ b/hw/ppc/pnv.c @@ -61,7 +61,7 @@ #define FW_FILE_NAME "skiboot.lid" #define FW_LOAD_ADDR 0x0 -#define FW_MAX_SIZE (4 * MiB) +#define FW_MAX_SIZE (16 * MiB) #define KERNEL_LOAD_ADDR 0x20000000 #define KERNEL_MAX_SIZE (256 * MiB) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 2db810f73a..63315f2d0f 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -294,6 +294,15 @@ static hwaddr spapr_node0_size(MachineState *machine) return machine->ram_size; } +bool spapr_machine_using_legacy_numa(SpaprMachineState *spapr) +{ + MachineState *machine = MACHINE(spapr); + SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); + + return smc->pre_5_2_numa_associativity || + machine->numa_state->num_nodes <= 1; +} + static void add_str(GString *s, const gchar *s1) { g_string_append_len(s, s1, strlen(s1) + 1); @@ -1483,6 +1492,12 @@ void spapr_reallocate_hpt(SpaprMachineState *spapr, int shift, spapr_free_hpt(spapr); rc = kvmppc_reset_htab(shift); + + if (rc == -EOPNOTSUPP) { + error_setg(errp, "HPT not supported in nested guests"); + return; + } + if (rc < 0) { /* kernel-side HPT needed, but couldn't allocate one */ error_setg_errno(errp, errno, @@ -3365,22 +3380,19 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size, int i; uint64_t addr = addr_start; bool hotplugged = spapr_drc_hotplugged(dev); - Error *local_err = NULL; for (i = 0; i < nr_lmbs; i++) { drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / SPAPR_MEMORY_BLOCK_SIZE); g_assert(drc); - spapr_drc_attach(drc, dev, &local_err); - if (local_err) { + if (!spapr_drc_attach(drc, dev, errp)) { while (addr > addr_start) { addr -= SPAPR_MEMORY_BLOCK_SIZE; drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / SPAPR_MEMORY_BLOCK_SIZE); spapr_drc_detach(drc); } - error_propagate(errp, local_err); return; } if (!hotplugged) { @@ -3475,9 +3487,7 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, } if (is_nvdimm) { - spapr_nvdimm_validate(hotplug_dev, NVDIMM(dev), size, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!spapr_nvdimm_validate(hotplug_dev, NVDIMM(dev), size, errp)) { return; } } else if (size % SPAPR_MEMORY_BLOCK_SIZE) { @@ -3489,9 +3499,7 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP, &error_abort); pagesize = host_memory_backend_pagesize(MEMORY_BACKEND(memdev)); - spapr_check_pagesize(spapr, pagesize, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!spapr_check_pagesize(spapr, pagesize, errp)) { return; } @@ -3761,7 +3769,6 @@ static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev, CPUCore *cc = CPU_CORE(dev); CPUState *cs; SpaprDrc *drc; - Error *local_err = NULL; CPUArchId *core_slot; int index; bool hotplugged = spapr_drc_hotplugged(dev); @@ -3779,9 +3786,7 @@ static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev, g_assert(drc || !mc->has_hotpluggable_cpus); if (drc) { - spapr_drc_attach(drc, dev, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!spapr_drc_attach(drc, dev, errp)) { return; } @@ -3811,10 +3816,9 @@ static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev, */ if (hotplugged) { for (i = 0; i < cc->nr_threads; i++) { - ppc_set_compat(core->threads[i], POWERPC_CPU(first_cpu)->compat_pvr, - &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (ppc_set_compat(core->threads[i], + POWERPC_CPU(first_cpu)->compat_pvr, + errp) < 0) { return; } } @@ -3934,7 +3938,6 @@ static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev, SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev); SpaprDrc *drc; bool hotplugged = spapr_drc_hotplugged(dev); - Error *local_err = NULL; if (!smc->dr_phb_enabled) { return; @@ -3944,9 +3947,7 @@ static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev, /* hotplug hooks should check it's enabled before getting this far */ assert(drc); - spapr_drc_attach(drc, dev, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!spapr_drc_attach(drc, dev, errp)) { return; } @@ -4290,7 +4291,7 @@ int spapr_get_vcpu_id(PowerPCCPU *cpu) return cpu->vcpu_id; } -void spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp) +bool spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp) { SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); MachineState *ms = MACHINE(spapr); @@ -4303,10 +4304,11 @@ void spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp) error_append_hint(errp, "Adjust the number of cpus to %d " "or try to raise the number of threads per core\n", vcpu_id * ms->smp.threads / spapr->vsmt); - return; + return false; } cpu->vcpu_id = vcpu_id; + return true; } PowerPCCPU *spapr_find_cpu(int vcpu_id) @@ -4526,8 +4528,11 @@ DEFINE_SPAPR_MACHINE(5_2, "5.2", true); */ static void spapr_machine_5_1_class_options(MachineClass *mc) { + SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); + spapr_machine_5_2_class_options(mc); compat_props_add(mc->compat_props, hw_compat_5_1, hw_compat_5_1_len); + smc->pre_5_2_numa_associativity = true; } DEFINE_SPAPR_MACHINE(5_1, "5.1", false); diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c index 10a80a8159..9341e9782a 100644 --- a/hw/ppc/spapr_caps.c +++ b/hw/ppc/spapr_caps.c @@ -310,13 +310,13 @@ static void cap_safe_indirect_branch_apply(SpaprMachineState *spapr, #define VALUE_DESC_TRISTATE " (broken, workaround, fixed)" -void spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize, +bool spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize, Error **errp) { hwaddr maxpagesize = (1ULL << spapr->eff.caps[SPAPR_CAP_HPT_MAXPAGESIZE]); if (!kvmppc_hpt_needs_host_contiguous_pages()) { - return; + return true; } if (maxpagesize > pagesize) { @@ -324,7 +324,10 @@ void spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize, "Can't support %"HWADDR_PRIu" kiB guest pages with %" HWADDR_PRIu" kiB host pages with this KVM implementation", maxpagesize >> 10, pagesize >> 10); + return false; } + + return true; } static void cap_hpt_maxpagesize_apply(SpaprMachineState *spapr, diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c index 2125fdac34..b03620823a 100644 --- a/hw/ppc/spapr_cpu_core.c +++ b/hw/ppc/spapr_cpu_core.c @@ -227,15 +227,14 @@ static void spapr_cpu_core_unrealize(DeviceState *dev) g_free(sc->threads); } -static void spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr, +static bool spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr, SpaprCpuCore *sc, Error **errp) { CPUPPCState *env = &cpu->env; CPUState *cs = CPU(cpu); - Error *local_err = NULL; if (!qdev_realize(DEVICE(cpu), NULL, errp)) { - return; + return false; } /* Set time-base frequency to 512 MHz */ @@ -244,15 +243,16 @@ static void spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr, cpu_ppc_set_vhyp(cpu, PPC_VIRTUAL_HYPERVISOR(spapr)); kvmppc_set_papr(cpu); - if (spapr_irq_cpu_intc_create(spapr, cpu, &local_err) < 0) { + if (spapr_irq_cpu_intc_create(spapr, cpu, errp) < 0) { cpu_remove_sync(CPU(cpu)); - return; + return false; } if (!sc->pre_3_0_migration) { vmstate_register(NULL, cs->cpu_index, &vmstate_spapr_cpu_state, cpu->machine_data); } + return true; } static PowerPCCPU *spapr_create_vcpu(SpaprCpuCore *sc, int i, Error **errp) @@ -263,7 +263,6 @@ static PowerPCCPU *spapr_create_vcpu(SpaprCpuCore *sc, int i, Error **errp) char *id; CPUState *cs; PowerPCCPU *cpu; - Error *local_err = NULL; obj = object_new(scc->cpu_type); @@ -275,8 +274,7 @@ static PowerPCCPU *spapr_create_vcpu(SpaprCpuCore *sc, int i, Error **errp) */ cs->start_powered_off = true; cs->cpu_index = cc->core_id + i; - spapr_set_vcpu_id(cpu, cs->cpu_index, &local_err); - if (local_err) { + if (!spapr_set_vcpu_id(cpu, cs->cpu_index, errp)) { goto err; } @@ -293,7 +291,6 @@ static PowerPCCPU *spapr_create_vcpu(SpaprCpuCore *sc, int i, Error **errp) err: object_unref(obj); - error_propagate(errp, local_err); return NULL; } @@ -316,7 +313,6 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp) TYPE_SPAPR_MACHINE); SpaprCpuCore *sc = SPAPR_CPU_CORE(OBJECT(dev)); CPUCore *cc = CPU_CORE(OBJECT(dev)); - Error *local_err = NULL; int i, j; if (!spapr) { @@ -326,15 +322,14 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp) sc->threads = g_new(PowerPCCPU *, cc->nr_threads); for (i = 0; i < cc->nr_threads; i++) { - sc->threads[i] = spapr_create_vcpu(sc, i, &local_err); - if (local_err) { + sc->threads[i] = spapr_create_vcpu(sc, i, errp); + if (!sc->threads[i]) { goto err; } } for (j = 0; j < cc->nr_threads; j++) { - spapr_realize_vcpu(sc->threads[j], spapr, sc, &local_err); - if (local_err) { + if (!spapr_realize_vcpu(sc->threads[j], spapr, sc, errp)) { goto err_unrealize; } } @@ -351,7 +346,6 @@ err: spapr_delete_vcpu(sc->threads[i], sc); } g_free(sc->threads); - error_propagate(errp, local_err); } static Property spapr_cpu_core_properties[] = { diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c index fe998d8108..697b28c343 100644 --- a/hw/ppc/spapr_drc.c +++ b/hw/ppc/spapr_drc.c @@ -302,7 +302,6 @@ static void prop_get_fdt(Object *obj, Visitor *v, const char *name, { SpaprDrc *drc = SPAPR_DR_CONNECTOR(obj); QNull *null = NULL; - Error *err = NULL; int fdt_offset_next, fdt_offset, fdt_depth; void *fdt; @@ -321,6 +320,7 @@ static void prop_get_fdt(Object *obj, Visitor *v, const char *name, const struct fdt_property *prop = NULL; int prop_len = 0, name_len = 0; uint32_t tag; + bool ok; tag = fdt_next_tag(fdt, fdt_offset, &fdt_offset_next); switch (tag) { @@ -334,10 +334,9 @@ static void prop_get_fdt(Object *obj, Visitor *v, const char *name, case FDT_END_NODE: /* shouldn't ever see an FDT_END_NODE before FDT_BEGIN_NODE */ g_assert(fdt_depth > 0); - visit_check_struct(v, &err); + ok = visit_check_struct(v, errp); visit_end_struct(v, NULL); - if (err) { - error_propagate(errp, err); + if (!ok) { return; } fdt_depth--; @@ -355,10 +354,9 @@ static void prop_get_fdt(Object *obj, Visitor *v, const char *name, return; } } - visit_check_list(v, &err); + ok = visit_check_list(v, errp); visit_end_list(v, NULL); - if (err) { - error_propagate(errp, err); + if (!ok) { return; } break; @@ -371,13 +369,13 @@ static void prop_get_fdt(Object *obj, Visitor *v, const char *name, } while (fdt_depth != 0); } -void spapr_drc_attach(SpaprDrc *drc, DeviceState *d, Error **errp) +bool spapr_drc_attach(SpaprDrc *drc, DeviceState *d, Error **errp) { trace_spapr_drc_attach(spapr_drc_index(drc)); if (drc->dev) { error_setg(errp, "an attached device is still awaiting release"); - return; + return false; } g_assert((drc->state == SPAPR_DRC_STATE_LOGICAL_UNUSABLE) || (drc->state == SPAPR_DRC_STATE_PHYSICAL_POWERON)); @@ -388,6 +386,7 @@ void spapr_drc_attach(SpaprDrc *drc, DeviceState *d, Error **errp) object_get_typename(OBJECT(drc->dev)), (Object **)(&drc->dev), NULL, 0); + return true; } static void spapr_drc_release(SpaprDrc *drc) diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index c2776b6a7d..607740150f 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -1590,12 +1590,11 @@ static target_ulong h_signal_sys_reset(PowerPCCPU *cpu, } } -static uint32_t cas_check_pvr(SpaprMachineState *spapr, PowerPCCPU *cpu, - target_ulong *addr, bool *raw_mode_supported, - Error **errp) +/* Returns either a logical PVR or zero if none was found */ +static uint32_t cas_check_pvr(PowerPCCPU *cpu, uint32_t max_compat, + target_ulong *addr, bool *raw_mode_supported) { bool explicit_match = false; /* Matched the CPU's real PVR */ - uint32_t max_compat = spapr->max_compat_pvr; uint32_t best_compat = 0; int i; @@ -1624,14 +1623,6 @@ static uint32_t cas_check_pvr(SpaprMachineState *spapr, PowerPCCPU *cpu, } } - if ((best_compat == 0) && (!explicit_match || max_compat)) { - /* We couldn't find a suitable compatibility mode, and either - * the guest doesn't support "raw" mode for this CPU, or raw - * mode is disabled because a maximum compat mode is set */ - error_setg(errp, "Couldn't negotiate a suitable PVR during CAS"); - return 0; - } - *raw_mode_supported = explicit_match; /* Parsing finished */ @@ -1675,11 +1666,11 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu, uint32_t cas_pvr; SpaprOptionVector *ov1_guest, *ov5_guest; bool guest_radix; - Error *local_err = NULL; bool raw_mode_supported = false; bool guest_xive; CPUState *cs; void *fdt; + uint32_t max_compat = spapr->max_compat_pvr; /* CAS is supposed to be called early when only the boot vCPU is active. */ CPU_FOREACH(cs) { @@ -1692,16 +1683,22 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu, } } - cas_pvr = cas_check_pvr(spapr, cpu, &vec, &raw_mode_supported, &local_err); - if (local_err) { - error_report_err(local_err); + cas_pvr = cas_check_pvr(cpu, max_compat, &vec, &raw_mode_supported); + if (!cas_pvr && (!raw_mode_supported || max_compat)) { + /* + * We couldn't find a suitable compatibility mode, and either + * the guest doesn't support "raw" mode for this CPU, or "raw" + * mode is disabled because a maximum compat mode is set. + */ + error_report("Couldn't negotiate a suitable PVR during CAS"); return H_HARDWARE; } /* Update CPUs */ if (cpu->compat_pvr != cas_pvr) { - ppc_set_compat_all(cas_pvr, &local_err); - if (local_err) { + Error *local_err = NULL; + + if (ppc_set_compat_all(cas_pvr, &local_err) < 0) { /* We fail to set compat mode (likely because running with KVM PR), * but maybe we can fallback to raw mode if the guest supports it. */ @@ -1710,7 +1707,6 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu, return H_HARDWARE; } error_free(local_err); - local_err = NULL; } } diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index 64fe567f5d..b50796bbe3 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -19,12 +19,126 @@ /* Moved from hw/ppc/spapr_pci_nvlink2.c */ #define SPAPR_GPU_NUMA_ID (cpu_to_be32(1)) +static bool spapr_numa_is_symmetrical(MachineState *ms) +{ + int src, dst; + int nb_numa_nodes = ms->numa_state->num_nodes; + NodeInfo *numa_info = ms->numa_state->nodes; + + for (src = 0; src < nb_numa_nodes; src++) { + for (dst = src; dst < nb_numa_nodes; dst++) { + if (numa_info[src].distance[dst] != + numa_info[dst].distance[src]) { + return false; + } + } + } + + return true; +} + +/* + * This function will translate the user distances into + * what the kernel understand as possible values: 10 + * (local distance), 20, 40, 80 and 160, and return the equivalent + * NUMA level for each. Current heuristic is: + * - local distance (10) returns numa_level = 0x4, meaning there is + * no rounding for local distance + * - distances between 11 and 30 inclusive -> rounded to 20, + * numa_level = 0x3 + * - distances between 31 and 60 inclusive -> rounded to 40, + * numa_level = 0x2 + * - distances between 61 and 120 inclusive -> rounded to 80, + * numa_level = 0x1 + * - everything above 120 returns numa_level = 0 to indicate that + * there is no match. This will be calculated as disntace = 160 + * by the kernel (as of v5.9) + */ +static uint8_t spapr_numa_get_numa_level(uint8_t distance) +{ + if (distance == 10) { + return 0x4; + } else if (distance > 11 && distance <= 30) { + return 0x3; + } else if (distance > 31 && distance <= 60) { + return 0x2; + } else if (distance > 61 && distance <= 120) { + return 0x1; + } + + return 0; +} + +static void spapr_numa_define_associativity_domains(SpaprMachineState *spapr) +{ + MachineState *ms = MACHINE(spapr); + NodeInfo *numa_info = ms->numa_state->nodes; + int nb_numa_nodes = ms->numa_state->num_nodes; + int src, dst, i; + + for (src = 0; src < nb_numa_nodes; src++) { + for (dst = src; dst < nb_numa_nodes; dst++) { + /* + * This is how the associativity domain between A and B + * is calculated: + * + * - get the distance D between them + * - get the correspondent NUMA level 'n_level' for D + * - all associativity arrays were initialized with their own + * numa_ids, and we're calculating the distance in node_id + * ascending order, starting from node id 0 (the first node + * retrieved by numa_state). This will have a cascade effect in + * the algorithm because the associativity domains that node 0 + * defines will be carried over to other nodes, and node 1 + * associativities will be carried over after taking node 0 + * associativities into account, and so on. This happens because + * we'll assign assoc_src as the associativity domain of dst + * as well, for all NUMA levels beyond and including n_level. + * + * The PPC kernel expects the associativity domains of node 0 to + * be always 0, and this algorithm will grant that by default. + */ + uint8_t distance = numa_info[src].distance[dst]; + uint8_t n_level = spapr_numa_get_numa_level(distance); + uint32_t assoc_src; + + /* + * n_level = 0 means that the distance is greater than our last + * rounded value (120). In this case there is no NUMA level match + * between src and dst and we can skip the remaining of the loop. + * + * The Linux kernel will assume that the distance between src and + * dst, in this case of no match, is 10 (local distance) doubled + * for each NUMA it didn't match. We have MAX_DISTANCE_REF_POINTS + * levels (4), so this gives us 10*2*2*2*2 = 160. + * + * This logic can be seen in the Linux kernel source code, as of + * v5.9, in arch/powerpc/mm/numa.c, function __node_distance(). + */ + if (n_level == 0) { + continue; + } + + /* + * We must assign all assoc_src to dst, starting from n_level + * and going up to 0x1. + */ + for (i = n_level; i > 0; i--) { + assoc_src = spapr->numa_assoc_array[src][i]; + spapr->numa_assoc_array[dst][i] = assoc_src; + } + } + } + +} + void spapr_numa_associativity_init(SpaprMachineState *spapr, MachineState *machine) { SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); int nb_numa_nodes = machine->numa_state->num_nodes; int i, j, max_nodes_with_gpus; + bool using_legacy_numa = spapr_machine_using_legacy_numa(spapr); /* * For all associativity arrays: first position is the size, @@ -38,6 +152,17 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr, for (i = 0; i < nb_numa_nodes; i++) { spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i); + + /* + * Fill all associativity domains of non-zero NUMA nodes with + * node_id. This is required because the default value (0) is + * considered a match with associativity domains of node 0. + */ + if (!using_legacy_numa && i != 0) { + for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) { + spapr->numa_assoc_array[i][j] = cpu_to_be32(i); + } + } } /* @@ -61,6 +186,23 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr, spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i); } + + /* + * Legacy NUMA guests (pseries-5.1 and older, or guests with only + * 1 NUMA node) will not benefit from anything we're going to do + * after this point. + */ + if (using_legacy_numa) { + return; + } + + if (!spapr_numa_is_symmetrical(machine)) { + error_report("Asymmetrical NUMA topologies aren't supported " + "in the pSeries machine"); + exit(EXIT_FAILURE); + } + + spapr_numa_define_associativity_domains(spapr); } void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, @@ -144,24 +286,51 @@ int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt, */ void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) { + MachineState *ms = MACHINE(spapr); SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); uint32_t refpoints[] = { cpu_to_be32(0x4), - cpu_to_be32(0x4), + cpu_to_be32(0x3), cpu_to_be32(0x2), + cpu_to_be32(0x1), }; uint32_t nr_refpoints = ARRAY_SIZE(refpoints); - uint32_t maxdomain = cpu_to_be32(spapr->gpu_numa_id > 1 ? 1 : 0); + uint32_t maxdomain = ms->numa_state->num_nodes + spapr->gpu_numa_id; uint32_t maxdomains[] = { cpu_to_be32(4), - maxdomain, - maxdomain, - maxdomain, - cpu_to_be32(spapr->gpu_numa_id), + cpu_to_be32(maxdomain), + cpu_to_be32(maxdomain), + cpu_to_be32(maxdomain), + cpu_to_be32(maxdomain) }; - if (smc->pre_5_1_assoc_refpoints) { - nr_refpoints = 2; + if (spapr_machine_using_legacy_numa(spapr)) { + uint32_t legacy_refpoints[] = { + cpu_to_be32(0x4), + cpu_to_be32(0x4), + cpu_to_be32(0x2), + }; + uint32_t legacy_maxdomain = spapr->gpu_numa_id > 1 ? 1 : 0; + uint32_t legacy_maxdomains[] = { + cpu_to_be32(4), + cpu_to_be32(legacy_maxdomain), + cpu_to_be32(legacy_maxdomain), + cpu_to_be32(legacy_maxdomain), + cpu_to_be32(spapr->gpu_numa_id), + }; + + G_STATIC_ASSERT(sizeof(legacy_refpoints) <= sizeof(refpoints)); + G_STATIC_ASSERT(sizeof(legacy_maxdomains) <= sizeof(maxdomains)); + + nr_refpoints = 3; + + memcpy(refpoints, legacy_refpoints, sizeof(legacy_refpoints)); + memcpy(maxdomains, legacy_maxdomains, sizeof(legacy_maxdomains)); + + /* pseries-5.0 and older reference-points array is {0x4, 0x4} */ + if (smc->pre_5_1_assoc_refpoints) { + nr_refpoints = 2; + } } _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points", diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 63872054f3..b3a489e9fe 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -33,7 +33,7 @@ #include "sysemu/sysemu.h" #include "hw/ppc/spapr_numa.h" -void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, +bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, uint64_t size, Error **errp) { const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev); @@ -45,7 +45,7 @@ void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, if (!mc->nvdimm_supported) { error_setg(errp, "NVDIMM hotplug not supported for this machine"); - return; + return false; } /* @@ -59,20 +59,20 @@ void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, */ if (!ms->nvdimms_state->is_enabled && nvdimm_opt) { error_setg(errp, "nvdimm device found but 'nvdimm=off' was set"); - return; + return false; } if (object_property_get_int(OBJECT(nvdimm), NVDIMM_LABEL_SIZE_PROP, &error_abort) == 0) { error_setg(errp, "PAPR requires NVDIMM devices to have label-size set"); - return; + return false; } if (size % SPAPR_MINIMUM_SCM_BLOCK_SIZE) { error_setg(errp, "PAPR requires NVDIMM memory size (excluding label)" " to be a multiple of %" PRIu64 "MB", SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB); - return; + return false; } uuidstr = object_property_get_str(OBJECT(nvdimm), NVDIMM_UUID_PROP, @@ -82,8 +82,10 @@ void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, if (qemu_uuid_is_null(&uuid)) { error_setg(errp, "NVDIMM device requires the uuid to be set"); - return; + return false; } + + return true; } @@ -91,14 +93,11 @@ void spapr_add_nvdimm(DeviceState *dev, uint64_t slot, Error **errp) { SpaprDrc *drc; bool hotplugged = spapr_drc_hotplugged(dev); - Error *local_err = NULL; drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, slot); g_assert(drc); - spapr_drc_attach(drc, dev, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!spapr_drc_attach(drc, dev, errp)) { return; } diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 5db912b48c..3999392b32 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -1539,7 +1539,6 @@ static void spapr_pci_plug(HotplugHandler *plug_handler, PCIDevice *pdev = PCI_DEVICE(plugged_dev); PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(plugged_dev); SpaprDrc *drc = drc_from_dev(phb, pdev); - Error *local_err = NULL; PCIBus *bus = PCI_BUS(qdev_get_parent_bus(DEVICE(pdev))); uint32_t slotnr = PCI_SLOT(pdev->devfn); @@ -1578,9 +1577,7 @@ static void spapr_pci_plug(HotplugHandler *plug_handler, return; } - spapr_drc_attach(drc, DEVICE(pdev), &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!spapr_drc_attach(drc, DEVICE(pdev), errp)) { return; } diff --git a/hw/ppc/spapr_vio.c b/hw/ppc/spapr_vio.c index 731080d989..44fdd64b88 100644 --- a/hw/ppc/spapr_vio.c +++ b/hw/ppc/spapr_vio.c @@ -474,7 +474,6 @@ static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp) SpaprVioDevice *dev = (SpaprVioDevice *)qdev; SpaprVioDeviceClass *pc = VIO_SPAPR_DEVICE_GET_CLASS(dev); char *id; - Error *local_err = NULL; if (dev->reg != -1) { /* @@ -510,16 +509,15 @@ static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp) dev->irq = spapr_vio_reg_to_irq(dev->reg); if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) { - dev->irq = spapr_irq_findone(spapr, &local_err); - if (local_err) { - error_propagate(errp, local_err); + int irq = spapr_irq_findone(spapr, errp); + + if (irq < 0) { return; } + dev->irq = irq; } - spapr_irq_claim(spapr, dev->irq, false, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (spapr_irq_claim(spapr, dev->irq, false, errp) < 0) { return; } diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index 194f3b9d07..bb47896f17 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -138,6 +138,7 @@ struct SpaprMachineClass { bool smp_threads_vsmt; /* set VSMT to smp_threads by default */ hwaddr rma_limit; /* clamp the RMA to this size */ bool pre_5_1_assoc_refpoints; + bool pre_5_2_numa_associativity; void (*phb_placement)(SpaprMachineState *spapr, uint32_t index, uint64_t *buid, hwaddr *pio, @@ -853,6 +854,7 @@ int spapr_max_server_number(SpaprMachineState *spapr); void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex, uint64_t pte0, uint64_t pte1); void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered); +bool spapr_machine_using_legacy_numa(SpaprMachineState *spapr); /* DRC callbacks. */ void spapr_core_release(DeviceState *dev); @@ -902,7 +904,7 @@ void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg); #define HTAB_SIZE(spapr) (1ULL << ((spapr)->htab_shift)) int spapr_get_vcpu_id(PowerPCCPU *cpu); -void spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp); +bool spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp); PowerPCCPU *spapr_find_cpu(int vcpu_id); int spapr_caps_pre_load(void *opaque); @@ -934,7 +936,7 @@ void spapr_caps_cpu_apply(SpaprMachineState *spapr, PowerPCCPU *cpu); void spapr_caps_add_properties(SpaprMachineClass *smc); int spapr_caps_post_migration(SpaprMachineState *spapr); -void spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize, +bool spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize, Error **errp); /* * XIVE definitions diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h index f270860769..165b281496 100644 --- a/include/hw/ppc/spapr_drc.h +++ b/include/hw/ppc/spapr_drc.h @@ -235,7 +235,7 @@ SpaprDrc *spapr_drc_by_index(uint32_t index); SpaprDrc *spapr_drc_by_id(const char *type, uint32_t id); int spapr_dt_drc(void *fdt, int offset, Object *owner, uint32_t drc_type_mask); -void spapr_drc_attach(SpaprDrc *drc, DeviceState *d, Error **errp); +bool spapr_drc_attach(SpaprDrc *drc, DeviceState *d, Error **errp); void spapr_drc_detach(SpaprDrc *drc); /* Returns true if a hot plug/unplug request is pending */ diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h index 3eb344e8e9..b834d82f55 100644 --- a/include/hw/ppc/spapr_nvdimm.h +++ b/include/hw/ppc/spapr_nvdimm.h @@ -28,7 +28,7 @@ QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE); int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, void *fdt, int *fdt_start_offset, Error **errp); void spapr_dt_persistent_memory(SpaprMachineState *spapr, void *fdt); -void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, +bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, uint64_t size, Error **errp); void spapr_add_nvdimm(DeviceState *dev, uint64_t slot, Error **errp); void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr); diff --git a/target/ppc/compat.c b/target/ppc/compat.c index 08aede88dc..e9bec5ffed 100644 --- a/target/ppc/compat.c +++ b/target/ppc/compat.c @@ -158,7 +158,7 @@ bool ppc_type_check_compat(const char *cputype, uint32_t compat_pvr, return pcc_compat(pcc, compat_pvr, min_compat_pvr, max_compat_pvr); } -void ppc_set_compat(PowerPCCPU *cpu, uint32_t compat_pvr, Error **errp) +int ppc_set_compat(PowerPCCPU *cpu, uint32_t compat_pvr, Error **errp) { const CompatInfo *compat = compat_by_pvr(compat_pvr); CPUPPCState *env = &cpu->env; @@ -169,11 +169,11 @@ void ppc_set_compat(PowerPCCPU *cpu, uint32_t compat_pvr, Error **errp) pcr = 0; } else if (!compat) { error_setg(errp, "Unknown compatibility PVR 0x%08"PRIx32, compat_pvr); - return; + return -EINVAL; } else if (!ppc_check_compat(cpu, compat_pvr, 0, 0)) { error_setg(errp, "Compatibility PVR 0x%08"PRIx32" not valid for CPU", compat_pvr); - return; + return -EINVAL; } else { pcr = compat->pcr; } @@ -185,17 +185,19 @@ void ppc_set_compat(PowerPCCPU *cpu, uint32_t compat_pvr, Error **errp) if (ret < 0) { error_setg_errno(errp, -ret, "Unable to set CPU compatibility mode in KVM"); - return; + return ret; } } cpu->compat_pvr = compat_pvr; env->spr[SPR_PCR] = pcr & pcc->pcr_mask; + return 0; } typedef struct { uint32_t compat_pvr; - Error *err; + Error **errp; + int ret; } SetCompatState; static void do_set_compat(CPUState *cs, run_on_cpu_data arg) @@ -203,26 +205,28 @@ static void do_set_compat(CPUState *cs, run_on_cpu_data arg) PowerPCCPU *cpu = POWERPC_CPU(cs); SetCompatState *s = arg.host_ptr; - ppc_set_compat(cpu, s->compat_pvr, &s->err); + s->ret = ppc_set_compat(cpu, s->compat_pvr, s->errp); } -void ppc_set_compat_all(uint32_t compat_pvr, Error **errp) +int ppc_set_compat_all(uint32_t compat_pvr, Error **errp) { CPUState *cs; CPU_FOREACH(cs) { SetCompatState s = { .compat_pvr = compat_pvr, - .err = NULL, + .errp = errp, + .ret = 0, }; run_on_cpu(cs, do_set_compat, RUN_ON_CPU_HOST_PTR(&s)); - if (s.err) { - error_propagate(errp, s.err); - return; + if (s.ret < 0) { + return s.ret; } } + + return 0; } int ppc_compat_max_vthreads(PowerPCCPU *cpu) diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 766e9c5c26..e8aa185d4f 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -1352,10 +1352,10 @@ bool ppc_check_compat(PowerPCCPU *cpu, uint32_t compat_pvr, bool ppc_type_check_compat(const char *cputype, uint32_t compat_pvr, uint32_t min_compat_pvr, uint32_t max_compat_pvr); -void ppc_set_compat(PowerPCCPU *cpu, uint32_t compat_pvr, Error **errp); +int ppc_set_compat(PowerPCCPU *cpu, uint32_t compat_pvr, Error **errp); #if !defined(CONFIG_USER_ONLY) -void ppc_set_compat_all(uint32_t compat_pvr, Error **errp); +int ppc_set_compat_all(uint32_t compat_pvr, Error **errp); #endif int ppc_compat_max_vthreads(PowerPCCPU *cpu); void ppc_compat_add_property(Object *obj, const char *name, diff --git a/target/ppc/machine.c b/target/ppc/machine.c index 109d071162..5e58377376 100644 --- a/target/ppc/machine.c +++ b/target/ppc/machine.c @@ -347,18 +347,19 @@ static int cpu_post_load(void *opaque, int version_id) if (cpu->compat_pvr) { uint32_t compat_pvr = cpu->compat_pvr; Error *local_err = NULL; + int ret; cpu->compat_pvr = 0; - ppc_set_compat(cpu, compat_pvr, &local_err); - if (local_err) { + ret = ppc_set_compat(cpu, compat_pvr, &local_err); + if (ret < 0) { error_report_err(local_err); - return -1; + return ret; } } else #endif { if (!pvr_match(cpu, env->spr[SPR_PVR])) { - return -1; + return -EINVAL; } }