From ee4c112846a0f2ac4fe5601918b0a2642ac8e2ed Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 7 Sep 2017 14:27:09 -0600 Subject: [PATCH 01/17] vhost: Release memory references on cleanup vhost registers a MemoryListener where it adds and removes references to MemoryRegions as the MemoryRegionSections pass through. The region_add callback is invoked for each existing section when the MemoryListener is registered, but unregistering the MemoryListener performs no reciprocal region_del callback. It's therefore the owner of the MemoryListener's responsibility to cleanup any persistent changes, such as these memory references, after unregistering. The consequence of this bug is that if we have both a vhost device and a vfio device, the vhost device will reference any mmap'd MMIO of the vfio device via this MemoryListener. If the vhost device is then removed, those references remain outstanding. If we then attempt to remove the vfio device, it never gets finalized and the only way to release the kernel file descriptors is to terminate the QEMU process. Fixes: dfde4e6e1a86 ("memory: add ref/unref calls") Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: qemu-stable@nongnu.org # v1.6.0+ Signed-off-by: Alex Williamson Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 0049a2c0b3..5fd69f0b2e 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -1356,6 +1356,10 @@ void vhost_dev_cleanup(struct vhost_dev *hdev) if (hdev->mem) { /* those are only safe after successful init */ memory_listener_unregister(&hdev->memory_listener); + for (i = 0; i < hdev->n_mem_sections; ++i) { + MemoryRegionSection *section = &hdev->mem_sections[i]; + memory_region_unref(section->mr); + } QLIST_REMOVE(hdev, entry); } if (hdev->migration_blocker) { From a6fd5b0e050abc892ae3a64547631d2332b893de Mon Sep 17 00:00:00 2001 From: Marcel Apfelbaum Date: Wed, 6 Sep 2017 17:26:57 +0300 Subject: [PATCH 02/17] pc: add 2.11 machine types Signed-off-by: Marcel Apfelbaum Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc_piix.c | 15 ++++++++++++--- hw/i386/pc_q35.c | 13 +++++++++++-- include/hw/i386/pc.h | 3 +++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 46dfd2c954..b03cc047c3 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -436,21 +436,30 @@ static void pc_i440fx_machine_options(MachineClass *m) m->default_display = "std"; } -static void pc_i440fx_2_10_machine_options(MachineClass *m) +static void pc_i440fx_2_11_machine_options(MachineClass *m) { pc_i440fx_machine_options(m); m->alias = "pc"; m->is_default = 1; } +DEFINE_I440FX_MACHINE(v2_11, "pc-i440fx-2.11", NULL, + pc_i440fx_2_11_machine_options); + +static void pc_i440fx_2_10_machine_options(MachineClass *m) +{ + pc_i440fx_2_11_machine_options(m); + m->is_default = 0; + m->alias = NULL; + SET_MACHINE_COMPAT(m, PC_COMPAT_2_10); +} + DEFINE_I440FX_MACHINE(v2_10, "pc-i440fx-2.10", NULL, pc_i440fx_2_10_machine_options); static void pc_i440fx_2_9_machine_options(MachineClass *m) { pc_i440fx_2_10_machine_options(m); - m->is_default = 0; - m->alias = NULL; SET_MACHINE_COMPAT(m, PC_COMPAT_2_9); m->numa_auto_assign_ram = numa_legacy_auto_assign_ram; } diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 169a214d50..c1cba584d1 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -302,10 +302,20 @@ static void pc_q35_machine_options(MachineClass *m) m->max_cpus = 288; } -static void pc_q35_2_10_machine_options(MachineClass *m) +static void pc_q35_2_11_machine_options(MachineClass *m) { pc_q35_machine_options(m); m->alias = "q35"; +} + +DEFINE_Q35_MACHINE(v2_11, "pc-q35-2.11", NULL, + pc_q35_2_11_machine_options); + +static void pc_q35_2_10_machine_options(MachineClass *m) +{ + pc_q35_2_11_machine_options(m); + m->alias = NULL; + SET_MACHINE_COMPAT(m, PC_COMPAT_2_10); m->numa_auto_assign_ram = numa_legacy_auto_assign_ram; } @@ -315,7 +325,6 @@ DEFINE_Q35_MACHINE(v2_10, "pc-q35-2.10", NULL, static void pc_q35_2_9_machine_options(MachineClass *m) { pc_q35_2_10_machine_options(m); - m->alias = NULL; SET_MACHINE_COMPAT(m, PC_COMPAT_2_9); } diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index d80859bfad..8226904524 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -369,6 +369,9 @@ int e820_add_entry(uint64_t, uint64_t, uint32_t); int e820_get_num_entries(void); bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *); +#define PC_COMPAT_2_10 \ + HW_COMPAT_2_10 \ + #define PC_COMPAT_2_9 \ HW_COMPAT_2_9 \ {\ From f5855994fee2f8815dc86b8453e4a63e290aea05 Mon Sep 17 00:00:00 2001 From: Anthony PERARD Date: Wed, 6 Sep 2017 14:40:31 +0100 Subject: [PATCH 03/17] hw/acpi: Limit hotplug to root bus on legacy mode Signed-off-by: Anthony PERARD Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/pcihp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index c420a388ea..9db3c2eaf2 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -273,7 +273,7 @@ static void pci_write(void *opaque, hwaddr addr, uint64_t data, addr, data); break; case PCI_SEL_BASE: - s->hotplug_select = data; + s->hotplug_select = s->legacy_piix ? ACPI_PCIHP_BSEL_DEFAULT : data; ACPI_PCIHP_DPRINTF("pcisel write %" HWADDR_PRIx " <== %" PRIu64 "\n", addr, data); default: From ab938ae43f8a3a71a3525566edf586081b7a7452 Mon Sep 17 00:00:00 2001 From: Anthony PERARD Date: Wed, 6 Sep 2017 14:40:32 +0100 Subject: [PATCH 04/17] hw/acpi: Move acpi_set_pci_info to pcihp HW part of ACPI PCI hotplug in QEMU depends on ACPI_PCIHP_PROP_BSEL being set on a PCI bus that supports ACPI hotplug. It should work regardless of the source of ACPI tables (QEMU generator/legacy SeaBIOS/Xen). So move ACPI_PCIHP_PROP_BSEL initialization into HW ACPI implementation part from QEMU's ACPI table generator. To do PCI passthrough with Xen, the property ACPI_PCIHP_PROP_BSEL needs to be set, but this was done only when ACPI tables are built which is not needed for a Xen guest. The need for the property starts with commit "pc: pcihp: avoid adding ACPI_PCIHP_PROP_BSEL twice" (f0c9d64a68b776374ec4732424a3e27753ce37b6). Adding find_i440fx into stubs so that mips-softmmu target can be built. Reported-by: Sander Eikelenboom Signed-off-by: Anthony PERARD Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/pcihp.c | 38 ++++++++++++++++++++++++++++++++++++++ hw/i386/acpi-build.c | 32 -------------------------------- stubs/Makefile.objs | 1 + stubs/pci-host-piix.c | 6 ++++++ 4 files changed, 45 insertions(+), 32 deletions(-) create mode 100644 stubs/pci-host-piix.c diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index 9db3c2eaf2..7da51c0569 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -75,6 +75,43 @@ static int acpi_pcihp_get_bsel(PCIBus *bus) } } +/* Assign BSEL property to all buses. In the future, this can be changed + * to only assign to buses that support hotplug. + */ +static void *acpi_set_bsel(PCIBus *bus, void *opaque) +{ + unsigned *bsel_alloc = opaque; + unsigned *bus_bsel; + + if (qbus_is_hotpluggable(BUS(bus))) { + bus_bsel = g_malloc(sizeof *bus_bsel); + + *bus_bsel = (*bsel_alloc)++; + object_property_add_uint32_ptr(OBJECT(bus), ACPI_PCIHP_PROP_BSEL, + bus_bsel, &error_abort); + } + + return bsel_alloc; +} + +static void acpi_set_pci_info(void) +{ + static bool bsel_is_set; + PCIBus *bus; + unsigned bsel_alloc = ACPI_PCIHP_BSEL_DEFAULT; + + if (bsel_is_set) { + return; + } + bsel_is_set = true; + + bus = find_i440fx(); /* TODO: Q35 support */ + if (bus) { + /* Scan all PCI buses. Set property to enable acpi based hotplug. */ + pci_for_each_bus_depth_first(bus, acpi_set_bsel, NULL, &bsel_alloc); + } +} + static void acpi_pcihp_test_hotplug_bus(PCIBus *bus, void *opaque) { AcpiPciHpFind *find = opaque; @@ -177,6 +214,7 @@ static void acpi_pcihp_update(AcpiPciHpState *s) void acpi_pcihp_reset(AcpiPciHpState *s) { + acpi_set_pci_info(); acpi_pcihp_update(s); } diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 98dd424678..4d19d91e1b 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -493,36 +493,6 @@ build_madt(GArray *table_data, BIOSLinker *linker, PCMachineState *pcms) table_data->len - madt_start, 1, NULL, NULL); } -/* Assign BSEL property to all buses. In the future, this can be changed - * to only assign to buses that support hotplug. - */ -static void *acpi_set_bsel(PCIBus *bus, void *opaque) -{ - unsigned *bsel_alloc = opaque; - unsigned *bus_bsel; - - if (qbus_is_hotpluggable(BUS(bus))) { - bus_bsel = g_malloc(sizeof *bus_bsel); - - *bus_bsel = (*bsel_alloc)++; - object_property_add_uint32_ptr(OBJECT(bus), ACPI_PCIHP_PROP_BSEL, - bus_bsel, &error_abort); - } - - return bsel_alloc; -} - -static void acpi_set_pci_info(void) -{ - PCIBus *bus = find_i440fx(); /* TODO: Q35 support */ - unsigned bsel_alloc = ACPI_PCIHP_BSEL_DEFAULT; - - if (bus) { - /* Scan all PCI buses. Set property to enable acpi based hotplug. */ - pci_for_each_bus_depth_first(bus, acpi_set_bsel, NULL, &bsel_alloc); - } -} - static void build_append_pcihp_notify_entry(Aml *method, int slot) { Aml *if_ctx; @@ -2888,8 +2858,6 @@ void acpi_setup(void) build_state = g_malloc0(sizeof *build_state); - acpi_set_pci_info(); - acpi_build_tables_init(&tables); acpi_build(&tables, MACHINE(pcms)); diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index e69c217aff..4a33495911 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -40,3 +40,4 @@ stub-obj-y += pc_madt_cpu_entry.o stub-obj-y += vmgenid.o stub-obj-y += xen-common.o stub-obj-y += xen-hvm.o +stub-obj-y += pci-host-piix.o diff --git a/stubs/pci-host-piix.c b/stubs/pci-host-piix.c new file mode 100644 index 0000000000..6ed81b1f21 --- /dev/null +++ b/stubs/pci-host-piix.c @@ -0,0 +1,6 @@ +#include "qemu/osdep.h" +#include "hw/i386/pc.h" +PCIBus *find_i440fx(void) +{ + return NULL; +} From 2bed1ba77fae50bc8b5e68ede2d80b652b30c3b8 Mon Sep 17 00:00:00 2001 From: Anthony PERARD Date: Wed, 6 Sep 2017 14:40:33 +0100 Subject: [PATCH 05/17] Revert "ACPI: don't call acpi_pcihp_device_plug_cb on xen" This reverts commit 153eba4726dfa1bdfc31d1fe973b2a61b9035492. This patch prevents PCI passthrough hotplug on Xen. Even if the Xen tool stack prepares its own ACPI tables, we still rely on QEMU for hotplug ACPI notifications. The original issue is fixed by the two previous patch: hw/acpi: Limit hotplug to root bus on legacy mode hw/acpi: Move acpi_set_pci_info to pcihp Signed-off-by: Anthony PERARD Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/piix4.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c index f276967365..f4fd5907b8 100644 --- a/hw/acpi/piix4.c +++ b/hw/acpi/piix4.c @@ -385,10 +385,7 @@ static void piix4_device_plug_cb(HotplugHandler *hotplug_dev, dev, errp); } } else if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { - if (!xen_enabled()) { - acpi_pcihp_device_plug_cb(hotplug_dev, &s->acpi_pci_hotplug, dev, - errp); - } + acpi_pcihp_device_plug_cb(hotplug_dev, &s->acpi_pci_hotplug, dev, errp); } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { if (s->cpu_hotplug_legacy) { legacy_acpi_cpu_plug_cb(hotplug_dev, &s->gpe_cpu, dev, errp); @@ -411,10 +408,8 @@ static void piix4_device_unplug_request_cb(HotplugHandler *hotplug_dev, acpi_memory_unplug_request_cb(hotplug_dev, &s->acpi_memory_hotplug, dev, errp); } else if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { - if (!xen_enabled()) { - acpi_pcihp_device_unplug_cb(hotplug_dev, &s->acpi_pci_hotplug, dev, - errp); - } + acpi_pcihp_device_unplug_cb(hotplug_dev, &s->acpi_pci_hotplug, dev, + errp); } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU) && !s->cpu_hotplug_legacy) { acpi_cpu_unplug_request_cb(hotplug_dev, &s->cpuhp_state, dev, errp); From a35fe226558ac85436ea01af8977f1834927f53f Mon Sep 17 00:00:00 2001 From: Aleksandr Bezzubikov Date: Fri, 18 Aug 2017 02:36:47 +0300 Subject: [PATCH 06/17] hw/pci: introduce pcie-pci-bridge device Introduce a new PCIExpress-to-PCI Bridge device, which is a hot-pluggable PCI Express device and supports devices hot-plug with SHPC. This device is intended to replace the DMI-to-PCI Bridge. Signed-off-by: Aleksandr Bezzubikov Reviewed-by: Marcel Apfelbaum Tested-by: Marcel Apfelbaum Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci-bridge/Makefile.objs | 2 +- hw/pci-bridge/pcie_pci_bridge.c | 192 ++++++++++++++++++++++++++++++++ include/hw/pci/pci.h | 1 + 3 files changed, 194 insertions(+), 1 deletion(-) create mode 100644 hw/pci-bridge/pcie_pci_bridge.c diff --git a/hw/pci-bridge/Makefile.objs b/hw/pci-bridge/Makefile.objs index c4683cf5c1..666db37da2 100644 --- a/hw/pci-bridge/Makefile.objs +++ b/hw/pci-bridge/Makefile.objs @@ -1,4 +1,4 @@ -common-obj-y += pci_bridge_dev.o +common-obj-y += pci_bridge_dev.o pcie_pci_bridge.o common-obj-$(CONFIG_PCIE_PORT) += pcie_root_port.o gen_pcie_root_port.o common-obj-$(CONFIG_PXB) += pci_expander_bridge.o common-obj-$(CONFIG_XIO3130) += xio3130_upstream.o xio3130_downstream.o diff --git a/hw/pci-bridge/pcie_pci_bridge.c b/hw/pci-bridge/pcie_pci_bridge.c new file mode 100644 index 0000000000..9aa5cc3e45 --- /dev/null +++ b/hw/pci-bridge/pcie_pci_bridge.c @@ -0,0 +1,192 @@ +/* + * QEMU Generic PCIE-PCI Bridge + * + * Copyright (c) 2017 Aleksandr Bezzubikov + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "hw/pci/pci.h" +#include "hw/pci/pci_bus.h" +#include "hw/pci/pci_bridge.h" +#include "hw/pci/msi.h" +#include "hw/pci/shpc.h" +#include "hw/pci/slotid_cap.h" + +typedef struct PCIEPCIBridge { + /*< private >*/ + PCIBridge parent_obj; + + OnOffAuto msi; + MemoryRegion shpc_bar; + /*< public >*/ +} PCIEPCIBridge; + +#define TYPE_PCIE_PCI_BRIDGE_DEV "pcie-pci-bridge" +#define PCIE_PCI_BRIDGE_DEV(obj) \ + OBJECT_CHECK(PCIEPCIBridge, (obj), TYPE_PCIE_PCI_BRIDGE_DEV) + +static void pcie_pci_bridge_realize(PCIDevice *d, Error **errp) +{ + PCIBridge *br = PCI_BRIDGE(d); + PCIEPCIBridge *pcie_br = PCIE_PCI_BRIDGE_DEV(d); + int rc, pos; + + pci_bridge_initfn(d, TYPE_PCI_BUS); + + d->config[PCI_INTERRUPT_PIN] = 0x1; + memory_region_init(&pcie_br->shpc_bar, OBJECT(d), "shpc-bar", + shpc_bar_size(d)); + rc = shpc_init(d, &br->sec_bus, &pcie_br->shpc_bar, 0, errp); + if (rc) { + goto error; + } + + rc = pcie_cap_init(d, 0, PCI_EXP_TYPE_PCI_BRIDGE, 0, errp); + if (rc < 0) { + goto cap_error; + } + + pos = pci_add_capability(d, PCI_CAP_ID_PM, 0, PCI_PM_SIZEOF, errp); + if (pos < 0) { + goto pm_error; + } + d->exp.pm_cap = pos; + pci_set_word(d->config + pos + PCI_PM_PMC, 0x3); + + pcie_cap_arifwd_init(d); + pcie_cap_deverr_init(d); + + rc = pcie_aer_init(d, PCI_ERR_VER, 0x100, PCI_ERR_SIZEOF, errp); + if (rc < 0) { + goto aer_error; + } + + if (pcie_br->msi != ON_OFF_AUTO_OFF) { + rc = msi_init(d, 0, 1, true, true, errp); + if (rc < 0) { + goto msi_error; + } + } + pci_register_bar(d, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_64, &pcie_br->shpc_bar); + return; + +msi_error: + pcie_aer_exit(d); +aer_error: +pm_error: + pcie_cap_exit(d); +cap_error: + shpc_free(d); +error: + pci_bridge_exitfn(d); +} + +static void pcie_pci_bridge_exit(PCIDevice *d) +{ + PCIEPCIBridge *bridge_dev = PCIE_PCI_BRIDGE_DEV(d); + pcie_cap_exit(d); + shpc_cleanup(d, &bridge_dev->shpc_bar); + pci_bridge_exitfn(d); +} + +static void pcie_pci_bridge_reset(DeviceState *qdev) +{ + PCIDevice *d = PCI_DEVICE(qdev); + pci_bridge_reset(qdev); + msi_reset(d); + shpc_reset(d); +} + +static void pcie_pci_bridge_write_config(PCIDevice *d, + uint32_t address, uint32_t val, int len) +{ + pci_bridge_write_config(d, address, val, len); + msi_write_config(d, address, val, len); + shpc_cap_write_config(d, address, val, len); +} + +static Property pcie_pci_bridge_dev_properties[] = { + DEFINE_PROP_ON_OFF_AUTO("msi", PCIEPCIBridge, msi, ON_OFF_AUTO_ON), + DEFINE_PROP_END_OF_LIST(), +}; + +static const VMStateDescription pcie_pci_bridge_dev_vmstate = { + .name = TYPE_PCIE_PCI_BRIDGE_DEV, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(parent_obj, PCIBridge), + SHPC_VMSTATE(shpc, PCIDevice, NULL), + VMSTATE_END_OF_LIST() + } +}; + +static void pcie_pci_bridge_hotplug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + PCIDevice *pci_hotplug_dev = PCI_DEVICE(hotplug_dev); + + if (!shpc_present(pci_hotplug_dev)) { + error_setg(errp, "standard hotplug controller has been disabled for " + "this %s", TYPE_PCIE_PCI_BRIDGE_DEV); + return; + } + shpc_device_hotplug_cb(hotplug_dev, dev, errp); +} + +static void pcie_pci_bridge_hot_unplug_request_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, + Error **errp) +{ + PCIDevice *pci_hotplug_dev = PCI_DEVICE(hotplug_dev); + + if (!shpc_present(pci_hotplug_dev)) { + error_setg(errp, "standard hotplug controller has been disabled for " + "this %s", TYPE_PCIE_PCI_BRIDGE_DEV); + return; + } + shpc_device_hot_unplug_request_cb(hotplug_dev, dev, errp); +} + +static void pcie_pci_bridge_class_init(ObjectClass *klass, void *data) +{ + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + DeviceClass *dc = DEVICE_CLASS(klass); + HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(klass); + + k->is_express = 1; + k->is_bridge = 1; + k->vendor_id = PCI_VENDOR_ID_REDHAT; + k->device_id = PCI_DEVICE_ID_REDHAT_PCIE_BRIDGE; + k->realize = pcie_pci_bridge_realize; + k->exit = pcie_pci_bridge_exit; + k->config_write = pcie_pci_bridge_write_config; + dc->vmsd = &pcie_pci_bridge_dev_vmstate; + dc->props = pcie_pci_bridge_dev_properties; + dc->vmsd = &pcie_pci_bridge_dev_vmstate; + dc->reset = &pcie_pci_bridge_reset; + set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); + hc->plug = pcie_pci_bridge_hotplug_cb; + hc->unplug_request = pcie_pci_bridge_hot_unplug_request_cb; +} + +static const TypeInfo pcie_pci_bridge_info = { + .name = TYPE_PCIE_PCI_BRIDGE_DEV, + .parent = TYPE_PCI_BRIDGE, + .instance_size = sizeof(PCIEPCIBridge), + .class_init = pcie_pci_bridge_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_HOTPLUG_HANDLER }, + { }, + } +}; + +static void pciepci_register(void) +{ + type_register_static(&pcie_pci_bridge_info); +} + +type_init(pciepci_register); diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 8bb6449dd7..aa7ef9cf69 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -100,6 +100,7 @@ extern bool pci_available; #define PCI_DEVICE_ID_REDHAT_PXB_PCIE 0x000b #define PCI_DEVICE_ID_REDHAT_PCIE_RP 0x000c #define PCI_DEVICE_ID_REDHAT_XHCI 0x000d +#define PCI_DEVICE_ID_REDHAT_PCIE_BRIDGE 0x000e #define PCI_DEVICE_ID_REDHAT_QXL 0x0100 #define FMT_PCIBUS PRIx64 From 70e1ee59bb9490d9ac529e96820a03b346086ca1 Mon Sep 17 00:00:00 2001 From: Aleksandr Bezzubikov Date: Fri, 18 Aug 2017 02:36:48 +0300 Subject: [PATCH 07/17] hw/pci: introduce bridge-only vendor-specific capability to provide some hints to firmware On PCI init PCI bridges may need some extra info about bus number, IO, memory and prefetchable memory to reserve. QEMU can provide this with a special vendor-specific PCI capability. Signed-off-by: Aleksandr Bezzubikov Reviewed-by: Marcel Apfelbaum Tested-by: Marcel Apfelbaum Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pci_bridge.c | 46 +++++++++++++++++++++++++++++++++++++ include/hw/pci/pci_bridge.h | 25 ++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c index 720119b21a..17feae5ed8 100644 --- a/hw/pci/pci_bridge.c +++ b/hw/pci/pci_bridge.c @@ -408,6 +408,52 @@ void pci_bridge_map_irq(PCIBridge *br, const char* bus_name, br->bus_name = bus_name; } + +int pci_bridge_qemu_reserve_cap_init(PCIDevice *dev, int cap_offset, + uint32_t bus_reserve, uint64_t io_reserve, + uint32_t mem_non_pref_reserve, + uint32_t mem_pref_32_reserve, + uint64_t mem_pref_64_reserve, + Error **errp) +{ + if (mem_pref_32_reserve != (uint32_t)-1 && + mem_pref_64_reserve != (uint64_t)-1) { + error_setg(errp, + "PCI resource reserve cap: PREF32 and PREF64 conflict"); + return -EINVAL; + } + + if (bus_reserve == (uint32_t)-1 && + io_reserve == (uint64_t)-1 && + mem_non_pref_reserve == (uint32_t)-1 && + mem_pref_32_reserve == (uint32_t)-1 && + mem_pref_64_reserve == (uint64_t)-1) { + return 0; + } + + size_t cap_len = sizeof(PCIBridgeQemuCap); + PCIBridgeQemuCap cap = { + .len = cap_len, + .type = REDHAT_PCI_CAP_RESOURCE_RESERVE, + .bus_res = bus_reserve, + .io = io_reserve, + .mem = mem_non_pref_reserve, + .mem_pref_32 = mem_pref_32_reserve, + .mem_pref_64 = mem_pref_64_reserve + }; + + int offset = pci_add_capability(dev, PCI_CAP_ID_VNDR, + cap_offset, cap_len, errp); + if (offset < 0) { + return offset; + } + + memcpy(dev->config + offset + PCI_CAP_FLAGS, + (char *)&cap + PCI_CAP_FLAGS, + cap_len - PCI_CAP_FLAGS); + return 0; +} + static const TypeInfo pci_bridge_type_info = { .name = TYPE_PCI_BRIDGE, .parent = TYPE_PCI_DEVICE, diff --git a/include/hw/pci/pci_bridge.h b/include/hw/pci/pci_bridge.h index ff7cbaa227..1acadc2c15 100644 --- a/include/hw/pci/pci_bridge.h +++ b/include/hw/pci/pci_bridge.h @@ -67,4 +67,29 @@ void pci_bridge_map_irq(PCIBridge *br, const char* bus_name, #define PCI_BRIDGE_CTL_DISCARD_STATUS 0x400 /* Discard timer status */ #define PCI_BRIDGE_CTL_DISCARD_SERR 0x800 /* Discard timer SERR# enable */ +typedef struct PCIBridgeQemuCap { + uint8_t id; /* Standard PCI capability header field */ + uint8_t next; /* Standard PCI capability header field */ + uint8_t len; /* Standard PCI vendor-specific capability header field */ + uint8_t type; /* Red Hat vendor-specific capability type. + Types are defined with REDHAT_PCI_CAP_ prefix */ + + uint32_t bus_res; /* Minimum number of buses to reserve */ + uint64_t io; /* IO space to reserve */ + uint32_t mem; /* Non-prefetchable memory to reserve */ + /* At most one of the following two fields may be set to a value + * different from -1 */ + uint32_t mem_pref_32; /* Prefetchable memory to reserve (32-bit MMIO) */ + uint64_t mem_pref_64; /* Prefetchable memory to reserve (64-bit MMIO) */ +} PCIBridgeQemuCap; + +#define REDHAT_PCI_CAP_RESOURCE_RESERVE 1 + +int pci_bridge_qemu_reserve_cap_init(PCIDevice *dev, int cap_offset, + uint32_t bus_reserve, uint64_t io_reserve, + uint32_t mem_non_pref_reserve, + uint32_t mem_pref_32_reserve, + uint64_t mem_pref_64_reserve, + Error **errp); + #endif /* QEMU_PCI_BRIDGE_H */ From 226263fb5cdaa4a4a95f1680fabbc9dd2123fd67 Mon Sep 17 00:00:00 2001 From: Aleksandr Bezzubikov Date: Fri, 18 Aug 2017 02:36:49 +0300 Subject: [PATCH 08/17] hw/pci: add QEMU-specific PCI capability to the Generic PCI Express Root Port To enable hotplugging of a newly created pcie-pci-bridge, we need to tell firmware (e.g. SeaBIOS) to reserve additional buses or IO/MEM/PREF space for pcie-root-port. Additional bus reservation allows us to hotplug pcie-pci-bridge into this root port. The number of buses and IO/MEM/PREF space to reserve are provided to the device via a corresponding property, and to the firmware via new PCI capability. The properties' default values are -1 to keep default behavior unchanged. Signed-off-by: Aleksandr Bezzubikov Reviewed-by: Marcel Apfelbaum Tested-by: Marcel Apfelbaum Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci-bridge/gen_pcie_root_port.c | 36 ++++++++++++++++++++++++++++++ include/hw/pci/pcie_port.h | 1 + 2 files changed, 37 insertions(+) diff --git a/hw/pci-bridge/gen_pcie_root_port.c b/hw/pci-bridge/gen_pcie_root_port.c index cb694d6da5..ed03ffc764 100644 --- a/hw/pci-bridge/gen_pcie_root_port.c +++ b/hw/pci-bridge/gen_pcie_root_port.c @@ -16,6 +16,8 @@ #include "hw/pci/pcie_port.h" #define TYPE_GEN_PCIE_ROOT_PORT "pcie-root-port" +#define GEN_PCIE_ROOT_PORT(obj) \ + OBJECT_CHECK(GenPCIERootPort, (obj), TYPE_GEN_PCIE_ROOT_PORT) #define GEN_PCIE_ROOT_PORT_AER_OFFSET 0x100 #define GEN_PCIE_ROOT_PORT_MSIX_NR_VECTOR 1 @@ -26,6 +28,13 @@ typedef struct GenPCIERootPort { /*< public >*/ bool migrate_msix; + + /* additional resources to reserve on firmware init */ + uint32_t bus_reserve; + uint64_t io_reserve; + uint64_t mem_reserve; + uint64_t pref32_reserve; + uint64_t pref64_reserve; } GenPCIERootPort; static uint8_t gen_rp_aer_vector(const PCIDevice *d) @@ -60,6 +69,24 @@ static bool gen_rp_test_migrate_msix(void *opaque, int version_id) return rp->migrate_msix; } +static void gen_rp_realize(DeviceState *dev, Error **errp) +{ + PCIDevice *d = PCI_DEVICE(dev); + GenPCIERootPort *grp = GEN_PCIE_ROOT_PORT(d); + PCIERootPortClass *rpc = PCIE_ROOT_PORT_GET_CLASS(d); + + rpc->parent_realize(dev, errp); + + int rc = pci_bridge_qemu_reserve_cap_init(d, 0, grp->bus_reserve, + grp->io_reserve, grp->mem_reserve, grp->pref32_reserve, + grp->pref64_reserve, errp); + + if (rc < 0) { + rpc->parent_class.exit(d); + return; + } +} + static const VMStateDescription vmstate_rp_dev = { .name = "pcie-root-port", .version_id = 1, @@ -78,6 +105,11 @@ static const VMStateDescription vmstate_rp_dev = { static Property gen_rp_props[] = { DEFINE_PROP_BOOL("x-migrate-msix", GenPCIERootPort, migrate_msix, true), + DEFINE_PROP_UINT32("bus-reserve", GenPCIERootPort, bus_reserve, -1), + DEFINE_PROP_SIZE("io-reserve", GenPCIERootPort, io_reserve, -1), + DEFINE_PROP_SIZE("mem-reserve", GenPCIERootPort, mem_reserve, -1), + DEFINE_PROP_SIZE("pref32-reserve", GenPCIERootPort, pref32_reserve, -1), + DEFINE_PROP_SIZE("pref64-reserve", GenPCIERootPort, pref64_reserve, -1), DEFINE_PROP_END_OF_LIST() }; @@ -92,6 +124,10 @@ static void gen_rp_dev_class_init(ObjectClass *klass, void *data) dc->desc = "PCI Express Root Port"; dc->vmsd = &vmstate_rp_dev; dc->props = gen_rp_props; + + rpc->parent_realize = dc->realize; + dc->realize = gen_rp_realize; + rpc->aer_vector = gen_rp_aer_vector; rpc->interrupts_init = gen_rp_interrupts_init; rpc->interrupts_uninit = gen_rp_interrupts_uninit; diff --git a/include/hw/pci/pcie_port.h b/include/hw/pci/pcie_port.h index 13332668e8..0736014bfd 100644 --- a/include/hw/pci/pcie_port.h +++ b/include/hw/pci/pcie_port.h @@ -65,6 +65,7 @@ void pcie_chassis_del_slot(PCIESlot *s); typedef struct PCIERootPortClass { PCIDeviceClass parent_class; + DeviceRealize parent_realize; uint8_t (*aer_vector)(const PCIDevice *dev); int (*interrupts_init)(PCIDevice *dev, Error **errp); From c1800a16276582f65807f464a6ab0b7c88a1c16e Mon Sep 17 00:00:00 2001 From: Aleksandr Bezzubikov Date: Fri, 18 Aug 2017 02:36:50 +0300 Subject: [PATCH 09/17] docs: update documentation considering PCIE-PCI bridge Signed-off-by: Aleksandr Bezzubikov Reviewed-by: Laszlo Ersek Reviewed-by: Marcel Apfelbaum Tested-by: Marcel Apfelbaum Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- docs/pcie.txt | 49 +++++++++-------- docs/pcie_pci_bridge.txt | 114 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 23 deletions(-) create mode 100644 docs/pcie_pci_bridge.txt diff --git a/docs/pcie.txt b/docs/pcie.txt index 5bada24a15..76b85ece32 100644 --- a/docs/pcie.txt +++ b/docs/pcie.txt @@ -46,7 +46,7 @@ Place only the following kinds of devices directly on the Root Complex: (2) PCI Express Root Ports (ioh3420), for starting exclusively PCI Express hierarchies. - (3) DMI-PCI Bridges (i82801b11-bridge), for starting legacy PCI + (3) PCI Express to PCI Bridge (pcie-pci-bridge), for starting legacy PCI hierarchies. (4) Extra Root Complexes (pxb-pcie), if multiple PCI Express Root Buses @@ -55,18 +55,18 @@ Place only the following kinds of devices directly on the Root Complex: pcie.0 bus ---------------------------------------------------------------------------- | | | | - ----------- ------------------ ------------------ -------------- - | PCI Dev | | PCIe Root Port | | DMI-PCI Bridge | | pxb-pcie | - ----------- ------------------ ------------------ -------------- + ----------- ------------------ ------------------- -------------- + | PCI Dev | | PCIe Root Port | | PCIe-PCI Bridge | | pxb-pcie | + ----------- ------------------ ------------------- -------------- 2.1.1 To plug a device into pcie.0 as a Root Complex Integrated Endpoint use: -device [,bus=pcie.0] 2.1.2 To expose a new PCI Express Root Bus use: -device pxb-pcie,id=pcie.1,bus_nr=x[,numa_node=y][,addr=z] - Only PCI Express Root Ports and DMI-PCI bridges can be connected - to the pcie.1 bus: + PCI Express Root Ports and PCI Express to PCI bridges can be + connected to the pcie.1 bus: -device ioh3420,id=root_port1[,bus=pcie.1][,chassis=x][,slot=y][,addr=z] \ - -device i82801b11-bridge,id=dmi_pci_bridge1,bus=pcie.1 + -device pcie-pci-bridge,id=pcie_pci_bridge1,bus=pcie.1 2.2 PCI Express only hierarchy @@ -130,24 +130,24 @@ Notes: Legacy PCI devices can be plugged into pcie.0 as Integrated Endpoints, but, as mentioned in section 5, doing so means the legacy PCI device in question will be incapable of hot-unplugging. -Besides that use DMI-PCI Bridges (i82801b11-bridge) in combination -with PCI-PCI Bridges (pci-bridge) to start PCI hierarchies. +Besides that use PCI Express to PCI Bridges (pcie-pci-bridge) in +combination with PCI-PCI Bridges (pci-bridge) to start PCI hierarchies. -Prefer flat hierarchies. For most scenarios a single DMI-PCI Bridge +Prefer flat hierarchies. For most scenarios a single PCI Express to PCI Bridge (having 32 slots) and several PCI-PCI Bridges attached to it (each supporting also 32 slots) will support hundreds of legacy devices. -The recommendation is to populate one PCI-PCI Bridge under the DMI-PCI Bridge -until is full and then plug a new PCI-PCI Bridge... +The recommendation is to populate one PCI-PCI Bridge under the +PCI Express to PCI Bridge until is full and then plug a new PCI-PCI Bridge... pcie.0 bus ---------------------------------------------- | | - ----------- ------------------ - | PCI Dev | | DMI-PCI BRIDGE | - ---------- ------------------ + ----------- ------------------- + | PCI Dev | | PCIe-PCI Bridge | + ----------- ------------------- | | ------------------ ------------------ - | PCI-PCI Bridge | | PCI-PCI Bridge | ... + | PCI-PCI Bridge | | PCI-PCI Bridge | ------------------ ------------------ | | ----------- ----------- @@ -157,11 +157,11 @@ until is full and then plug a new PCI-PCI Bridge... 2.3.1 To plug a PCI device into pcie.0 as an Integrated Endpoint use: -device [,bus=pcie.0] 2.3.2 Plugging a PCI device into a PCI-PCI Bridge: - -device i82801b11-bridge,id=dmi_pci_bridge1[,bus=pcie.0] \ - -device pci-bridge,id=pci_bridge1,bus=dmi_pci_bridge1[,chassis_nr=x][,addr=y] \ + -device pcie-pci-bridge,id=pcie_pci_bridge1[,bus=pcie.0] \ + -device pci-bridge,id=pci_bridge1,bus=pcie_pci_bridge1[,chassis_nr=x][,addr=y] \ -device ,bus=pci_bridge1[,addr=x] Note that 'addr' cannot be 0 unless shpc=off parameter is passed to - the PCI Bridge. + the PCI Bridge/PCI Express to PCI Bridge. 3. IO space issues =================== @@ -219,14 +219,16 @@ do not support hot-plug, so any devices plugged into Root Complexes cannot be hot-plugged/hot-unplugged: (1) PCI Express Integrated Endpoints (2) PCI Express Root Ports - (3) DMI-PCI Bridges + (3) PCI Express to PCI Bridges (4) pxb-pcie Be aware that PCI Express Downstream Ports can't be hot-plugged into an existing PCI Express Upstream Port. -PCI devices can be hot-plugged into PCI-PCI Bridges. The PCI hot-plug is ACPI -based and can work side by side with the PCI Express native hot-plug. +PCI devices can be hot-plugged into PCI Express to PCI and PCI-PCI Bridges. +The PCI hot-plug into PCI-PCI bridge is ACPI based, whereas hot-plug into +PCI Express to PCI bridges is SHPC-based. They both can work side by side with +the PCI Express native hot-plug. PCI Express devices can be natively hot-plugged/hot-unplugged into/from PCI Express Root Ports (and PCI Express Downstream Ports). @@ -234,10 +236,11 @@ PCI Express Root Ports (and PCI Express Downstream Ports). 5.1 Planning for hot-plug: (1) PCI hierarchy Leave enough PCI-PCI Bridge slots empty or add one - or more empty PCI-PCI Bridges to the DMI-PCI Bridge. + or more empty PCI-PCI Bridges to the PCI Express to PCI Bridge. For each such PCI-PCI Bridge the Guest Firmware is expected to reserve 4K IO space and 2M MMIO range to be used for all devices behind it. + Appropriate PCI capability is designed, see pcie_pci_bridge.txt. Because of the hard IO limit of around 10 PCI Bridges (~ 40K space) per system don't use more than 9 PCI-PCI Bridges, leaving 4K for the diff --git a/docs/pcie_pci_bridge.txt b/docs/pcie_pci_bridge.txt new file mode 100644 index 0000000000..5a4203f97c --- /dev/null +++ b/docs/pcie_pci_bridge.txt @@ -0,0 +1,114 @@ +Generic PCI Express to PCI Bridge +================================ + +Description +=========== +PCIE-to-PCI bridge is a new method for legacy PCI +hierarchies creation on Q35 machines. + +Previously Intel DMI-to-PCI bridge was used for this purpose. +But due to its strict limitations - no support of hot-plug, +no cross-platform and cross-architecture support - a new generic +PCIE-to-PCI bridge should now be used for any legacy PCI device usage +with PCI Express machine. + +This generic PCIE-PCI bridge is a cross-platform device, +can be hot-plugged into appropriate root port (requires additional actions, +see 'PCIE-PCI bridge hot-plug' section), +and supports devices hot-plug into the bridge itself +(with some limitations, see below). + +Hot-plug of legacy PCI devices into the bridge +is provided by bridge's built-in Standard hot-plug Controller. +Though it still has some limitations, see below. + +PCIE-PCI bridge hot-plug +======================= +Guest OSes require extra efforts to enable PCIE-PCI bridge hot-plug. +Motivation - now on init any PCI Express root port which doesn't have +any device plugged in, has no free buses reserved to provide any of them +to a hot-plugged devices in future. + +To solve this problem we reserve additional buses on a firmware level. +Currently only SeaBIOS is supported. +The way of bus number to reserve delivery is special +Red Hat vendor-specific PCI capability, added to the root port +that is planned to have PCIE-PCI bridge hot-plugged in. + +Capability layout (defined in include/hw/pci/pci_bridge.h): + + uint8_t id; Standard PCI capability header field + uint8_t next; Standard PCI capability header field + uint8_t len; Standard PCI vendor-specific capability header field + + uint8_t type; Red Hat vendor-specific capability type + List of currently existing types: + RESOURCE_RESERVE = 1 + + + uint32_t bus_res; Minimum number of buses to reserve + + uint64_t io; IO space to reserve + uint32_t mem Non-prefetchable memory to reserve + + At most one of the following two fields may be set to a value + different from -1: + uint32_t mem_pref_32; Prefetchable memory to reserve (32-bit MMIO) + uint64_t mem_pref_64; Prefetchable memory to reserve (64-bit MMIO) + +If any reservation field is -1 then this kind of reservation is not +needed and must be ignored by firmware. + +At the moment this capability is used only in QEMU generic PCIe root port +(-device pcie-root-port). Capability construction function takes all reservation +fields values from corresponding device properties. By default all of them are +set to -1 to leave root port's default behavior unchanged. + +Usage +===== +A detailed command line would be: + +[qemu-bin + storage options] \ +-m 2G \ +-device pcie-root-port,bus=pcie.0,id=rp1 \ +-device pcie-root-port,bus=pcie.0,id=rp2 \ +-device pcie-root-port,bus=pcie.0,id=rp3,bus-reserve=1 \ +-device pcie-pci-bridge,id=br1,bus=rp1 \ +-device pcie-pci-bridge,id=br2,bus=rp2 \ +-device e1000,bus=br1,addr=8 + +Then in monitor it's OK to execute next commands: +device_add pcie-pci-bridge,id=br3,bus=rp3 \ +device_add e1000,bus=br2,addr=1 \ +device_add e1000,bus=br3,addr=1 + +Here you have: + (1) Cold-plugged: + - Root ports: 1 QEMU generic root port with the capability mentioned above, + 2 QEMU generic root ports without this capability; + - 2 PCIE-PCI bridges plugged into 2 different root ports; + - e1000 plugged into the first bridge. + (2) Hot-plugged: + - PCIE-PCI bridge, plugged into QEMU generic root port; + - 2 e1000 cards, one plugged into the cold-plugged PCIE-PCI bridge, + another plugged into the hot-plugged bridge. + +Limitations +=========== +The PCIE-PCI bridge can be hot-plugged only into pcie-root-port that +has proper 'bus-reserve' property value to provide secondary bus for the +hot-plugged bridge. + +Windows 7 and older versions don't support hot-plug devices into the PCIE-PCI bridge. +To enable device hot-plug into the bridge on Linux there're 3 ways: +1) Build shpchp module with this patch http://www.spinics.net/lists/linux-pci/msg63052.html +2) Use kernel 4.14+ where the patch mentioned above is already merged. +3) Set 'msi' property to off - this forces the bridge to use legacy INTx, + which allows the bridge to notify the OS about hot-plug event without having + BUSMASTER set. + +Implementation +============== +The PCIE-PCI bridge is based on PCI-PCI bridge, but also accumulates PCI Express +features as a PCI Express device (is_express=1). + From 66a4a0318e6b9539505491e4576fb93a708095d8 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 17 Aug 2017 13:56:14 +0800 Subject: [PATCH 10/17] intel_iommu: fix missing BQL in pt fast path In vtd_switch_address_space() we did the memory region switch, however it's possible that the caller of it has not taken the BQL at all. Make sure we have it. CC: Paolo Bonzini CC: Jason Wang CC: Michael S. Tsirkin Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index a7bf87a19e..3a5bb0bc2e 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -957,6 +957,8 @@ static bool vtd_dev_pt_enabled(VTDAddressSpace *as) static bool vtd_switch_address_space(VTDAddressSpace *as) { bool use_iommu; + /* Whether we need to take the BQL on our own */ + bool take_bql = !qemu_mutex_iothread_locked(); assert(as); @@ -967,6 +969,15 @@ static bool vtd_switch_address_space(VTDAddressSpace *as) VTD_PCI_FUNC(as->devfn), use_iommu); + /* + * It's possible that we reach here without BQL, e.g., when called + * from vtd_pt_enable_fast_path(). However the memory APIs need + * it. We'd better make sure we have had it already, or, take it. + */ + if (take_bql) { + qemu_mutex_lock_iothread(); + } + /* Turn off first then on the other */ if (use_iommu) { memory_region_set_enabled(&as->sys_alias, false); @@ -976,6 +987,10 @@ static bool vtd_switch_address_space(VTDAddressSpace *as) memory_region_set_enabled(&as->sys_alias, true); } + if (take_bql) { + qemu_mutex_unlock_iothread(); + } + return use_iommu; } From 0b4a775188cbe411d9ce7073ea1a24b72848327e Mon Sep 17 00:00:00 2001 From: Yoni Bettan Date: Tue, 5 Sep 2017 11:46:34 +0300 Subject: [PATCH 11/17] acpi/vmgenid: change device category to misc Moved vmgenid from uncategorized to misc category in QEMU help menu Signed-off-by: Yoni Bettan Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/vmgenid.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/acpi/vmgenid.c b/hw/acpi/vmgenid.c index a32b847fe0..876723a4ce 100644 --- a/hw/acpi/vmgenid.c +++ b/hw/acpi/vmgenid.c @@ -240,6 +240,7 @@ static void vmgenid_device_class_init(ObjectClass *klass, void *data) dc->realize = vmgenid_realize; dc->hotpluggable = false; dc->props = vmgenid_properties; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); object_class_property_add_str(klass, VMGENID_GUID, NULL, vmgenid_set_guid, NULL); From 35480cbfcb73143af66c8de4b444d686a46c2e88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Tue, 29 Aug 2017 17:27:50 +0200 Subject: [PATCH 12/17] libvhost-user: support resuming vq->last_avail_idx based on used_idx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is the same workaround as commit 523b018dde3b765, which was lost with libvhost-user transition in commit e10e798c85c2331. Signed-off-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- contrib/libvhost-user/libvhost-user.c | 13 +++++++++++++ contrib/libvhost-user/libvhost-user.h | 7 +++++++ 2 files changed, 20 insertions(+) diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index 35fa0c5e56..d27d6303db 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -521,6 +521,19 @@ vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg) vq->used_idx = vq->vring.used->idx; + if (vq->last_avail_idx != vq->used_idx) { + bool resume = dev->iface->queue_is_processed_in_order && + dev->iface->queue_is_processed_in_order(dev, index); + + DPRINT("Last avail index != used index: %u != %u%s\n", + vq->last_avail_idx, vq->used_idx, + resume ? ", resuming" : ""); + + if (resume) { + vq->shadow_avail_idx = vq->last_avail_idx = vq->used_idx; + } + } + return false; } diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h index 53ef222c0b..4021f1124e 100644 --- a/contrib/libvhost-user/libvhost-user.h +++ b/contrib/libvhost-user/libvhost-user.h @@ -132,6 +132,7 @@ typedef void (*vu_set_features_cb) (VuDev *dev, uint64_t features); typedef int (*vu_process_msg_cb) (VuDev *dev, VhostUserMsg *vmsg, int *do_reply); typedef void (*vu_queue_set_started_cb) (VuDev *dev, int qidx, bool started); +typedef bool (*vu_queue_is_processed_in_order_cb) (VuDev *dev, int qidx); typedef struct VuDevIface { /* called by VHOST_USER_GET_FEATURES to get the features bitmask */ @@ -148,6 +149,12 @@ typedef struct VuDevIface { vu_process_msg_cb process_msg; /* tells when queues can be processed */ vu_queue_set_started_cb queue_set_started; + /* + * If the queue is processed in order, in which case it will be + * resumed to vring.used->idx. This can help to support resuming + * on unmanaged exit/crash. + */ + vu_queue_is_processed_in_order_cb queue_is_processed_in_order; } VuDevIface; typedef void (*vu_queue_handler_cb) (VuDev *dev, int qidx); From 672339f7eff5e9226f302037290e84e783d2b5cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Tue, 29 Aug 2017 17:27:51 +0200 Subject: [PATCH 13/17] vhost-user-bridge: fix resume regression (since 2.9) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit e10e798c85c2331 switched to libvhost-user which lacked support for resuming the avail_idx based on used_idx. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1485867 Signed-off-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/vhost-user-bridge.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c index 1e5b5ca3da..f922cc75ae 100644 --- a/tests/vhost-user-bridge.c +++ b/tests/vhost-user-bridge.c @@ -466,11 +466,18 @@ vubr_panic(VuDev *dev, const char *msg) vubr->quit = 1; } +static bool +vubr_queue_is_processed_in_order(VuDev *dev, int qidx) +{ + return true; +} + static const VuDevIface vuiface = { .get_features = vubr_get_features, .set_features = vubr_set_features, .process_msg = vubr_process_msg, .queue_set_started = vubr_queue_set_started, + .queue_is_processed_in_order = vubr_queue_is_processed_in_order, }; static void From c8389550dedc65892fba9c3df29423efd802f544 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Mon, 7 Aug 2017 18:45:13 +0200 Subject: [PATCH 14/17] vmgenid: replace x-write-pointer-available hack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This compat property sole function is to prevent the device from being instantiated. Instead of requiring an extra compat property, check if fw_cfg has DMA enabled. fw_cfg is a built-in device that is initialized very early by the machine init code. We have at least one other device that also assumes fw_cfg_find() can be safely used on realize: pvpanic. This has the additional benefit of handling other cases properly, like: $ qemu-system-x86_64 -device vmgenid -machine none qemu-system-x86_64: -device vmgenid: vmgenid requires DMA write support in fw_cfg, which this machine type does not provide $ qemu-system-x86_64 -device vmgenid -machine pc-i440fx-2.9 -global fw_cfg.dma_enabled=off qemu-system-x86_64: -device vmgenid: vmgenid requires DMA write support in fw_cfg, which this machine type does not provide $ qemu-system-x86_64 -device vmgenid -machine pc-i440fx-2.6 -global fw_cfg.dma_enabled=on [boots normally] Suggested-by: Eduardo Habkost Signed-off-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Reviewed-by: Eduardo Habkost Reviewed-by: Ben Warren Reviewed-by: Laszlo Ersek Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/bios-linker-loader.c | 10 ++++++++++ hw/acpi/vmgenid.c | 9 +-------- include/hw/acpi/bios-linker-loader.h | 2 ++ include/hw/acpi/vmgenid.h | 1 - include/hw/compat.h | 4 ---- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/hw/acpi/bios-linker-loader.c b/hw/acpi/bios-linker-loader.c index 046183a0f1..d16b8bbcb1 100644 --- a/hw/acpi/bios-linker-loader.c +++ b/hw/acpi/bios-linker-loader.c @@ -168,6 +168,16 @@ bios_linker_find_file(const BIOSLinker *linker, const char *name) return NULL; } +/* + * board code must realize fw_cfg first, as a fixed device, before + * another device realize function call bios_linker_loader_can_write_pointer() + */ +bool bios_linker_loader_can_write_pointer(void) +{ + FWCfgState *fw_cfg = fw_cfg_find(); + return fw_cfg && fw_cfg_dma_enabled(fw_cfg); +} + /* * bios_linker_loader_alloc: ask guest to load file into guest memory. * diff --git a/hw/acpi/vmgenid.c b/hw/acpi/vmgenid.c index 876723a4ce..2876d8a639 100644 --- a/hw/acpi/vmgenid.c +++ b/hw/acpi/vmgenid.c @@ -205,17 +205,11 @@ static void vmgenid_handle_reset(void *opaque) memset(vms->vmgenid_addr_le, 0, ARRAY_SIZE(vms->vmgenid_addr_le)); } -static Property vmgenid_properties[] = { - DEFINE_PROP_BOOL("x-write-pointer-available", VmGenIdState, - write_pointer_available, true), - DEFINE_PROP_END_OF_LIST(), -}; - static void vmgenid_realize(DeviceState *dev, Error **errp) { VmGenIdState *vms = VMGENID(dev); - if (!vms->write_pointer_available) { + if (!bios_linker_loader_can_write_pointer()) { error_setg(errp, "%s requires DMA write support in fw_cfg, " "which this machine type does not provide", VMGENID_DEVICE); return; @@ -239,7 +233,6 @@ static void vmgenid_device_class_init(ObjectClass *klass, void *data) dc->vmsd = &vmstate_vmgenid; dc->realize = vmgenid_realize; dc->hotpluggable = false; - dc->props = vmgenid_properties; set_bit(DEVICE_CATEGORY_MISC, dc->categories); object_class_property_add_str(klass, VMGENID_GUID, NULL, diff --git a/include/hw/acpi/bios-linker-loader.h b/include/hw/acpi/bios-linker-loader.h index efe17b0b9c..a711dbced8 100644 --- a/include/hw/acpi/bios-linker-loader.h +++ b/include/hw/acpi/bios-linker-loader.h @@ -7,6 +7,8 @@ typedef struct BIOSLinker { GArray *file_list; } BIOSLinker; +bool bios_linker_loader_can_write_pointer(void); + BIOSLinker *bios_linker_loader_init(void); void bios_linker_loader_alloc(BIOSLinker *linker, diff --git a/include/hw/acpi/vmgenid.h b/include/hw/acpi/vmgenid.h index 7beb9592fb..38586ecbdf 100644 --- a/include/hw/acpi/vmgenid.h +++ b/include/hw/acpi/vmgenid.h @@ -21,7 +21,6 @@ typedef struct VmGenIdState { DeviceClass parent_obj; QemuUUID guid; /* The 128-bit GUID seen by the guest */ uint8_t vmgenid_addr_le[8]; /* Address of the GUID (little-endian) */ - bool write_pointer_available; } VmGenIdState; /* returns NULL unless there is exactly one device */ diff --git a/include/hw/compat.h b/include/hw/compat.h index 3e101f8f67..9cc14dd798 100644 --- a/include/hw/compat.h +++ b/include/hw/compat.h @@ -153,10 +153,6 @@ .driver = "fw_cfg_io",\ .property = "dma_enabled",\ .value = "off",\ - },{\ - .driver = "vmgenid",\ - .property = "x-write-pointer-available",\ - .value = "off",\ }, #define HW_COMPAT_2_3 \ From 9b717a3a1318455afce761301fec114982ccbf1f Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Sun, 16 Jul 2017 21:27:33 +0100 Subject: [PATCH 15/17] pci: move check for existing devfn into new pci_bus_devfn_available() helper Also touch up the logic in do_pci_register_device() accordingly. Signed-off-by: Mark Cave-Ayland Reviewed-by: Marcel Apfelbaum Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pci.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 26f346db2c..002e66920c 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -953,6 +953,11 @@ uint16_t pci_requester_id(PCIDevice *dev) return pci_req_id_cache_extract(&dev->requester_id_cache); } +static bool pci_bus_devfn_available(PCIBus *bus, int devfn) +{ + return !(bus->devices[devfn]); +} + /* -1 for devfn means auto assign */ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus, const char *name, int devfn, @@ -976,14 +981,15 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus, if (devfn < 0) { for(devfn = bus->devfn_min ; devfn < ARRAY_SIZE(bus->devices); devfn += PCI_FUNC_MAX) { - if (!bus->devices[devfn]) + if (pci_bus_devfn_available(bus, devfn)) { goto found; + } } error_setg(errp, "PCI: no slot/function available for %s, all in use", name); return NULL; found: ; - } else if (bus->devices[devfn]) { + } else if (!pci_bus_devfn_available(bus, devfn)) { error_setg(errp, "PCI: slot %d function %d not available for %s," " in use by %s", PCI_SLOT(devfn), PCI_FUNC(devfn), name, From 8b8849844fd6a31956e934885f2a7ae9ac1a95d8 Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Sun, 16 Jul 2017 21:27:34 +0100 Subject: [PATCH 16/17] pci: add reserved slot check to do_pci_register_device() Add a new slot_reserved_mask bitmask to PCIBus indicating whether or not each PCI slot on the bus is reserved. Ensure that it is initialised to zero to maintain the existing behaviour that all slots are available by default, and add the additional check with appropriate error reporting to do_pci_register_device(). Signed-off-by: Mark Cave-Ayland Reviewed-by: Marcel Apfelbaum Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pci.c | 18 +++++++++++++++--- include/hw/pci/pci_bus.h | 1 + 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 002e66920c..21e203b056 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -373,6 +373,7 @@ static void pci_bus_init(PCIBus *bus, DeviceState *parent, { assert(PCI_FUNC(devfn_min) == 0); bus->devfn_min = devfn_min; + bus->slot_reserved_mask = 0x0; bus->address_space_mem = address_space_mem; bus->address_space_io = address_space_io; @@ -958,6 +959,11 @@ static bool pci_bus_devfn_available(PCIBus *bus, int devfn) return !(bus->devices[devfn]); } +static bool pci_bus_devfn_reserved(PCIBus *bus, int devfn) +{ + return bus->slot_reserved_mask & (1UL << PCI_SLOT(devfn)); +} + /* -1 for devfn means auto assign */ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus, const char *name, int devfn, @@ -981,14 +987,20 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus, if (devfn < 0) { for(devfn = bus->devfn_min ; devfn < ARRAY_SIZE(bus->devices); devfn += PCI_FUNC_MAX) { - if (pci_bus_devfn_available(bus, devfn)) { + if (pci_bus_devfn_available(bus, devfn) && + !pci_bus_devfn_reserved(bus, devfn)) { goto found; } } - error_setg(errp, "PCI: no slot/function available for %s, all in use", - name); + error_setg(errp, "PCI: no slot/function available for %s, all in use " + "or reserved", name); return NULL; found: ; + } else if (pci_bus_devfn_reserved(bus, devfn)) { + error_setg(errp, "PCI: slot %d function %d not available for %s," + " reserved", + PCI_SLOT(devfn), PCI_FUNC(devfn), name); + return NULL; } else if (!pci_bus_devfn_available(bus, devfn)) { error_setg(errp, "PCI: slot %d function %d not available for %s," " in use by %s", diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h index 5484a9b5c5..bc34fd0017 100644 --- a/include/hw/pci/pci_bus.h +++ b/include/hw/pci/pci_bus.h @@ -23,6 +23,7 @@ struct PCIBus { PCIIOMMUFunc iommu_fn; void *iommu_opaque; uint8_t devfn_min; + uint32_t slot_reserved_mask; pci_set_irq_fn set_irq; pci_map_irq_fn map_irq; pci_route_irq_fn route_intx_to_irq; From 6f6f4aec749ba9a4fb58c7c20536a61b0381ff35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Mon, 7 Aug 2017 20:16:11 +0200 Subject: [PATCH 17/17] fw_cfg: rename read callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The callback is called on select. Furthermore, the next patch introduced a new callback, so rename the function type with a generic name. Signed-off-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/core/loader.c | 2 +- hw/nvram/fw_cfg.c | 30 ++++++++++++++++-------------- include/hw/loader.h | 2 +- include/hw/nvram/fw_cfg.h | 7 ++++--- 4 files changed, 22 insertions(+), 19 deletions(-) diff --git a/hw/core/loader.c b/hw/core/loader.c index ebe574c7ea..4593061445 100644 --- a/hw/core/loader.c +++ b/hw/core/loader.c @@ -989,7 +989,7 @@ err: MemoryRegion *rom_add_blob(const char *name, const void *blob, size_t len, size_t max_len, hwaddr addr, const char *fw_file_name, - FWCfgReadCallback fw_callback, void *callback_opaque, + FWCfgCallback fw_callback, void *callback_opaque, AddressSpace *as, bool read_only) { MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c index 5bd904487f..e3bd626b8c 100644 --- a/hw/nvram/fw_cfg.c +++ b/hw/nvram/fw_cfg.c @@ -55,7 +55,7 @@ struct FWCfgEntry { bool allow_write; uint8_t *data; void *callback_opaque; - FWCfgReadCallback read_callback; + FWCfgCallback select_cb; }; #define JPG_FILE 0 @@ -236,8 +236,8 @@ static int fw_cfg_select(FWCfgState *s, uint16_t key) /* entry successfully selected, now run callback if present */ arch = !!(key & FW_CFG_ARCH_LOCAL); e = &s->entries[arch][key & FW_CFG_ENTRY_MASK]; - if (e->read_callback) { - e->read_callback(e->callback_opaque); + if (e->select_cb) { + e->select_cb(e->callback_opaque); } } @@ -568,11 +568,11 @@ static const VMStateDescription vmstate_fw_cfg = { } }; -static void fw_cfg_add_bytes_read_callback(FWCfgState *s, uint16_t key, - FWCfgReadCallback callback, - void *callback_opaque, - void *data, size_t len, - bool read_only) +static void fw_cfg_add_bytes_callback(FWCfgState *s, uint16_t key, + FWCfgCallback select_cb, + void *callback_opaque, + void *data, size_t len, + bool read_only) { int arch = !!(key & FW_CFG_ARCH_LOCAL); @@ -583,7 +583,7 @@ static void fw_cfg_add_bytes_read_callback(FWCfgState *s, uint16_t key, s->entries[arch][key].data = data; s->entries[arch][key].len = (uint32_t)len; - s->entries[arch][key].read_callback = callback; + s->entries[arch][key].select_cb = select_cb; s->entries[arch][key].callback_opaque = callback_opaque; s->entries[arch][key].allow_write = !read_only; } @@ -610,7 +610,7 @@ static void *fw_cfg_modify_bytes_read(FWCfgState *s, uint16_t key, void fw_cfg_add_bytes(FWCfgState *s, uint16_t key, void *data, size_t len) { - fw_cfg_add_bytes_read_callback(s, key, NULL, NULL, data, len, true); + fw_cfg_add_bytes_callback(s, key, NULL, NULL, data, len, true); } void fw_cfg_add_string(FWCfgState *s, uint16_t key, const char *value) @@ -736,7 +736,8 @@ static int get_fw_cfg_order(FWCfgState *s, const char *name) } void fw_cfg_add_file_callback(FWCfgState *s, const char *filename, - FWCfgReadCallback callback, void *callback_opaque, + FWCfgCallback select_cb, + void *callback_opaque, void *data, size_t len, bool read_only) { int i, index, count; @@ -798,9 +799,10 @@ void fw_cfg_add_file_callback(FWCfgState *s, const char *filename, } } - fw_cfg_add_bytes_read_callback(s, FW_CFG_FILE_FIRST + index, - callback, callback_opaque, data, len, - read_only); + fw_cfg_add_bytes_callback(s, FW_CFG_FILE_FIRST + index, + select_cb, + callback_opaque, data, len, + read_only); s->files->f[index].size = cpu_to_be32(len); s->files->f[index].select = cpu_to_be16(FW_CFG_FILE_FIRST + index); diff --git a/include/hw/loader.h b/include/hw/loader.h index 490c9ff8e6..355fe0f5a2 100644 --- a/include/hw/loader.h +++ b/include/hw/loader.h @@ -192,7 +192,7 @@ int rom_add_file(const char *file, const char *fw_dir, MemoryRegion *rom_add_blob(const char *name, const void *blob, size_t len, size_t max_len, hwaddr addr, const char *fw_file_name, - FWCfgReadCallback fw_callback, + FWCfgCallback fw_callback, void *callback_opaque, AddressSpace *as, bool read_only); int rom_add_elf_program(const char *name, void *data, size_t datasize, diff --git a/include/hw/nvram/fw_cfg.h b/include/hw/nvram/fw_cfg.h index b77ea48abb..f50d068563 100644 --- a/include/hw/nvram/fw_cfg.h +++ b/include/hw/nvram/fw_cfg.h @@ -44,7 +44,7 @@ typedef struct FWCfgDmaAccess { uint64_t address; } QEMU_PACKED FWCfgDmaAccess; -typedef void (*FWCfgReadCallback)(void *opaque); +typedef void (*FWCfgCallback)(void *opaque); struct FWCfgState { /*< private >*/ @@ -182,7 +182,7 @@ void fw_cfg_add_file(FWCfgState *s, const char *filename, void *data, * fw_cfg_add_file_callback: * @s: fw_cfg device being modified * @filename: name of new fw_cfg file item - * @callback: callback function + * @select_cb: callback function when selecting * @callback_opaque: argument to be passed into callback function * @data: pointer to start of item data * @len: size of item data @@ -201,7 +201,8 @@ void fw_cfg_add_file(FWCfgState *s, const char *filename, void *data, * with FW_CFG_DMA_CTL_SELECT). */ void fw_cfg_add_file_callback(FWCfgState *s, const char *filename, - FWCfgReadCallback callback, void *callback_opaque, + FWCfgCallback select_cb, + void *callback_opaque, void *data, size_t len, bool read_only); /**