diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 347718f938..5eba704477 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -24,6 +24,7 @@ #include "exec/address-spaces.h" #include "intel_iommu_internal.h" #include "hw/pci/pci.h" +#include "hw/pci/pci_bus.h" /*#define DEBUG_INTEL_IOMMU*/ #ifdef DEBUG_INTEL_IOMMU @@ -1871,6 +1872,16 @@ static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr, return ret; } +static void vtd_iommu_notify_started(MemoryRegion *iommu) +{ + VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); + + hw_error("Device at bus %s addr %02x.%d requires iommu notifier which " + "is currently not supported by intel-iommu emulation", + vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn), + PCI_FUNC(vtd_as->devfn)); +} + static const VMStateDescription vtd_vmstate = { .name = "iommu-intel", .unmigratable = 1, @@ -1938,6 +1949,7 @@ static void vtd_init(IntelIOMMUState *s) memset(s->womask, 0, DMAR_REG_SIZE); s->iommu_ops.translate = vtd_iommu_translate; + s->iommu_ops.notify_started = vtd_iommu_notify_started; s->root = 0; s->root_extended = false; s->dmar_enabled = false; diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 27cc1596f9..7be638e0e3 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -455,7 +455,8 @@ static void vfio_listener_region_del(MemoryListener *listener, QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { if (giommu->iommu == section->mr) { - memory_region_unregister_iommu_notifier(&giommu->n); + memory_region_unregister_iommu_notifier(giommu->iommu, + &giommu->n); QLIST_REMOVE(giommu, giommu_next); g_free(giommu); break; @@ -991,7 +992,7 @@ static void vfio_disconnect_container(VFIOGroup *group) QLIST_REMOVE(container, next); QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) { - memory_region_unregister_iommu_notifier(&giommu->n); + memory_region_unregister_iommu_notifier(giommu->iommu, &giommu->n); QLIST_REMOVE(giommu, giommu_next); g_free(giommu); } diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index 35d32b78f4..bec694c8d8 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -318,7 +318,7 @@ static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr) /* This windows doesn't seem to be used except by legacy VGA code */ if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || - !vdev->has_vga || nr != 4) { + !vdev->vga || nr != 4) { return; } @@ -366,7 +366,7 @@ static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr) /* Only enable on newer devices where BAR2 is 64bit */ if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || - !vdev->has_vga || nr != 2 || !vdev->bars[2].mem64) { + !vdev->vga || nr != 2 || !vdev->bars[2].mem64) { return; } @@ -660,7 +660,7 @@ static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr) VFIOConfigWindowQuirk *window; if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) || - !vdev->has_vga || nr != 5) { + !vdev->vga || nr != 5) { return; } @@ -776,7 +776,7 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr) QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); /* The 0x1800 offset mirror only seems to get used by legacy VGA */ - if (vdev->has_vga) { + if (vdev->vga) { quirk = g_malloc0(sizeof(*quirk)); mirror = quirk->data = g_malloc0(sizeof(*mirror)); mirror->mem = quirk->mem = g_new0(MemoryRegion, 1); diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 53b87b76ea..f2c679e47c 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1502,6 +1502,21 @@ static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos) return next - pos; } + +static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos) +{ + uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE; + + for (tmp = PCI_CONFIG_SPACE_SIZE; tmp; + tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) { + if (tmp > pos && tmp < next) { + next = tmp; + } + } + + return next - pos; +} + static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask) { pci_set_word(buf, (pci_get_word(buf) & ~mask) | val); @@ -1749,16 +1764,100 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) return 0; } +static int vfio_add_ext_cap(VFIOPCIDevice *vdev) +{ + PCIDevice *pdev = &vdev->pdev; + uint32_t header; + uint16_t cap_id, next, size; + uint8_t cap_ver; + uint8_t *config; + + /* Only add extended caps if we have them and the guest can see them */ + if (!pci_is_express(pdev) || !pci_bus_is_express(pdev->bus) || + !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) { + return 0; + } + + /* + * pcie_add_capability always inserts the new capability at the tail + * of the chain. Therefore to end up with a chain that matches the + * physical device, we cache the config space to avoid overwriting + * the original config space when we parse the extended capabilities. + */ + config = g_memdup(pdev->config, vdev->config_size); + + /* + * Extended capabilities are chained with each pointing to the next, so we + * can drop anything other than the head of the chain simply by modifying + * the previous next pointer. For the head of the chain, we can modify the + * capability ID to something that cannot match a valid capability. ID + * 0 is reserved for this since absence of capabilities is indicated by + * 0 for the ID, version, AND next pointer. However, pcie_add_capability() + * uses ID 0 as reserved for list management and will incorrectly match and + * assert if we attempt to pre-load the head of the chain with with this + * ID. Use ID 0xFFFF temporarily since it is also seems to be reserved in + * part for identifying absence of capabilities in a root complex register + * block. If the ID still exists after adding capabilities, switch back to + * zero. We'll mark this entire first dword as emulated for this purpose. + */ + pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE, + PCI_EXT_CAP(0xFFFF, 0, 0)); + pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0); + pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0); + + for (next = PCI_CONFIG_SPACE_SIZE; next; + next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) { + header = pci_get_long(config + next); + cap_id = PCI_EXT_CAP_ID(header); + cap_ver = PCI_EXT_CAP_VER(header); + + /* + * If it becomes important to configure extended capabilities to their + * actual size, use this as the default when it's something we don't + * recognize. Since QEMU doesn't actually handle many of the config + * accesses, exact size doesn't seem worthwhile. + */ + size = vfio_ext_cap_max_size(config, next); + + /* Use emulated next pointer to allow dropping extended caps */ + pci_long_test_and_set_mask(vdev->emulated_config_bits + next, + PCI_EXT_CAP_NEXT_MASK); + + switch (cap_id) { + case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */ + trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next); + break; + default: + pcie_add_capability(pdev, cap_id, cap_ver, next, size); + } + + } + + /* Cleanup chain head ID if necessary */ + if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) { + pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0); + } + + g_free(config); + return 0; +} + static int vfio_add_capabilities(VFIOPCIDevice *vdev) { PCIDevice *pdev = &vdev->pdev; + int ret; if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) || !pdev->config[PCI_CAPABILITY_LIST]) { return 0; /* Nothing to add */ } - return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]); + ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]); + if (ret) { + return ret; + } + + return vfio_add_ext_cap(vdev); } static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index b3eb0d838e..7d482d9d21 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -135,7 +135,6 @@ typedef struct VFIOPCIDevice { int32_t bootindex; uint32_t igd_gms; uint8_t pm_cap; - bool has_vga; bool pci_aer; bool req_enabled; bool has_flr; diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 9da0ff928b..a768fb54ec 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -37,6 +37,7 @@ vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: % vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s config:\n size: 0x%lx, offset: 0x%lx, flags: 0x%lx" vfio_populate_device_get_irq_info_failure(void) "VFIO_DEVICE_GET_IRQ_INFO failure: %m" vfio_initfn(const char *name, int group_id) " (%s) group %d" +vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s %x@%x" vfio_pci_reset(const char *name) " (%s)" vfio_pci_reset_flr(const char *name) "%s FLR/VFIO_DEVICE_RESET" vfio_pci_reset_pm(const char *name) "%s PCI PM Reset" diff --git a/include/exec/memory.h b/include/exec/memory.h index e3829f797a..23c7399131 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -153,6 +153,10 @@ struct MemoryRegionIOMMUOps { IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, bool is_write); /* Returns minimum supported page size */ uint64_t (*get_min_page_size)(MemoryRegion *iommu); + /* Called when the first notifier is set */ + void (*notify_started)(MemoryRegion *iommu); + /* Called when the last notifier is removed */ + void (*notify_stopped)(MemoryRegion *iommu); }; typedef struct CoalescedMemoryRange CoalescedMemoryRange; @@ -622,9 +626,11 @@ void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write); * memory_region_unregister_iommu_notifier: unregister a notifier for * changes to IOMMU translation entries. * + * @mr: the memory region which was observed and for which notity_stopped() + * needs to be called * @n: the notifier to be removed. */ -void memory_region_unregister_iommu_notifier(Notifier *n); +void memory_region_unregister_iommu_notifier(MemoryRegion *mr, Notifier *n); /** * memory_region_name: get a memory region's name diff --git a/memory.c b/memory.c index 8549c791d7..33799e810b 100644 --- a/memory.c +++ b/memory.c @@ -1499,6 +1499,10 @@ bool memory_region_is_logging(MemoryRegion *mr, uint8_t client) void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n) { + if (mr->iommu_ops->notify_started && + QLIST_EMPTY(&mr->iommu_notify.notifiers)) { + mr->iommu_ops->notify_started(mr); + } notifier_list_add(&mr->iommu_notify, n); } @@ -1532,9 +1536,13 @@ void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write) } } -void memory_region_unregister_iommu_notifier(Notifier *n) +void memory_region_unregister_iommu_notifier(MemoryRegion *mr, Notifier *n) { notifier_remove(n); + if (mr->iommu_ops->notify_stopped && + QLIST_EMPTY(&mr->iommu_notify.notifiers)) { + mr->iommu_ops->notify_stopped(mr); + } } void memory_region_notify_iommu(MemoryRegion *mr,