From ea486926b07d2ebd73ef67315ebb1eecf39faf5a Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 8 Oct 2012 08:45:29 -0600 Subject: [PATCH 01/13] vfio-pci: Update slow path INTx algorithm We can't afford the overhead of switching out and back into mmap mode around each interrupt, but we can do it lazily via a timer. On INTx interrupt, disable the mmap'd memory regions and set a timer. On every interrupt, push the timer out. If the timer expires and the interrupt is no longer pending, switch back to mmap mode. This has the benefit that things like graphics cards, which rarely or never, fire an interrupt don't need manual user intervention to add the x-intx=off parameter. They'll just remain in mmap mode until they trigger an interrupt, and if they don't continue to regularly fire interrupts, they'll switch back. The default timeout is tuned for network cards so that a ping is just enough to keep them in non-mmap mode, where they have much better latency. It is tunable with an experimental option, x-intx-mmap-timeout-ms. A value of 0 keeps the device in non-mmap mode after the first interrupt. It's possible we could look at the class code of devices and come up with reasonable per-class defaults based on expected interrupt frequency and latency. None of this is used for MSI interrupts and also won't be used if we can bypass through KVM. Signed-off-by: Alex Williamson --- hw/vfio_pci.c | 65 ++++++++++++++++++++++++++++++----------------- hw/vfio_pci_int.h | 4 +-- 2 files changed, 44 insertions(+), 25 deletions(-) diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index a1eeced8fd..7ec9c30cd8 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -92,6 +92,34 @@ static void vfio_unmask_intx(VFIODevice *vdev) ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); } +/* + * Disabling BAR mmaping can be slow, but toggling it around INTx can + * also be a huge overhead. We try to get the best of both worlds by + * waiting until an interrupt to disable mmaps (subsequent transitions + * to the same state are effectively no overhead). If the interrupt has + * been serviced and the time gap is long enough, we re-enable mmaps for + * performance. This works well for things like graphics cards, which + * may not use their interrupt at all and are penalized to an unusable + * level by read/write BAR traps. Other devices, like NICs, have more + * regular interrupts and see much better latency by staying in non-mmap + * mode. We therefore set the default mmap_timeout such that a ping + * is just enough to keep the mmap disabled. Users can experiment with + * other options with the x-intx-mmap-timeout-ms parameter (a value of + * zero disables the timer). + */ +static void vfio_intx_mmap_enable(void *opaque) +{ + VFIODevice *vdev = opaque; + + if (vdev->intx.pending) { + qemu_mod_timer(vdev->intx.mmap_timer, + qemu_get_clock_ms(vm_clock) + vdev->intx.mmap_timeout); + return; + } + + vfio_mmap_set_enabled(vdev, true); +} + static void vfio_intx_interrupt(void *opaque) { VFIODevice *vdev = opaque; @@ -106,6 +134,11 @@ static void vfio_intx_interrupt(void *opaque) vdev->intx.pending = true; qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1); + vfio_mmap_set_enabled(vdev, false); + if (vdev->intx.mmap_timeout) { + qemu_mod_timer(vdev->intx.mmap_timer, + qemu_get_clock_ms(vm_clock) + vdev->intx.mmap_timeout); + } } static void vfio_eoi(VFIODevice *vdev) @@ -141,7 +174,7 @@ static int vfio_enable_intx(VFIODevice *vdev) uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1); int ret; - if (vdev->intx.disabled || !pin) { + if (!pin) { return 0; } @@ -162,16 +195,6 @@ static int vfio_enable_intx(VFIODevice *vdev) return -errno; } - /* - * Disable mmaps so we can trap on BAR accesses. We interpret any - * access as a response to an interrupt and unmask the physical - * device. The device will re-assert if the interrupt is still - * pending. We'll likely retrigger on the host multiple times per - * guest interrupt, but without EOI notification it's better than - * nothing. Acceleration paths through KVM will avoid this. - */ - vfio_mmap_set_enabled(vdev, false); - vdev->interrupt = VFIO_INT_INTx; DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain, @@ -184,6 +207,7 @@ static void vfio_disable_intx(VFIODevice *vdev) { int fd; + qemu_del_timer(vdev->intx.mmap_timer); vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX); vdev->intx.pending = false; qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0); @@ -1766,17 +1790,8 @@ static int vfio_initfn(PCIDevice *pdev) } if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { - if (vdev->intx.intx && strcmp(vdev->intx.intx, "off")) { - error_report("vfio: Unknown option x-intx=%s, " - "valid options: \"off\".\n", vdev->intx.intx); - ret = -EINVAL; - goto out_teardown; - } - - if (vdev->intx.intx && !strcmp(vdev->intx.intx, "off")) { - vdev->intx.disabled = true; - } - + vdev->intx.mmap_timer = qemu_new_timer_ms(vm_clock, + vfio_intx_mmap_enable, vdev); ret = vfio_enable_intx(vdev); if (ret) { goto out_teardown; @@ -1802,6 +1817,9 @@ static void vfio_exitfn(PCIDevice *pdev) pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); vfio_disable_interrupts(vdev); + if (vdev->intx.mmap_timer) { + qemu_free_timer(vdev->intx.mmap_timer); + } vfio_teardown_msi(vdev); vfio_unmap_bars(vdev); vfio_put_device(vdev); @@ -1826,7 +1844,8 @@ static void vfio_pci_reset(DeviceState *dev) static Property vfio_pci_dev_properties[] = { DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host), - DEFINE_PROP_STRING("x-intx", VFIODevice, intx.intx), + DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIODevice, + intx.mmap_timeout, 1100), /* * TODO - support passed fds... is this necessary? * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name), diff --git a/hw/vfio_pci_int.h b/hw/vfio_pci_int.h index 3812d8d7f1..e69bf5f939 100644 --- a/hw/vfio_pci_int.h +++ b/hw/vfio_pci_int.h @@ -36,8 +36,8 @@ typedef struct VFIOINTx { EventNotifier interrupt; /* eventfd triggered on interrupt */ EventNotifier unmask; /* eventfd for unmask on QEMU bypass */ PCIINTxRoute route; /* routing info for QEMU bypass */ - bool disabled; - char *intx; + uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */ + QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */ } VFIOINTx; struct VFIODevice; From af6bc27e39fafadbc03c8876ffa95851c5869683 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 8 Oct 2012 08:45:29 -0600 Subject: [PATCH 02/13] vfio-pci: Re-order map/unmap This cleans up the next patch that calls unmap from map. Signed-off-by: Alex Williamson --- hw/vfio_pci.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index 7ec9c30cd8..2d89d171ee 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -786,6 +786,24 @@ static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, /* * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 */ +static int vfio_dma_unmap(VFIOContainer *container, + target_phys_addr_t iova, ram_addr_t size) +{ + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = 0, + .iova = iova, + .size = size, + }; + + if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno); + return -errno; + } + + return 0; +} + static int vfio_dma_map(VFIOContainer *container, target_phys_addr_t iova, ram_addr_t size, void *vaddr, bool readonly) { @@ -809,24 +827,6 @@ static int vfio_dma_map(VFIOContainer *container, target_phys_addr_t iova, return 0; } -static int vfio_dma_unmap(VFIOContainer *container, - target_phys_addr_t iova, ram_addr_t size) -{ - struct vfio_iommu_type1_dma_unmap unmap = { - .argsz = sizeof(unmap), - .flags = 0, - .iova = iova, - .size = size, - }; - - if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { - DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno); - return -errno; - } - - return 0; -} - static void vfio_listener_dummy1(MemoryListener *listener) { /* We don't do batching (begin/commit) or care about logging */ From 12af1344871aee4a8df011c3b0548f7c77332d54 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 8 Oct 2012 08:45:29 -0600 Subject: [PATCH 03/13] vfio-pci: Unmap and retry DMA mapping Occasionally we get regions added that overlap with existing mappings. These always seems to be in the VGA ROM range. VFIO returns EBUSY for these mapping attempts. We can try a little harder and assume that the latest mapping is correct by removing any overlapping ranges and retrying the original request. Signed-off-by: Alex Williamson --- hw/vfio_pci.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index 2d89d171ee..7413f2d531 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -819,12 +819,19 @@ static int vfio_dma_map(VFIOContainer *container, target_phys_addr_t iova, map.flags |= VFIO_DMA_MAP_FLAG_WRITE; } - if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) { - DPRINTF("VFIO_MAP_DMA: %d\n", -errno); - return -errno; + /* + * Try the mapping, if it fails with EBUSY, unmap the region and try + * again. This shouldn't be necessary, but we sometimes see it in + * the the VGA ROM space. + */ + if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || + (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 && + ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { + return 0; } - return 0; + DPRINTF("VFIO_MAP_DMA: %d\n", -errno); + return -errno; } static void vfio_listener_dummy1(MemoryListener *listener) From fd704adc479810477606c3418aeb64a590f51fe3 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 8 Oct 2012 08:45:29 -0600 Subject: [PATCH 04/13] vfio-pci: Rework MSIX setup/teardown We try to do lazy initialization of MSIX since we don't actually need to setup anything until MSIX vectors start getting used. This leads to problems if MSIX is enabled, but never used (we can end up trying to re-enable INTx while it's still enabled). We also run into problems trying to expand our reset function to tear down interrupts as we can then get vector release notifications after we've released data structures. By making explicit initialization and teardown we can avoid both of these problems and behave more similar to bare metal. Signed-off-by: Alex Williamson --- hw/vfio_pci.c | 108 +++++++++++++++++++++++++------------------------- 1 file changed, 55 insertions(+), 53 deletions(-) diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index 7413f2d531..89e00ba5e6 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -72,8 +72,6 @@ static void vfio_disable_irqindex(VFIODevice *vdev, int index) }; ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); - - vdev->interrupt = VFIO_INT_NONE; } /* @@ -278,10 +276,6 @@ static int vfio_enable_vectors(VFIODevice *vdev, bool msix) g_free(irq_set); - if (!ret) { - vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI; - } - return ret; } @@ -296,15 +290,6 @@ static int vfio_msix_vector_use(PCIDevice *pdev, vdev->host.domain, vdev->host.bus, vdev->host.slot, vdev->host.function, nr); - if (vdev->interrupt != VFIO_INT_MSIX) { - vfio_disable_interrupts(vdev); - } - - if (!vdev->msi_vectors) { - vdev->msi_vectors = g_malloc0(vdev->msix->entries * - sizeof(VFIOMSIVector)); - } - vector = &vdev->msi_vectors[nr]; vector->vdev = vdev; vector->use = true; @@ -457,6 +442,23 @@ static void msi_set_qsize(PCIDevice *pdev, uint8_t size) pci_set_word(config + PCI_MSI_FLAGS, flags); } +static void vfio_enable_msix(VFIODevice *vdev) +{ + vfio_disable_interrupts(vdev); + + vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(VFIOMSIVector)); + + vdev->interrupt = VFIO_INT_MSIX; + + if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, + vfio_msix_vector_release)) { + error_report("vfio: msix_set_vector_notifiers failed\n"); + } + + DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); +} + static void vfio_enable_msi(VFIODevice *vdev) { int ret, i; @@ -529,17 +531,43 @@ retry: msi_set_qsize(&vdev->pdev, vdev->nr_vectors); + vdev->interrupt = VFIO_INT_MSI; + DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__, vdev->host.domain, vdev->host.bus, vdev->host.slot, vdev->host.function, vdev->nr_vectors); } -static void vfio_disable_msi_x(VFIODevice *vdev, bool msix) +static void vfio_disable_msi_common(VFIODevice *vdev) +{ + g_free(vdev->msi_vectors); + vdev->msi_vectors = NULL; + vdev->nr_vectors = 0; + vdev->interrupt = VFIO_INT_NONE; + + vfio_enable_intx(vdev); +} + +static void vfio_disable_msix(VFIODevice *vdev) +{ + msix_unset_vector_notifiers(&vdev->pdev); + + if (vdev->nr_vectors) { + vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX); + } + + vfio_disable_msi_common(vdev); + + DPRINTF("%s(%04x:%02x:%02x.%x, msi%s)\n", __func__, + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function, msix ? "x" : ""); +} + +static void vfio_disable_msi(VFIODevice *vdev) { int i; - vfio_disable_irqindex(vdev, msix ? VFIO_PCI_MSIX_IRQ_INDEX : - VFIO_PCI_MSI_IRQ_INDEX); + vfio_disable_irqindex(vdev, VFIO_PCI_MSI_IRQ_INDEX); for (i = 0; i < vdev->nr_vectors; i++) { VFIOMSIVector *vector = &vdev->msi_vectors[i]; @@ -558,26 +586,15 @@ static void vfio_disable_msi_x(VFIODevice *vdev, bool msix) NULL, NULL, NULL); } - if (msix) { - msix_vector_unuse(&vdev->pdev, i); - } - event_notifier_cleanup(&vector->interrupt); } - g_free(vdev->msi_vectors); - vdev->msi_vectors = NULL; - vdev->nr_vectors = 0; + vfio_disable_msi_common(vdev); - if (!msix) { - msi_set_qsize(&vdev->pdev, 0); /* Actually still means 1 vector */ - } + msi_set_qsize(&vdev->pdev, 0); /* Actually still means 1 vector */ - DPRINTF("%s(%04x:%02x:%02x.%x, msi%s)\n", __func__, - vdev->host.domain, vdev->host.bus, vdev->host.slot, - vdev->host.function, msix ? "x" : ""); - - vfio_enable_intx(vdev); + DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); } /* @@ -763,7 +780,7 @@ static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, if (!was_enabled && is_enabled) { vfio_enable_msi(vdev); } else if (was_enabled && !is_enabled) { - vfio_disable_msi_x(vdev, false); + vfio_disable_msi(vdev); } } @@ -776,9 +793,9 @@ static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, is_enabled = msix_enabled(pdev); if (!was_enabled && is_enabled) { - /* vfio_msix_vector_use handles this automatically */ + vfio_enable_msix(vdev); } else if (was_enabled && !is_enabled) { - vfio_disable_msi_x(vdev, true); + vfio_disable_msix(vdev); } } } @@ -973,10 +990,10 @@ static void vfio_disable_interrupts(VFIODevice *vdev) vfio_disable_intx(vdev); break; case VFIO_INT_MSI: - vfio_disable_msi_x(vdev, false); + vfio_disable_msi(vdev); break; case VFIO_INT_MSIX: - vfio_disable_msi_x(vdev, true); + vfio_disable_msix(vdev); break; } } @@ -1094,15 +1111,6 @@ static int vfio_setup_msix(VFIODevice *vdev, int pos) return ret; } - ret = msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, - vfio_msix_vector_release); - if (ret) { - error_report("vfio: msix_set_vector_notifiers failed %d\n", ret); - msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem, - &vdev->bars[vdev->msix->pba_bar].mem); - return ret; - } - return 0; } @@ -1111,12 +1119,6 @@ static void vfio_teardown_msi(VFIODevice *vdev) msi_uninit(&vdev->pdev); if (vdev->msix) { - /* FIXME: Why can't unset just silently do nothing?? */ - if (vdev->pdev.msix_vector_use_notifier && - vdev->pdev.msix_vector_release_notifier) { - msix_unset_vector_notifiers(&vdev->pdev); - } - msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem, &vdev->bars[vdev->msix->pba_bar].mem); } From 98cd5a5eaf4ad96d240e680eb1b26460dc278c33 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 8 Oct 2012 08:45:29 -0600 Subject: [PATCH 05/13] vfio-pci: No spurious MSIs FreeBSD doesn't like these spurious MSIs, remove them as they're mostly paranoia anyway. Signed-off-by: Alex Williamson --- hw/vfio_pci.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index 89e00ba5e6..bca40835d6 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -322,21 +322,12 @@ static int vfio_msix_vector_use(PCIDevice *pdev, * increase them as needed. */ if (vdev->nr_vectors < nr + 1) { - int i; - vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX); vdev->nr_vectors = nr + 1; ret = vfio_enable_vectors(vdev, true); if (ret) { error_report("vfio: failed to enable vectors, %d\n", ret); } - - /* We don't know if we've missed interrupts in the interim... */ - for (i = 0; i < vdev->msix->entries; i++) { - if (vdev->msi_vectors[i].use) { - msix_notify(&vdev->pdev, i); - } - } } else { VFIOIRQSetFD irq_set_fd = { .irq_set = { @@ -353,12 +344,6 @@ static int vfio_msix_vector_use(PCIDevice *pdev, if (ret) { error_report("vfio: failed to modify vector, %d\n", ret); } - - /* - * If we were connected to the hardware PBA we could skip this, - * until then, a spurious interrupt is better than starvation. - */ - msix_notify(&vdev->pdev, nr); } return 0; From 5c97e5eba665b66395eee87d88cf2f8301594145 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 8 Oct 2012 08:45:30 -0600 Subject: [PATCH 06/13] vfio-pci: Roll the header into the .c file It's only ~100 lines and nobody else should be using this. Suggested by Michael Tsirkin. Signed-off-by: Alex Williamson --- hw/vfio_pci.c | 97 ++++++++++++++++++++++++++++++++++++++- hw/vfio_pci_int.h | 114 ---------------------------------------------- 2 files changed, 96 insertions(+), 115 deletions(-) delete mode 100644 hw/vfio_pci_int.h diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index bca40835d6..4d0170756c 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -33,9 +33,11 @@ #include "memory.h" #include "msi.h" #include "msix.h" +#include "pci.h" +#include "qemu-common.h" #include "qemu-error.h" +#include "qemu-queue.h" #include "range.h" -#include "vfio_pci_int.h" /* #define DEBUG_VFIO */ #ifdef DEBUG_VFIO @@ -46,6 +48,99 @@ do { } while (0) #endif +typedef struct VFIOBAR { + off_t fd_offset; /* offset of BAR within device fd */ + int fd; /* device fd, allows us to pass VFIOBAR as opaque data */ + MemoryRegion mem; /* slow, read/write access */ + MemoryRegion mmap_mem; /* direct mapped access */ + void *mmap; + size_t size; + uint32_t flags; /* VFIO region flags (rd/wr/mmap) */ + uint8_t nr; /* cache the BAR number for debug */ +} VFIOBAR; + +typedef struct VFIOINTx { + bool pending; /* interrupt pending */ + bool kvm_accel; /* set when QEMU bypass through KVM enabled */ + uint8_t pin; /* which pin to pull for qemu_set_irq */ + EventNotifier interrupt; /* eventfd triggered on interrupt */ + EventNotifier unmask; /* eventfd for unmask on QEMU bypass */ + PCIINTxRoute route; /* routing info for QEMU bypass */ + uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */ + QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */ +} VFIOINTx; + +struct VFIODevice; + +typedef struct VFIOMSIVector { + EventNotifier interrupt; /* eventfd triggered on interrupt */ + struct VFIODevice *vdev; /* back pointer to device */ + int virq; /* KVM irqchip route for QEMU bypass */ + bool use; +} VFIOMSIVector; + +enum { + VFIO_INT_NONE = 0, + VFIO_INT_INTx = 1, + VFIO_INT_MSI = 2, + VFIO_INT_MSIX = 3, +}; + +struct VFIOGroup; + +typedef struct VFIOContainer { + int fd; /* /dev/vfio/vfio, empowered by the attached groups */ + struct { + /* enable abstraction to support various iommu backends */ + union { + MemoryListener listener; /* Used by type1 iommu */ + }; + void (*release)(struct VFIOContainer *); + } iommu_data; + QLIST_HEAD(, VFIOGroup) group_list; + QLIST_ENTRY(VFIOContainer) next; +} VFIOContainer; + +/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */ +typedef struct VFIOMSIXInfo { + uint8_t table_bar; + uint8_t pba_bar; + uint16_t entries; + uint32_t table_offset; + uint32_t pba_offset; + MemoryRegion mmap_mem; + void *mmap; +} VFIOMSIXInfo; + +typedef struct VFIODevice { + PCIDevice pdev; + int fd; + VFIOINTx intx; + unsigned int config_size; + off_t config_offset; /* Offset of config space region within device fd */ + unsigned int rom_size; + off_t rom_offset; /* Offset of ROM region within device fd */ + int msi_cap_size; + VFIOMSIVector *msi_vectors; + VFIOMSIXInfo *msix; + int nr_vectors; /* Number of MSI/MSIX vectors currently in use */ + int interrupt; /* Current interrupt type */ + VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ + PCIHostDeviceAddress host; + QLIST_ENTRY(VFIODevice) next; + struct VFIOGroup *group; + bool reset_works; +} VFIODevice; + +typedef struct VFIOGroup { + int fd; + int groupid; + VFIOContainer *container; + QLIST_HEAD(, VFIODevice) device_list; + QLIST_ENTRY(VFIOGroup) next; + QLIST_ENTRY(VFIOGroup) container_next; +} VFIOGroup; + #define MSIX_CAP_LENGTH 12 static QLIST_HEAD(, VFIOContainer) diff --git a/hw/vfio_pci_int.h b/hw/vfio_pci_int.h deleted file mode 100644 index e69bf5f939..0000000000 --- a/hw/vfio_pci_int.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * vfio based device assignment support - * - * Copyright Red Hat, Inc. 2012 - * - * Authors: - * Alex Williamson - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - */ - -#ifndef HW_VFIO_PCI_INT_H -#define HW_VFIO_PCI_INT_H - -#include "qemu-common.h" -#include "qemu-queue.h" -#include "pci.h" -#include "event_notifier.h" - -typedef struct VFIOBAR { - off_t fd_offset; /* offset of BAR within device fd */ - int fd; /* device fd, allows us to pass VFIOBAR as opaque data */ - MemoryRegion mem; /* slow, read/write access */ - MemoryRegion mmap_mem; /* direct mapped access */ - void *mmap; - size_t size; - uint32_t flags; /* VFIO region flags (rd/wr/mmap) */ - uint8_t nr; /* cache the BAR number for debug */ -} VFIOBAR; - -typedef struct VFIOINTx { - bool pending; /* interrupt pending */ - bool kvm_accel; /* set when QEMU bypass through KVM enabled */ - uint8_t pin; /* which pin to pull for qemu_set_irq */ - EventNotifier interrupt; /* eventfd triggered on interrupt */ - EventNotifier unmask; /* eventfd for unmask on QEMU bypass */ - PCIINTxRoute route; /* routing info for QEMU bypass */ - uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */ - QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */ -} VFIOINTx; - -struct VFIODevice; - -typedef struct VFIOMSIVector { - EventNotifier interrupt; /* eventfd triggered on interrupt */ - struct VFIODevice *vdev; /* back pointer to device */ - int virq; /* KVM irqchip route for QEMU bypass */ - bool use; -} VFIOMSIVector; - -enum { - VFIO_INT_NONE = 0, - VFIO_INT_INTx = 1, - VFIO_INT_MSI = 2, - VFIO_INT_MSIX = 3, -}; - -struct VFIOGroup; - -typedef struct VFIOContainer { - int fd; /* /dev/vfio/vfio, empowered by the attached groups */ - struct { - /* enable abstraction to support various iommu backends */ - union { - MemoryListener listener; /* Used by type1 iommu */ - }; - void (*release)(struct VFIOContainer *); - } iommu_data; - QLIST_HEAD(, VFIOGroup) group_list; - QLIST_ENTRY(VFIOContainer) next; -} VFIOContainer; - -/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */ -typedef struct VFIOMSIXInfo { - uint8_t table_bar; - uint8_t pba_bar; - uint16_t entries; - uint32_t table_offset; - uint32_t pba_offset; - MemoryRegion mmap_mem; - void *mmap; -} VFIOMSIXInfo; - -typedef struct VFIODevice { - PCIDevice pdev; - int fd; - VFIOINTx intx; - unsigned int config_size; - off_t config_offset; /* Offset of config space region within device fd */ - unsigned int rom_size; - off_t rom_offset; /* Offset of ROM region within device fd */ - int msi_cap_size; - VFIOMSIVector *msi_vectors; - VFIOMSIXInfo *msix; - int nr_vectors; /* Number of MSI/MSIX vectors currently in use */ - int interrupt; /* Current interrupt type */ - VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ - PCIHostDeviceAddress host; - QLIST_ENTRY(VFIODevice) next; - struct VFIOGroup *group; - bool reset_works; -} VFIODevice; - -typedef struct VFIOGroup { - int fd; - int groupid; - VFIOContainer *container; - QLIST_HEAD(, VFIODevice) device_list; - QLIST_ENTRY(VFIOGroup) next; - QLIST_ENTRY(VFIOGroup) container_next; -} VFIOGroup; - -#endif /* HW_VFIO_PCI_INT_H */ From e43b9a5a4fb2b501fbbec411faff899b74a2d451 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 8 Oct 2012 08:45:30 -0600 Subject: [PATCH 07/13] vfio-pci: Don't peak at msi_supported Let the init function fail, just don't warn for -ENOTSUP. Signed-off-by: Alex Williamson --- hw/vfio_pci.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index 4d0170756c..d62ddd152f 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -1084,14 +1084,6 @@ static int vfio_setup_msi(VFIODevice *vdev, int pos) bool msi_64bit, msi_maskbit; int ret, entries; - /* - * TODO: don't peek into msi_supported, let msi_init fail and - * check for ENOTSUP - */ - if (!msi_supported) { - return 0; - } - if (pread(vdev->fd, &ctrl, sizeof(ctrl), vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { return -errno; @@ -1107,6 +1099,9 @@ static int vfio_setup_msi(VFIODevice *vdev, int pos) ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit); if (ret < 0) { + if (ret == -ENOTSUP) { + return 0; + } error_report("vfio: msi_init failed\n"); return ret; } @@ -1173,20 +1168,15 @@ static int vfio_setup_msix(VFIODevice *vdev, int pos) { int ret; - /* - * TODO: don't peek into msi_supported, let msix_init fail and - * check for ENOTSUP - */ - if (!msi_supported) { - return 0; - } - ret = msix_init(&vdev->pdev, vdev->msix->entries, &vdev->bars[vdev->msix->table_bar].mem, vdev->msix->table_bar, vdev->msix->table_offset, &vdev->bars[vdev->msix->pba_bar].mem, vdev->msix->pba_bar, vdev->msix->pba_offset, pos); if (ret < 0) { + if (ret == -ENOTSUP) { + return 0; + } error_report("vfio: msix_init failed\n"); return ret; } From 5976cdd58b657692e8c1af5310d55a60aecc9089 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 8 Oct 2012 08:45:30 -0600 Subject: [PATCH 08/13] vfio-pci: Use uintptr_t for void* cast We don't seem to run into any sign extension problems, but unsigned looks more correct. Signed-off-by: Alex williamson --- hw/vfio_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index d62ddd152f..0ca77cf9d1 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -907,7 +907,7 @@ static int vfio_dma_map(VFIOContainer *container, target_phys_addr_t iova, struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map), .flags = VFIO_DMA_MAP_FLAG_READ, - .vaddr = (__u64)(intptr_t)vaddr, + .vaddr = (__u64)(uintptr_t)vaddr, .iova = iova, .size = size, }; From 9b1e45c8f1355592cb60c6aed9ac1f90b606a1a8 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 8 Oct 2012 08:45:30 -0600 Subject: [PATCH 09/13] vfio-pci: Remove setting of MSI qsize This was a misinterpretation of the spec, hardware doesn't get to specify how many were actually enabled through this field. Signed-off-by: Alex Williamson --- hw/vfio_pci.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index 0ca77cf9d1..a554e7cc65 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -508,20 +508,6 @@ static MSIMessage msi_get_msg(PCIDevice *pdev, unsigned int vector) return msg; } -/* So should this */ -static void msi_set_qsize(PCIDevice *pdev, uint8_t size) -{ - uint8_t *config = pdev->config + pdev->msi_cap; - uint16_t flags; - - flags = pci_get_word(config + PCI_MSI_FLAGS); - flags = le16_to_cpu(flags); - flags &= ~PCI_MSI_FLAGS_QSIZE; - flags |= (size & 0x7) << 4; - flags = cpu_to_le16(flags); - pci_set_word(config + PCI_MSI_FLAGS, flags); -} - static void vfio_enable_msix(VFIODevice *vdev) { vfio_disable_interrupts(vdev); @@ -609,8 +595,6 @@ retry: return; } - msi_set_qsize(&vdev->pdev, vdev->nr_vectors); - vdev->interrupt = VFIO_INT_MSI; DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__, @@ -671,8 +655,6 @@ static void vfio_disable_msi(VFIODevice *vdev) vfio_disable_msi_common(vdev); - msi_set_qsize(&vdev->pdev, 0); /* Actually still means 1 vector */ - DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain, vdev->host.bus, vdev->host.slot, vdev->host.function); } From 5834a83f4803de88949162346e6dfa2060d3fca6 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 8 Oct 2012 08:45:30 -0600 Subject: [PATCH 10/13] vfio-pci: Extend reset Take what we've learned from pci-assign and apply it to vfio-pci. On reset, disable previous interrupt config, perform a device reset if available, re-enable INTx, and disable memory regions on the device to prevent continuing DMA. Signed-off-by: Alex Williamson --- hw/vfio_pci.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index a554e7cc65..b2383c4b55 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -1891,16 +1891,31 @@ static void vfio_pci_reset(DeviceState *dev) { PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev); VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + uint16_t cmd; - if (!vdev->reset_works) { - return; + DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + + vfio_disable_interrupts(vdev); + + /* + * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master. + * Also put INTx Disable in known state. + */ + cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); + cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | + PCI_COMMAND_INTX_DISABLE); + vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); + + if (vdev->reset_works) { + if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) { + error_report("vfio: Error unable to reset physical device " + "(%04x:%02x:%02x.%x): %m\n", vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + } } - if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) { - error_report("vfio: Error unable to reset physical device " - "(%04x:%02x:%02x.%x): %m\n", vdev->host.domain, - vdev->host.bus, vdev->host.slot, vdev->host.function); - } + vfio_enable_intx(vdev); } static Property vfio_pci_dev_properties[] = { From ce59af2dba6fde2e3990b831f4c9a83641f5609e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 8 Oct 2012 08:45:30 -0600 Subject: [PATCH 11/13] vfio-pci: Cleanup on INTx setup failure Missing some unwind code. Signed-off-by: Alex Williamson --- hw/vfio_pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index b2383c4b55..2325272c44 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -285,6 +285,8 @@ static int vfio_enable_intx(VFIODevice *vdev) if (ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd)) { error_report("vfio: Error: Failed to setup INTx fd: %m\n"); + qemu_set_fd_handler(irq_set_fd.fd, NULL, NULL, vdev); + event_notifier_cleanup(&vdev->intx.interrupt); return -errno; } From 1a40313381262ebb5f30fb95d5550b674280f396 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 8 Oct 2012 08:45:31 -0600 Subject: [PATCH 12/13] vfio-pci: Clang cleanup Blue Swirl reports that Clang doesn't like the structure we define to avoid dynamic allocation for a number of calls to VFIO_DEVICE_SET_IRQS. Adding an element after a variable sized type is a GNU extension. Switch back to dynamic allocation, which really isn't a problem since this is only done on interrupt setup changes. Cc: Blue Swirl Signed-off-by: Alex Williamson --- hw/vfio_pci.c | 101 +++++++++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 43 deletions(-) diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index 2325272c44..85b16bbd23 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -248,24 +248,12 @@ static void vfio_eoi(VFIODevice *vdev) vfio_unmask_intx(vdev); } -typedef struct QEMU_PACKED VFIOIRQSetFD { - struct vfio_irq_set irq_set; - int32_t fd; -} VFIOIRQSetFD; - static int vfio_enable_intx(VFIODevice *vdev) { - VFIOIRQSetFD irq_set_fd = { - .irq_set = { - .argsz = sizeof(irq_set_fd), - .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER, - .index = VFIO_PCI_INTX_IRQ_INDEX, - .start = 0, - .count = 1, - }, - }; uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1); - int ret; + int ret, argsz; + struct vfio_irq_set *irq_set; + int32_t *pfd; if (!pin) { return 0; @@ -280,12 +268,24 @@ static int vfio_enable_intx(VFIODevice *vdev) return ret; } - irq_set_fd.fd = event_notifier_get_fd(&vdev->intx.interrupt); - qemu_set_fd_handler(irq_set_fd.fd, vfio_intx_interrupt, NULL, vdev); + argsz = sizeof(*irq_set) + sizeof(*pfd); - if (ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd)) { + irq_set = g_malloc0(argsz); + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + irq_set->count = 1; + pfd = (int32_t *)&irq_set->data; + + *pfd = event_notifier_get_fd(&vdev->intx.interrupt); + qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev); + + ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set); + g_free(irq_set); + if (ret) { error_report("vfio: Error: Failed to setup INTx fd: %m\n"); - qemu_set_fd_handler(irq_set_fd.fd, NULL, NULL, vdev); + qemu_set_fd_handler(*pfd, NULL, NULL, vdev); event_notifier_cleanup(&vdev->intx.interrupt); return -errno; } @@ -426,18 +426,25 @@ static int vfio_msix_vector_use(PCIDevice *pdev, error_report("vfio: failed to enable vectors, %d\n", ret); } } else { - VFIOIRQSetFD irq_set_fd = { - .irq_set = { - .argsz = sizeof(irq_set_fd), - .flags = VFIO_IRQ_SET_DATA_EVENTFD | - VFIO_IRQ_SET_ACTION_TRIGGER, - .index = VFIO_PCI_MSIX_IRQ_INDEX, - .start = nr, - .count = 1, - }, - .fd = event_notifier_get_fd(&vector->interrupt), - }; - ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd); + int argsz; + struct vfio_irq_set *irq_set; + int32_t *pfd; + + argsz = sizeof(*irq_set) + sizeof(*pfd); + + irq_set = g_malloc0(argsz); + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | + VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = nr; + irq_set->count = 1; + pfd = (int32_t *)&irq_set->data; + + *pfd = event_notifier_get_fd(&vector->interrupt); + + ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set); + g_free(irq_set); if (ret) { error_report("vfio: failed to modify vector, %d\n", ret); } @@ -450,17 +457,9 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) { VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); VFIOMSIVector *vector = &vdev->msi_vectors[nr]; - VFIOIRQSetFD irq_set_fd = { - .irq_set = { - .argsz = sizeof(irq_set_fd), - .flags = VFIO_IRQ_SET_DATA_EVENTFD | - VFIO_IRQ_SET_ACTION_TRIGGER, - .index = VFIO_PCI_MSIX_IRQ_INDEX, - .start = nr, - .count = 1, - }, - .fd = -1, - }; + int argsz; + struct vfio_irq_set *irq_set; + int32_t *pfd; DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__, vdev->host.domain, vdev->host.bus, vdev->host.slot, @@ -472,7 +471,23 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) * bouncing through userspace and let msix.c drop it? Not sure. */ msix_vector_unuse(pdev, nr); - ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd); + + argsz = sizeof(*irq_set) + sizeof(*pfd); + + irq_set = g_malloc0(argsz); + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | + VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = nr; + irq_set->count = 1; + pfd = (int32_t *)&irq_set->data; + + *pfd = -1; + + ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set); + + g_free(irq_set); if (vector->virq < 0) { qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), From 3a4f2816fac1b0f9cc197bb2208ddf03dc7bc592 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 8 Oct 2012 08:45:31 -0600 Subject: [PATCH 13/13] vfio-pci: Fix BAR->VFIODevice translation in DO_UPCAST is supposed to translate from the first member of a struct to that struct, not from arbitrary ones. And it (usually) breaks the build when neglecting this rule. Use container_of to fix the build breakage and likely also the runtime behavior. Signed-off-by: Jan Kiszka aw: runtime behavior is actually the same, but clearly misuse of DO_UPCAST Signed-off-by: Alex Williamson --- hw/vfio_pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index 85b16bbd23..639371e7a2 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -721,7 +721,7 @@ static void vfio_bar_write(void *opaque, target_phys_addr_t addr, * which access will service the interrupt, so we're potentially * getting quite a few host interrupts per guest interrupt. */ - vfio_eoi(DO_UPCAST(VFIODevice, bars[bar->nr], bar)); + vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr])); } static uint64_t vfio_bar_read(void *opaque, @@ -761,7 +761,7 @@ static uint64_t vfio_bar_read(void *opaque, __func__, bar->nr, addr, size, data); /* Same as write above */ - vfio_eoi(DO_UPCAST(VFIODevice, bars[bar->nr], bar)); + vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr])); return data; }