From 7d489dcdf5fd71b5052ffd401b869a627e1c751f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 6 Jul 2015 12:15:12 -0600 Subject: [PATCH 01/11] vfio: fix return type of pread size_t is an unsigned type, thus the error case is never reached in the below call to pread. If bytes is negative, it will be seen as a very high positive value. Spotted by Coverity. Signed-off-by: Paolo Bonzini Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index e0e339a534..b8fa4ac509 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -939,7 +939,7 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) }; uint64_t size; off_t off = 0; - size_t bytes; + ssize_t bytes; if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, ®_info)) { error_report("vfio: Error getting ROM info: %m"); From 4e51361d79289aee2985dfed472f8d87bd53a8df Mon Sep 17 00:00:00 2001 From: Peter Crosthwaite Date: Mon, 6 Jul 2015 12:15:12 -0600 Subject: [PATCH 02/11] cpu-all: complete "real" host page size API Currently the "host" page size alignment API is really aligning to both host and target page sizes. There is the qemu_real_page_size which can be used for the actual host page size but it's missing a mask and ALIGN macro as provided for qemu_page_size. Complete the API. This allows system level code that cares about the host page size to use a consistent alignment interface without having to un-needingly align to the target page size. This also reduces system level code dependency on the cpu specific TARGET_PAGE_SIZE. Signed-off-by: Peter Crosthwaite Tested-by: Alexey Kardashevskiy Acked-by: Paolo Bonzini Signed-off-by: Alex Williamson --- include/exec/cpu-all.h | 3 +++ translate-all.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h index 8999634981..ea6a9a667c 100644 --- a/include/exec/cpu-all.h +++ b/include/exec/cpu-all.h @@ -183,10 +183,13 @@ extern unsigned long reserved_va; /* ??? These should be the larger of uintptr_t and target_ulong. */ extern uintptr_t qemu_real_host_page_size; +extern uintptr_t qemu_real_host_page_mask; extern uintptr_t qemu_host_page_size; extern uintptr_t qemu_host_page_mask; #define HOST_PAGE_ALIGN(addr) (((addr) + qemu_host_page_size - 1) & qemu_host_page_mask) +#define REAL_HOST_PAGE_ALIGN(addr) (((addr) + qemu_real_host_page_size - 1) & \ + qemu_real_host_page_mask) /* same as PROT_xxx */ #define PAGE_READ 0x0001 diff --git a/translate-all.c b/translate-all.c index 412bc9005f..50d53fdac0 100644 --- a/translate-all.c +++ b/translate-all.c @@ -118,6 +118,7 @@ typedef struct PageDesc { #define V_L1_SHIFT (L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS - V_L1_BITS) uintptr_t qemu_real_host_page_size; +uintptr_t qemu_real_host_page_mask; uintptr_t qemu_host_page_size; uintptr_t qemu_host_page_mask; @@ -307,6 +308,7 @@ void page_size_init(void) /* NOTE: we can always suppose that qemu_host_page_size >= TARGET_PAGE_SIZE */ qemu_real_host_page_size = getpagesize(); + qemu_real_host_page_mask = ~(qemu_real_host_page_size - 1); if (qemu_host_page_size == 0) { qemu_host_page_size = qemu_real_host_page_size; } From f7ceed190d7dcd907afe4b46b23809aaad09a619 Mon Sep 17 00:00:00 2001 From: Peter Crosthwaite Date: Mon, 6 Jul 2015 12:15:12 -0600 Subject: [PATCH 03/11] vfio: cpu: Use "real" page size API This is system level code, and should only depend on the host page size, not the target page size. Note that HOST_PAGE_SIZE is misleadingly lead and is really aligning to both host and target page size. Hence it's replacement with REAL_HOST_PAGE_SIZE. Signed-off-by: Peter Crosthwaite Tested-by: Alexey Kardashevskiy Acked-by: Paolo Bonzini Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index b8fa4ac509..66d42233ef 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2388,7 +2388,7 @@ static void vfio_map_bar(VFIOPCIDevice *vdev, int nr) * potentially insert a direct-mapped subregion before and after it. */ if (vdev->msix && vdev->msix->table_bar == nr) { - size = vdev->msix->table_offset & qemu_host_page_mask; + size = vdev->msix->table_offset & qemu_real_host_page_mask; } strncat(name, " mmap", sizeof(name) - strlen(name) - 1); @@ -2401,8 +2401,9 @@ static void vfio_map_bar(VFIOPCIDevice *vdev, int nr) if (vdev->msix && vdev->msix->table_bar == nr) { uint64_t start; - start = HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset + - (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE)); + start = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset + + (vdev->msix->entries * + PCI_MSIX_ENTRY_SIZE)); size = start < bar->region.size ? bar->region.size - start : 0; strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1); From 1c9b71a7311ed99635a2d007fc8a856879537a05 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 6 Jul 2015 12:15:13 -0600 Subject: [PATCH 04/11] kvm: rename kvm_irqchip_[add,remove]_irqfd_notifier with gsi suffix Anticipating for the introduction of new add/remove functions taking a qemu_irq parameter, let's rename existing ones with a gsi suffix. Signed-off-by: Eric Auger Tested-by: Vikram Sethi Acked-by: Paolo Bonzini Reviewed-by: Peter Maydell Signed-off-by: Alex Williamson --- hw/s390x/virtio-ccw.c | 8 ++++---- hw/vfio/pci.c | 6 +++--- hw/virtio/virtio-pci.c | 4 ++-- include/sysemu/kvm.h | 7 ++++--- kvm-all.c | 7 ++++--- kvm-stub.c | 7 ++++--- 6 files changed, 21 insertions(+), 18 deletions(-) diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c index d631337e11..e345a6e9b8 100644 --- a/hw/s390x/virtio-ccw.c +++ b/hw/s390x/virtio-ccw.c @@ -1316,8 +1316,8 @@ static int virtio_ccw_add_irqfd(VirtioCcwDevice *dev, int n) VirtQueue *vq = virtio_get_queue(vdev, n); EventNotifier *notifier = virtio_queue_get_guest_notifier(vq); - return kvm_irqchip_add_irqfd_notifier(kvm_state, notifier, NULL, - dev->routes.gsi[n]); + return kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, notifier, NULL, + dev->routes.gsi[n]); } static void virtio_ccw_remove_irqfd(VirtioCcwDevice *dev, int n) @@ -1327,8 +1327,8 @@ static void virtio_ccw_remove_irqfd(VirtioCcwDevice *dev, int n) EventNotifier *notifier = virtio_queue_get_guest_notifier(vq); int ret; - ret = kvm_irqchip_remove_irqfd_notifier(kvm_state, notifier, - dev->routes.gsi[n]); + ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, notifier, + dev->routes.gsi[n]); assert(ret == 0); } diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 66d42233ef..27b7ec133e 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -597,7 +597,7 @@ static void vfio_add_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage *msg, return; } - if (kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->kvm_interrupt, + if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, NULL, virq) < 0) { kvm_irqchip_release_virq(kvm_state, virq); event_notifier_cleanup(&vector->kvm_interrupt); @@ -609,8 +609,8 @@ static void vfio_add_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage *msg, static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) { - kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->kvm_interrupt, - vector->virq); + kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, + vector->virq); kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; event_notifier_cleanup(&vector->kvm_interrupt); diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 6a0174e9cc..7a89081e4f 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -495,7 +495,7 @@ static int kvm_virtio_pci_irqfd_use(VirtIOPCIProxy *proxy, VirtQueue *vq = virtio_get_queue(vdev, queue_no); EventNotifier *n = virtio_queue_get_guest_notifier(vq); int ret; - ret = kvm_irqchip_add_irqfd_notifier(kvm_state, n, NULL, irqfd->virq); + ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, irqfd->virq); return ret; } @@ -509,7 +509,7 @@ static void kvm_virtio_pci_irqfd_release(VirtIOPCIProxy *proxy, VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector]; int ret; - ret = kvm_irqchip_remove_irqfd_notifier(kvm_state, n, irqfd->virq); + ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n, irqfd->virq); assert(ret == 0); } diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index f459fbdbd4..acb3025f8c 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -416,9 +416,10 @@ void kvm_irqchip_release_virq(KVMState *s, int virq); int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter); -int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, - EventNotifier *rn, int virq); -int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq); +int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + EventNotifier *rn, int virq); +int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + int virq); void kvm_pc_gsi_handler(void *opaque, int n, int level); void kvm_pc_setup_irq_routing(bool pci_enabled); void kvm_init_irq_routing(KVMState *s); diff --git a/kvm-all.c b/kvm-all.c index df57da0bf2..de73b35e03 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1328,14 +1328,15 @@ int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) } #endif /* !KVM_CAP_IRQ_ROUTING */ -int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, - EventNotifier *rn, int virq) +int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + EventNotifier *rn, int virq) { return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), rn ? event_notifier_get_fd(rn) : -1, virq, true); } -int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq) +int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + int virq) { return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq, false); diff --git a/kvm-stub.c b/kvm-stub.c index 7ba90c546f..977de21f9f 100644 --- a/kvm-stub.c +++ b/kvm-stub.c @@ -137,13 +137,14 @@ int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) return -ENOSYS; } -int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, - EventNotifier *rn, int virq) +int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + EventNotifier *rn, int virq) { return -ENOSYS; } -int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq) +int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + int virq) { return -ENOSYS; } From 197e35249a7360534e1aea0f2268ad0e1aa27121 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 6 Jul 2015 12:15:13 -0600 Subject: [PATCH 05/11] kvm-all.c: add qemu_irq/gsi hash table and utility routines VFIO platform device needs to setup irqfd but it does not know the gsi corresponding to the device qemu_irq. This patch proposes to store a hash table in kvm_state using the qemu_irq as key and the gsi as a value. kvm_irqchip_set_qemuirq_gsi allows to insert such a pair. The interrupt controller is supposed to use it. kvm_irqchip_[add, remove]_irqfd_notifier allows to setup/tear down irqfd directly from the qemu_irq. Signed-off-by: Eric Auger Tested-by: Vikram Sethi Acked-by: Paolo Bonzini Reviewed-by: Peter Maydell Signed-off-by: Alex Williamson --- include/sysemu/kvm.h | 6 ++++++ kvm-all.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index acb3025f8c..ba612fc76f 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -19,6 +19,7 @@ #include "qemu/queue.h" #include "qom/cpu.h" #include "exec/memattrs.h" +#include "hw/irq.h" #ifdef CONFIG_KVM #include @@ -420,6 +421,11 @@ int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, EventNotifier *rn, int virq); int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, int virq); +int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, + EventNotifier *rn, qemu_irq irq); +int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, + qemu_irq irq); +void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi); void kvm_pc_gsi_handler(void *opaque, int n, int level); void kvm_pc_setup_irq_routing(bool pci_enabled); void kvm_init_irq_routing(KVMState *s); diff --git a/kvm-all.c b/kvm-all.c index de73b35e03..cb8318707a 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -37,6 +37,7 @@ #include "exec/address-spaces.h" #include "qemu/event_notifier.h" #include "trace.h" +#include "hw/irq.h" #include "hw/boards.h" @@ -98,6 +99,7 @@ struct KVMState * unsigned, and treating them as signed here can break things */ unsigned irq_set_ioctl; unsigned int sigmask_len; + GHashTable *gsimap; #ifdef KVM_CAP_IRQ_ROUTING struct kvm_irq_routing *irq_routes; int nr_allocated_irq_routes; @@ -1342,6 +1344,35 @@ int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, false); } +int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, + EventNotifier *rn, qemu_irq irq) +{ + gpointer key, gsi; + gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); + + if (!found) { + return -ENXIO; + } + return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi)); +} + +int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, + qemu_irq irq) +{ + gpointer key, gsi; + gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); + + if (!found) { + return -ENXIO; + } + return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi)); +} + +void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi) +{ + g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi)); +} + static int kvm_irqchip_create(MachineState *machine, KVMState *s) { int ret; @@ -1374,6 +1405,8 @@ static int kvm_irqchip_create(MachineState *machine, KVMState *s) kvm_init_irq_routing(s); + s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal); + return 0; } From 6a1a9cfa1c4a3e5b521d82e6adb94311fc5b9f8b Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 6 Jul 2015 12:15:13 -0600 Subject: [PATCH 06/11] intc: arm_gic_kvm: set the qemu_irq/gsi mapping The arm_gic_kvm now calls kvm_irqchip_set_qemuirq_gsi to build the hash table storing qemu_irq/gsi mappings. From that point on irqfd can be setup directly from the qemu_irq using kvm_irqchip_add_irqfd_notifier. Signed-off-by: Eric Auger Tested-by: Vikram Sethi Acked-by: Paolo Bonzini Reviewed-by: Peter Maydell Signed-off-by: Alex Williamson --- hw/intc/arm_gic_kvm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/intc/arm_gic_kvm.c b/hw/intc/arm_gic_kvm.c index 2cb7d255d2..f56bff1afb 100644 --- a/hw/intc/arm_gic_kvm.c +++ b/hw/intc/arm_gic_kvm.c @@ -570,6 +570,12 @@ static void kvm_arm_gic_realize(DeviceState *dev, Error **errp) */ i += (GIC_INTERNAL * s->num_cpu); qdev_init_gpio_in(dev, kvm_arm_gic_set_irq, i); + + for (i = 0; i < s->num_irq - GIC_INTERNAL; i++) { + qemu_irq irq = qdev_get_gpio_in(dev, i); + kvm_irqchip_set_qemuirq_gsi(kvm_state, irq, i); + } + /* We never use our outbound IRQ/FIQ lines but provide them so that * we maintain the same interface as the non-KVM GIC. */ From 715ca691daca081108b33306faa6fa102f0df8d8 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 6 Jul 2015 12:15:14 -0600 Subject: [PATCH 07/11] sysbus: add irq_routing_notifier Add a new connect_irq_notifier notifier in the SysBusDeviceClass. This notifier, if populated, is called after sysbus_connect_irq. This mechanism is used to setup VFIO signaling once VFIO platform devices get attached to their platform bus, on a machine init done notifier. Signed-off-by: Eric Auger Reviewed-by: Peter Crosthwaite Tested-by: Vikram Sethi Reviewed-by: Peter Maydell Signed-off-by: Alex Williamson --- hw/core/sysbus.c | 6 ++++++ include/hw/sysbus.h | 1 + 2 files changed, 7 insertions(+) diff --git a/hw/core/sysbus.c b/hw/core/sysbus.c index 278a2d1bdd..3c58629894 100644 --- a/hw/core/sysbus.c +++ b/hw/core/sysbus.c @@ -109,7 +109,13 @@ qemu_irq sysbus_get_connected_irq(SysBusDevice *dev, int n) void sysbus_connect_irq(SysBusDevice *dev, int n, qemu_irq irq) { + SysBusDeviceClass *sbd = SYS_BUS_DEVICE_GET_CLASS(dev); + qdev_connect_gpio_out_named(DEVICE(dev), SYSBUS_DEVICE_GPIO_IRQ, n, irq); + + if (sbd->connect_irq_notifier) { + sbd->connect_irq_notifier(dev, irq); + } } /* Check whether an MMIO region exists */ diff --git a/include/hw/sysbus.h b/include/hw/sysbus.h index 34f93c39bf..cc1dba49bf 100644 --- a/include/hw/sysbus.h +++ b/include/hw/sysbus.h @@ -58,6 +58,7 @@ typedef struct SysBusDeviceClass { * omitted then. (This is not considered a fatal error.) */ char *(*explicit_ofw_unit_address)(const SysBusDevice *dev); + void (*connect_irq_notifier)(SysBusDevice *dev, qemu_irq irq); } SysBusDeviceClass; struct SysBusDevice { From 879904e8635b316de18393222f02d46d2d1f7f4e Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 6 Jul 2015 12:15:14 -0600 Subject: [PATCH 08/11] kvm: some fixes to kvm_resamplefds_allowed Commit f41389ae3c54b introduced kvm_resamplefds_enabled() and associated kvm_resamplefds_allowed boolean. This patch adds non-KVM version for kvm_resamplefds_enabled and also declares kvm_resamplefds_allowed in kvm-stub as it is done for fellow kvm_irqfds_allowed. Signed-off-by: Eric Auger Reviewed-by: Peter Maydell Signed-off-by: Alex Williamson --- include/sysemu/kvm.h | 1 + kvm-stub.c | 1 + 2 files changed, 2 insertions(+) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index ba612fc76f..983e99e1e7 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -152,6 +152,7 @@ extern bool kvm_readonly_mem_allowed; #define kvm_halt_in_kernel() (false) #define kvm_eventfds_enabled() (false) #define kvm_irqfds_enabled() (false) +#define kvm_resamplefds_enabled() (false) #define kvm_msi_via_irqfd_enabled() (false) #define kvm_gsi_routing_allowed() (false) #define kvm_gsi_direct_mapping() (false) diff --git a/kvm-stub.c b/kvm-stub.c index 977de21f9f..d9ad624eee 100644 --- a/kvm-stub.c +++ b/kvm-stub.c @@ -24,6 +24,7 @@ bool kvm_kernel_irqchip; bool kvm_async_interrupts_allowed; bool kvm_eventfds_allowed; bool kvm_irqfds_allowed; +bool kvm_resamplefds_allowed; bool kvm_msi_via_irqfd_allowed; bool kvm_gsi_routing_allowed; bool kvm_gsi_direct_mapping; From fb5f816499a5184a1336d72db030b8419b523082 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 6 Jul 2015 12:15:14 -0600 Subject: [PATCH 09/11] hw/vfio/platform: add irqfd support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch aims at optimizing IRQ handling using irqfd framework. Instead of handling the eventfds on user-side they are handled on kernel side using - the KVM irqfd framework, - the VFIO driver virqfd framework. the virtual IRQ completion is trapped at interrupt controller This removes the need for fast/slow path swap. Overall this brings significant performance improvements. Signed-off-by: Alvise Rigo Signed-off-by: Eric Auger Reviewed-by: Alex BennĂ©e Tested-by: Vikram Sethi Acked-by: Peter Maydell Signed-off-by: Alex Williamson --- hw/vfio/platform.c | 100 ++++++++++++++++++++++++++++++++ include/hw/vfio/vfio-platform.h | 2 + trace-events | 1 + 3 files changed, 103 insertions(+) diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 5c678b914e..60365d1279 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -26,6 +26,7 @@ #include "hw/sysbus.h" #include "trace.h" #include "hw/platform-bus.h" +#include "sysemu/kvm.h" /* * Functions used whatever the injection method @@ -51,6 +52,7 @@ static VFIOINTp *vfio_init_intp(VFIODevice *vbasedev, intp->pin = info.index; intp->flags = info.flags; intp->state = VFIO_IRQ_INACTIVE; + intp->kvm_accel = false; sysbus_init_irq(sbdev, &intp->qemuirq); @@ -61,6 +63,13 @@ static VFIOINTp *vfio_init_intp(VFIODevice *vbasedev, error_report("vfio: Error: trigger event_notifier_init failed "); return NULL; } + /* Get an eventfd for resample/unmask */ + ret = event_notifier_init(&intp->unmask, 0); + if (ret) { + g_free(intp); + error_report("vfio: Error: resamplefd event_notifier_init failed"); + return NULL; + } QLIST_INSERT_HEAD(&vdev->intp_list, intp, next); return intp; @@ -315,6 +324,94 @@ static int vfio_start_eventfd_injection(VFIOINTp *intp) return ret; } +/* + * Functions used for irqfd + */ + +/** + * vfio_set_resample_eventfd - sets the resamplefd for an IRQ + * @intp: the IRQ struct handle + * programs the VFIO driver to unmask this IRQ when the + * intp->unmask eventfd is triggered + */ +static int vfio_set_resample_eventfd(VFIOINTp *intp) +{ + VFIODevice *vbasedev = &intp->vdev->vbasedev; + struct vfio_irq_set *irq_set; + int argsz, ret; + int32_t *pfd; + + argsz = sizeof(*irq_set) + sizeof(*pfd); + irq_set = g_malloc0(argsz); + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK; + irq_set->index = intp->pin; + irq_set->start = 0; + irq_set->count = 1; + pfd = (int32_t *)&irq_set->data; + *pfd = event_notifier_get_fd(&intp->unmask); + qemu_set_fd_handler(*pfd, NULL, NULL, NULL); + ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set); + g_free(irq_set); + if (ret < 0) { + error_report("vfio: Failed to set resample eventfd: %m"); + } + return ret; +} + +static void vfio_start_irqfd_injection(SysBusDevice *sbdev, qemu_irq irq) +{ + VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(sbdev); + VFIOINTp *intp; + + if (!kvm_irqfds_enabled() || !kvm_resamplefds_enabled() || + !vdev->irqfd_allowed) { + return; + } + + QLIST_FOREACH(intp, &vdev->intp_list, next) { + if (intp->qemuirq == irq) { + break; + } + } + assert(intp); + + /* Get to a known interrupt state */ + qemu_set_fd_handler(event_notifier_get_fd(&intp->interrupt), + NULL, NULL, vdev); + + vfio_mask_single_irqindex(&vdev->vbasedev, intp->pin); + qemu_set_irq(intp->qemuirq, 0); + + if (kvm_irqchip_add_irqfd_notifier(kvm_state, &intp->interrupt, + &intp->unmask, irq) < 0) { + goto fail_irqfd; + } + + if (vfio_set_trigger_eventfd(intp, NULL) < 0) { + goto fail_vfio; + } + if (vfio_set_resample_eventfd(intp) < 0) { + goto fail_vfio; + } + + /* Let's resume injection with irqfd setup */ + vfio_unmask_single_irqindex(&vdev->vbasedev, intp->pin); + + intp->kvm_accel = true; + + trace_vfio_platform_start_irqfd_injection(intp->pin, + event_notifier_get_fd(&intp->interrupt), + event_notifier_get_fd(&intp->unmask)); + return; +fail_vfio: + kvm_irqchip_remove_irqfd_notifier(kvm_state, &intp->interrupt, irq); +fail_irqfd: + vfio_start_eventfd_injection(intp); + vfio_unmask_single_irqindex(&vdev->vbasedev, intp->pin); + return; +} + /* VFIO skeleton */ static void vfio_platform_compute_needs_reset(VFIODevice *vbasedev) @@ -584,17 +681,20 @@ static Property vfio_platform_dev_properties[] = { DEFINE_PROP_BOOL("x-mmap", VFIOPlatformDevice, vbasedev.allow_mmap, true), DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice, mmap_timeout, 1100), + DEFINE_PROP_BOOL("x-irqfd", VFIOPlatformDevice, irqfd_allowed, true), DEFINE_PROP_END_OF_LIST(), }; static void vfio_platform_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); + SysBusDeviceClass *sbc = SYS_BUS_DEVICE_CLASS(klass); dc->realize = vfio_platform_realize; dc->props = vfio_platform_dev_properties; dc->vmsd = &vfio_platform_vmstate; dc->desc = "VFIO-based platform device assignment"; + sbc->connect_irq_notifier = vfio_start_irqfd_injection; set_bit(DEVICE_CATEGORY_MISC, dc->categories); } diff --git a/include/hw/vfio/vfio-platform.h b/include/hw/vfio/vfio-platform.h index 26b2ad6f4e..c5cf1d79f3 100644 --- a/include/hw/vfio/vfio-platform.h +++ b/include/hw/vfio/vfio-platform.h @@ -41,6 +41,7 @@ typedef struct VFIOINTp { int state; /* inactive, pending, active */ uint8_t pin; /* index */ uint32_t flags; /* IRQ info flags */ + bool kvm_accel; /* set when QEMU bypass through KVM enabled */ } VFIOINTp; /* function type for user side eventfd handler */ @@ -57,6 +58,7 @@ typedef struct VFIOPlatformDevice { uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */ QEMUTimer *mmap_timer; /* allows fast-path resume after IRQ hit */ QemuMutex intp_mutex; /* protect the intp_list IRQ state */ + bool irqfd_allowed; /* debug option to force irqfd on/off */ } VFIOPlatformDevice; typedef struct VFIOPlatformDeviceClass { diff --git a/trace-events b/trace-events index 52b7efa9a4..d24d80aed2 100644 --- a/trace-events +++ b/trace-events @@ -1594,6 +1594,7 @@ vfio_platform_intp_interrupt(int pin, int fd) "Inject IRQ #%d (fd = %d)" vfio_platform_intp_inject_pending_lockheld(int pin, int fd) "Inject pending IRQ #%d (fd = %d)" vfio_platform_populate_interrupts(int pin, int count, int flags) "- IRQ index %d: count %d, flags=0x%x" vfio_intp_interrupt_set_pending(int index) "irq %d is set PENDING" +vfio_platform_start_irqfd_injection(int index, int fd, int resamplefd) "IRQ index=%d, fd = %d, resamplefd = %d" #hw/acpi/memory_hotplug.c mhp_acpi_invalid_slot_selected(uint32_t slot) "0x%"PRIx32 From f8d8a944009b7e836c718a05590ea6b36146978f Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 6 Jul 2015 12:15:15 -0600 Subject: [PATCH 10/11] vfio: Unregister IOMMU notifiers when container is destroyed On systems with guest visible IOMMU, adding a new memory region onto PCI bus calls vfio_listener_region_add() for every DMA window. This installs a notifier for IOMMU memory regions. The notifier is supposed to be removed vfio_listener_region_del(), however in the case of mixed PHB (emulated + VFIO devices) when last VFIO device is unplugged and container gets destroyed, all existing DMA windows stay alive altogether with the notifiers which are on the linked list which head was in the destroyed container. This unregisters IOMMU memory region notifier when a container is destroyed. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Alex Williamson --- hw/vfio/common.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index b1045da857..85ee9b005e 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -772,11 +772,19 @@ static void vfio_disconnect_container(VFIOGroup *group) if (QLIST_EMPTY(&container->group_list)) { VFIOAddressSpace *space = container->space; + VFIOGuestIOMMU *giommu, *tmp; if (container->iommu_data.release) { container->iommu_data.release(container); } QLIST_REMOVE(container, next); + + QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) { + memory_region_unregister_iommu_notifier(&giommu->n); + QLIST_REMOVE(giommu, giommu_next); + g_free(giommu); + } + trace_vfio_disconnect_container(container->fd); close(container->fd); g_free(container); From 43302969966bc3a95470bfc300289a83068ef5d9 Mon Sep 17 00:00:00 2001 From: Gabriel Laupre Date: Mon, 6 Jul 2015 12:15:15 -0600 Subject: [PATCH 11/11] vfio/pci : Add pba_offset PCI quirk for Chelsio T5 devices Fix pba_offset initialization value for Chelsio T5 Virtual Function device. The T5 hardware has a bug in it where it reports a Pending Interrupt Bit Array Offset of 0x8000 for its SR-IOV Virtual Functions instead of the 0x1000 that the hardware actually uses internally. As the hardware doesn't return the correct pba_offset value, add a quirk to instead return a hardcoded value of 0x1000 when a Chelsio T5 VF device is detected. This bug has been fixed in the Chelsio's next chip series T6 but there are no plans to respin the T5 ASIC for this bug. It is just documented in the T5 Errata and left it at that. Signed-off-by: Gabriel Laupre Reviewed-by: Bandan Das Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 27 +++++++++++++++++++++++++++ include/hw/pci/pci_ids.h | 2 ++ 2 files changed, 29 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 27b7ec133e..2ed877fe9f 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2252,6 +2252,33 @@ static int vfio_early_setup_msix(VFIOPCIDevice *vdev) vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; + /* + * Test the size of the pba_offset variable and catch if it extends outside + * of the specified BAR. If it is the case, we need to apply a hardware + * specific quirk if the device is known or we have a broken configuration. + */ + if (vdev->msix->pba_offset >= + vdev->bars[vdev->msix->pba_bar].region.size) { + + PCIDevice *pdev = &vdev->pdev; + uint16_t vendor = pci_get_word(pdev->config + PCI_VENDOR_ID); + uint16_t device = pci_get_word(pdev->config + PCI_DEVICE_ID); + + /* + * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5 + * adapters. The T5 hardware returns an incorrect value of 0x8000 for + * the VF PBA offset while the BAR itself is only 8k. The correct value + * is 0x1000, so we hard code that here. + */ + if (vendor == PCI_VENDOR_ID_CHELSIO && (device & 0xff00) == 0x5800) { + vdev->msix->pba_offset = 0x1000; + } else { + error_report("vfio: Hardware reports invalid configuration, " + "MSIX PBA outside of specified BAR"); + return -EINVAL; + } + } + trace_vfio_early_setup_msix(vdev->vbasedev.name, pos, vdev->msix->table_bar, vdev->msix->table_offset, diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h index 49c062b8ce..d98e6c915d 100644 --- a/include/hw/pci/pci_ids.h +++ b/include/hw/pci/pci_ids.h @@ -114,6 +114,8 @@ #define PCI_VENDOR_ID_ENSONIQ 0x1274 #define PCI_DEVICE_ID_ENSONIQ_ES1370 0x5000 +#define PCI_VENDOR_ID_CHELSIO 0x1425 + #define PCI_VENDOR_ID_FREESCALE 0x1957 #define PCI_DEVICE_ID_MPC8533E 0x0030