From 7abba7c638ab809e626f379617cb8590a733eabf Mon Sep 17 00:00:00 2001 From: Cindy Lu Date: Tue, 9 Nov 2021 10:37:44 +0800 Subject: [PATCH 1/7] virtio-mmio : fix the crash in the vm shutdown The root cause for this crash is the ioeventfd not stopped while the VM stop. The callback for vmstate_change was not implement in virtio-mmio bus Reproduce step load the vm with -M microvm \ -netdev tap,id=net0,vhostforce,script=no,downscript=no \ -device virtio-net-device,netdev=net0\ After the VM boot, login the vm and then shutdown the vm System will crash [Current thread is 1 (Thread 0x7ffff6edde00 (LWP 374378))] (gdb) bt 0 0x00005555558f18b4 in qemu_flush_or_purge_queued_packets (purge=false, nc=0x55500252e850) at ../net/net.c:636 1 qemu_flush_queued_packets (nc=0x55500252e850) at ../net/net.c:656 2 0x0000555555b6c363 in virtio_queue_notify_vq (vq=0x7fffe7e2b010) at ../hw/virtio/virtio.c:2339 3 virtio_queue_host_notifier_read (n=0x7fffe7e2b08c) at ../hw/virtio/virtio.c:3583 4 0x0000555555de7b5a in aio_dispatch_handler (ctx=ctx@entry=0x5555567c5780, node=0x555556b83fd0) at ../util/aio-posix.c:329 5 0x0000555555de8454 in aio_dispatch_ready_handlers (ready_list=, ctx=) at ../util/aio-posix.c:359 6 aio_poll (ctx=0x5555567c5780, blocking=blocking@entry=false) at ../util/aio-posix.c:662 7 0x0000555555cce0cc in monitor_cleanup () at ../monitor/monitor.c:645 8 0x0000555555b06bd2 in qemu_cleanup () at ../softmmu/runstate.c:822 9 0x000055555586e693 in main (argc=, argv=, envp=) at ../softmmu/main.c:51 Signed-off-by: Cindy Lu Message-Id: <20211109023744.22387-1-lulu@redhat.com> Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-mmio.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c index 7b3ebca178..72da12fea5 100644 --- a/hw/virtio/virtio-mmio.c +++ b/hw/virtio/virtio-mmio.c @@ -817,6 +817,17 @@ static char *virtio_mmio_bus_get_dev_path(DeviceState *dev) return path; } +static void virtio_mmio_vmstate_change(DeviceState *d, bool running) +{ + VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d); + + if (running) { + virtio_mmio_start_ioeventfd(proxy); + } else { + virtio_mmio_stop_ioeventfd(proxy); + } +} + static void virtio_mmio_bus_class_init(ObjectClass *klass, void *data) { BusClass *bus_class = BUS_CLASS(klass); @@ -832,6 +843,7 @@ static void virtio_mmio_bus_class_init(ObjectClass *klass, void *data) k->ioeventfd_enabled = virtio_mmio_ioeventfd_enabled; k->ioeventfd_assign = virtio_mmio_ioeventfd_assign; k->pre_plugged = virtio_mmio_pre_plugged; + k->vmstate_change = virtio_mmio_vmstate_change; k->has_variable_vring_alignment = true; bus_class->max_dev = 1; bus_class->get_dev_path = virtio_mmio_bus_get_dev_path; From 9323f892b39d133eb69b301484bf7b2f3f49737d Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Thu, 18 Nov 2021 14:32:23 +0100 Subject: [PATCH 2/7] failover: fix unplug pending detection Failover needs to detect the end of the PCI unplug to start migration after the VFIO card has been unplugged. To do that, a flag is set in pcie_cap_slot_unplug_request_cb() and reset in pcie_unplug_device(). But since 17858a169508 ("hw/acpi/ich9: Set ACPI PCI hot-plug as default on Q35") we have switched to ACPI unplug and these functions are not called anymore and the flag not set. So failover migration is not able to detect if card is really unplugged and acts as it's done as soon as it's started. So it doesn't wait the end of the unplug to start the migration. We don't see any problem when we test that because ACPI unplug is faster than PCIe native hotplug and when the migration really starts the unplug operation is already done. See c000a9bd06ea ("pci: mark device having guest unplug request pending") a99c4da9fc2a ("pci: mark devices partially unplugged") Signed-off-by: Laurent Vivier Reviewed-by: Ani Sinha Message-Id: <20211118133225.324937-4-lvivier@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/pcihp.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index f610a25d2e..30405b5113 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -222,9 +222,27 @@ static void acpi_pcihp_eject_slot(AcpiPciHpState *s, unsigned bsel, unsigned slo PCIDevice *dev = PCI_DEVICE(qdev); if (PCI_SLOT(dev->devfn) == slot) { if (!acpi_pcihp_pc_no_hotplug(s, dev)) { - hotplug_ctrl = qdev_get_hotplug_handler(qdev); - hotplug_handler_unplug(hotplug_ctrl, qdev, &error_abort); - object_unparent(OBJECT(qdev)); + /* + * partially_hotplugged is used by virtio-net failover: + * failover has asked the guest OS to unplug the device + * but we need to keep some references to the device + * to be able to plug it back in case of failure so + * we don't execute hotplug_handler_unplug(). + */ + if (dev->partially_hotplugged) { + /* + * pending_deleted_event is set to true when + * virtio-net failover asks to unplug the device, + * and set to false here when the operation is done + * This is used by the migration loop to detect the + * end of the operation and really start the migration. + */ + qdev->pending_deleted_event = false; + } else { + hotplug_ctrl = qdev_get_hotplug_handler(qdev); + hotplug_handler_unplug(hotplug_ctrl, qdev, &error_abort); + object_unparent(OBJECT(qdev)); + } } } } @@ -396,6 +414,12 @@ void acpi_pcihp_device_unplug_request_cb(HotplugHandler *hotplug_dev, return; } + /* + * pending_deleted_event is used by virtio-net failover to detect the + * end of the unplug operation, the flag is set to false in + * acpi_pcihp_eject_slot() when the operation is completed. + */ + pdev->qdev.pending_deleted_event = true; s->acpi_pcihp_pci_status[bsel].down |= (1U << slot); acpi_send_event(DEVICE(hotplug_dev), ACPI_PCI_HOTPLUG_STATUS); } From 846a1e85da646c6006db429648389fc110f92d75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Thu, 25 Nov 2021 11:16:13 +0100 Subject: [PATCH 3/7] vdpa: Add dummy receive callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Qemu falls back on userland handlers even if vhost-user and vhost-vdpa cases. These assumes a tap device can handle the packets. If a vdpa device fail to start, it can trigger a sigsegv because of that. Add dummy receiver that returns no progress so it can keep running. Fixes: 1e0a84ea49 ("vhost-vdpa: introduce vhost-vdpa net client") Signed-off-by: Eugenio PĂ©rez Message-Id: <20211125101614.76927-2-eperezma@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- net/vhost-vdpa.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 2e3c22a8c7..25dd6dd975 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -170,9 +170,17 @@ static bool vhost_vdpa_check_peer_type(NetClientState *nc, ObjectClass *oc, return true; } +/** Dummy receive in case qemu falls back to userland tap networking */ +static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf, + size_t size) +{ + return 0; +} + static NetClientInfo net_vhost_vdpa_info = { .type = NET_CLIENT_DRIVER_VHOST_VDPA, .size = sizeof(VhostVDPAState), + .receive = vhost_vdpa_receive, .cleanup = vhost_vdpa_cleanup, .has_vnet_hdr = vhost_vdpa_has_vnet_hdr, .has_ufo = vhost_vdpa_has_ufo, From 0fe7245d8b938f371556c100b0b6ec1d2b41e584 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 29 Nov 2021 11:08:40 +0800 Subject: [PATCH 4/7] virtio-balloon: process all in sgs for free_page_vq We only process the first in sg which may lead to the bitmap of the pages belongs to following sgs were not cleared. This may result more pages to be migrated. Fixing this by process all in sgs for free_page_vq. Acked-by: David Hildenbrand Signed-off-by: Jason Wang Message-Id: <20211129030841.3611-1-jasowang@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-balloon.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index c6962fcbfe..17de2558cb 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -510,6 +510,7 @@ static bool get_free_page_hints(VirtIOBalloon *dev) VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtQueue *vq = dev->free_page_vq; bool ret = true; + int i; while (dev->block_iothread) { qemu_cond_wait(&dev->free_page_cond, &dev->free_page_lock); @@ -544,8 +545,10 @@ static bool get_free_page_hints(VirtIOBalloon *dev) } if (elem->in_num && dev->free_page_hint_status == FREE_PAGE_HINT_S_START) { - qemu_guest_free_page_hint(elem->in_sg[0].iov_base, - elem->in_sg[0].iov_len); + for (i = 0; i < elem->in_num; i++) { + qemu_guest_free_page_hint(elem->in_sg[i].iov_base, + elem->in_sg[i].iov_len); + } } out: From d3f1f940ebe43403feb1d12e4b5b9236aba50cb9 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 29 Nov 2021 11:08:41 +0800 Subject: [PATCH 5/7] virtio-balloon: correct used length Spec said: "and len the total of bytes written into the buffer." For inflateq, deflateq and statsq, we don't process in_sg so the used length should be zero. For free_page_vq, tough the pages could be changed by the device (in the destination), spec said: "Note: len is particularly useful for drivers using untrusted buffers: if a driver does not know exactly how much has been written by the device, the driver would have to zero the buffer in advance to ensure no data leakage occurs." So 0 should be used as well here. Signed-off-by: Jason Wang Message-Id: <20211129030841.3611-2-jasowang@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: David Hildenbrand --- hw/virtio/virtio-balloon.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 17de2558cb..9a4f491b54 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -231,7 +231,7 @@ static void balloon_stats_poll_cb(void *opaque) return; } - virtqueue_push(s->svq, s->stats_vq_elem, s->stats_vq_offset); + virtqueue_push(s->svq, s->stats_vq_elem, 0); virtio_notify(vdev, s->svq); g_free(s->stats_vq_elem); s->stats_vq_elem = NULL; @@ -438,7 +438,7 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) memory_region_unref(section.mr); } - virtqueue_push(vq, elem, offset); + virtqueue_push(vq, elem, 0); virtio_notify(vdev, vq); g_free(elem); virtio_balloon_pbp_free(&pbp); @@ -552,7 +552,7 @@ static bool get_free_page_hints(VirtIOBalloon *dev) } out: - virtqueue_push(vq, elem, 1); + virtqueue_push(vq, elem, 0); g_free(elem); return ret; } From 0192d6677c383d812fb23f572fda4e449e89d3f1 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 29 Nov 2021 11:36:18 +0800 Subject: [PATCH 6/7] intel-iommu: ignore leaf SNP bit in scalable mode When booting with scalable mode, I hit this error: qemu-system-x86_64: vtd_iova_to_slpte: detected splte reserve non-zero iova=0xfffff002, level=0x1slpte=0x102681803) qemu-system-x86_64: vtd_iommu_translate: detected translation failure (dev=01:00:00, iova=0xfffff002) qemu-system-x86_64: New fault is not recorded due to compression of faults This is because the SNP bit is set for second level page table since Linux kernel commit 6c00612d0cba1 ("iommu/vt-d: Report right snoop capability when using FL for IOVA") even if SC is not supported by the hardware. To unbreak the guest, ignore the leaf SNP bit for scalable mode first. In the future we may consider to add SC support. Signed-off-by: Jason Wang Message-Id: <20211129033618.3857-1-jasowang@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Peter Xu --- hw/i386/intel_iommu.c | 6 ++++++ hw/i386/intel_iommu_internal.h | 2 ++ 2 files changed, 8 insertions(+) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 294499ee20..f584449d8d 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -3629,6 +3629,12 @@ static void vtd_init(IntelIOMMUState *s) vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, x86_iommu->dt_supported); + if (s->scalable_mode) { + vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; + } + if (x86_iommu_ir_supported(x86_iommu)) { s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV; if (s->intr_eim == ON_OFF_AUTO_ON) { diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 3d5487fe2c..a6c788049b 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -388,6 +388,8 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO 0xffff0000ffe0fff8 /* Rsvd field masks for spte */ +#define VTD_SPTE_SNP 0x800ULL + #define VTD_SPTE_PAGE_L1_RSVD_MASK(aw, dt_supported) \ dt_supported ? \ (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \ From bacf58ca18f06f0b464466bf8c19945f19791feb Mon Sep 17 00:00:00 2001 From: Daniella Lee Date: Fri, 26 Nov 2021 14:13:24 +0800 Subject: [PATCH 7/7] Fix bad overflow check in hw/pci/pcie.c Orginal qemu commit hash:14d02cfbe4adaeebe7cb833a8cc71191352cf03b In function pcie_add_capability, an assert contains the "offset < offset + size" expression. Both variable offset and variable size are uint16_t, the comparison is always true due to type promotion. The next expression may be the same. It might be like this: Thread 1 "qemu-system-x86" hit Breakpoint 1, pcie_add_capability ( dev=0x555557ce5f10, cap_id=1, cap_ver=2 '\002', offset=256, size=72) at ../hw/pci/pcie.c:930 930 { (gdb) n 931 assert(offset >= PCI_CONFIG_SPACE_SIZE); (gdb) n 932 assert(offset < offset + size); (gdb) p offset $1 = 256 (gdb) p offset < offset + size $2 = 1 (gdb) set offset=65533 (gdb) p offset < offset + size $3 = 1 (gdb) p offset < (uint16_t)(offset + size) $4 = 0 Signed-off-by: Daniella Lee Message-Id: <20211126061324.47331-1-daniellalee111@gmail.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pcie.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index c5ed266337..d7d73a31e4 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -929,8 +929,8 @@ void pcie_add_capability(PCIDevice *dev, uint16_t offset, uint16_t size) { assert(offset >= PCI_CONFIG_SPACE_SIZE); - assert(offset < offset + size); - assert(offset + size <= PCIE_CONFIG_SPACE_SIZE); + assert(offset < (uint16_t)(offset + size)); + assert((uint16_t)(offset + size) <= PCIE_CONFIG_SPACE_SIZE); assert(size >= 8); assert(pci_is_express(dev));