From 7abba7c638ab809e626f379617cb8590a733eabf Mon Sep 17 00:00:00 2001
From: Cindy Lu <lulu@redhat.com>
Date: Tue, 9 Nov 2021 10:37:44 +0800
Subject: [PATCH 1/7] virtio-mmio : fix the crash in the vm shutdown

The root cause for this crash is the ioeventfd not stopped while the VM stop.
The callback for vmstate_change was not implement in virtio-mmio bus

Reproduce step
load the vm with
 -M microvm \
  -netdev tap,id=net0,vhostforce,script=no,downscript=no  \
  -device virtio-net-device,netdev=net0\

After the VM boot, login the vm and then shutdown the vm

System will crash
[Current thread is 1 (Thread 0x7ffff6edde00 (LWP 374378))]
(gdb) bt
0  0x00005555558f18b4 in qemu_flush_or_purge_queued_packets (purge=false, nc=0x55500252e850) at ../net/net.c:636
1  qemu_flush_queued_packets (nc=0x55500252e850) at ../net/net.c:656
2  0x0000555555b6c363 in virtio_queue_notify_vq (vq=0x7fffe7e2b010) at ../hw/virtio/virtio.c:2339
3  virtio_queue_host_notifier_read (n=0x7fffe7e2b08c) at ../hw/virtio/virtio.c:3583
4  0x0000555555de7b5a in aio_dispatch_handler (ctx=ctx@entry=0x5555567c5780, node=0x555556b83fd0) at ../util/aio-posix.c:329
5  0x0000555555de8454 in aio_dispatch_ready_handlers (ready_list=<optimized out>, ctx=<optimized out>) at ../util/aio-posix.c:359
6  aio_poll (ctx=0x5555567c5780, blocking=blocking@entry=false) at ../util/aio-posix.c:662
7  0x0000555555cce0cc in monitor_cleanup () at ../monitor/monitor.c:645
8  0x0000555555b06bd2 in qemu_cleanup () at ../softmmu/runstate.c:822
9  0x000055555586e693 in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at ../softmmu/main.c:51

Signed-off-by: Cindy Lu <lulu@redhat.com>
Message-Id: <20211109023744.22387-1-lulu@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/virtio-mmio.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)
diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c
index 7b3ebca178..72da12fea5 100644
--- a/hw/virtio/virtio-mmio.c
+++ b/hw/virtio/virtio-mmio.c
@@ -817,6 +817,17 @@ static char *virtio_mmio_bus_get_dev_path(DeviceState *dev)
     return path;
 }
 
+static void virtio_mmio_vmstate_change(DeviceState *d, bool running)
+{
+    VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d);
+
+    if (running) {
+        virtio_mmio_start_ioeventfd(proxy);
+    } else {
+        virtio_mmio_stop_ioeventfd(proxy);
+    }
+}
+
 static void virtio_mmio_bus_class_init(ObjectClass *klass, void *data)
 {
     BusClass *bus_class = BUS_CLASS(klass);
@@ -832,6 +843,7 @@ static void virtio_mmio_bus_class_init(ObjectClass *klass, void *data)
     k->ioeventfd_enabled = virtio_mmio_ioeventfd_enabled;
     k->ioeventfd_assign = virtio_mmio_ioeventfd_assign;
     k->pre_plugged = virtio_mmio_pre_plugged;
+    k->vmstate_change = virtio_mmio_vmstate_change;
     k->has_variable_vring_alignment = true;
     bus_class->max_dev = 1;
     bus_class->get_dev_path = virtio_mmio_bus_get_dev_path;

From 9323f892b39d133eb69b301484bf7b2f3f49737d Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 18 Nov 2021 14:32:23 +0100
Subject: [PATCH 2/7] failover: fix unplug pending detection

Failover needs to detect the end of the PCI unplug to start migration
after the VFIO card has been unplugged.

To do that, a flag is set in pcie_cap_slot_unplug_request_cb() and reset in
pcie_unplug_device().

But since
    17858a169508 ("hw/acpi/ich9: Set ACPI PCI hot-plug as default on Q35")
we have switched to ACPI unplug and these functions are not called anymore
and the flag not set. So failover migration is not able to detect if card
is really unplugged and acts as it's done as soon as it's started. So it
doesn't wait the end of the unplug to start the migration. We don't see any
problem when we test that because ACPI unplug is faster than PCIe native
hotplug and when the migration really starts the unplug operation is
already done.

See c000a9bd06ea ("pci: mark device having guest unplug request pending")
    a99c4da9fc2a ("pci: mark devices partially unplugged")

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Ani Sinha <ani@anisinha.ca>
Message-Id: <20211118133225.324937-4-lvivier@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/acpi/pcihp.c | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c
index f610a25d2e..30405b5113 100644
--- a/hw/acpi/pcihp.c
+++ b/hw/acpi/pcihp.c
@@ -222,9 +222,27 @@ static void acpi_pcihp_eject_slot(AcpiPciHpState *s, unsigned bsel, unsigned slo
         PCIDevice *dev = PCI_DEVICE(qdev);
         if (PCI_SLOT(dev->devfn) == slot) {
             if (!acpi_pcihp_pc_no_hotplug(s, dev)) {
-                hotplug_ctrl = qdev_get_hotplug_handler(qdev);
-                hotplug_handler_unplug(hotplug_ctrl, qdev, &error_abort);
-                object_unparent(OBJECT(qdev));
+                /*
+                 * partially_hotplugged is used by virtio-net failover:
+                 * failover has asked the guest OS to unplug the device
+                 * but we need to keep some references to the device
+                 * to be able to plug it back in case of failure so
+                 * we don't execute hotplug_handler_unplug().
+                 */
+                if (dev->partially_hotplugged) {
+                    /*
+                     * pending_deleted_event is set to true when
+                     * virtio-net failover asks to unplug the device,
+                     * and set to false here when the operation is done
+                     * This is used by the migration loop to detect the
+                     * end of the operation and really start the migration.
+                     */
+                    qdev->pending_deleted_event = false;
+                } else {
+                    hotplug_ctrl = qdev_get_hotplug_handler(qdev);
+                    hotplug_handler_unplug(hotplug_ctrl, qdev, &error_abort);
+                    object_unparent(OBJECT(qdev));
+                }
             }
         }
     }
@@ -396,6 +414,12 @@ void acpi_pcihp_device_unplug_request_cb(HotplugHandler *hotplug_dev,
         return;
     }
 
+    /*
+     * pending_deleted_event is used by virtio-net failover to detect the
+     * end of the unplug operation, the flag is set to false in
+     * acpi_pcihp_eject_slot() when the operation is completed.
+     */
+    pdev->qdev.pending_deleted_event = true;
     s->acpi_pcihp_pci_status[bsel].down |= (1U << slot);
     acpi_send_event(DEVICE(hotplug_dev), ACPI_PCI_HOTPLUG_STATUS);
 }

From 846a1e85da646c6006db429648389fc110f92d75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= <eperezma@redhat.com>
Date: Thu, 25 Nov 2021 11:16:13 +0100
Subject: [PATCH 3/7] vdpa: Add dummy receive callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Qemu falls back on userland handlers even if vhost-user and vhost-vdpa
cases. These assumes a tap device can handle the packets.

If a vdpa device fail to start, it can trigger a sigsegv because of
that. Add dummy receiver that returns no progress so it can keep
running.

Fixes: 1e0a84ea49 ("vhost-vdpa: introduce vhost-vdpa net client")
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Message-Id: <20211125101614.76927-2-eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 net/vhost-vdpa.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 2e3c22a8c7..25dd6dd975 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -170,9 +170,17 @@ static bool vhost_vdpa_check_peer_type(NetClientState *nc, ObjectClass *oc,
     return true;
 }
 
+/** Dummy receive in case qemu falls back to userland tap networking */
+static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
+                                  size_t size)
+{
+    return 0;
+}
+
 static NetClientInfo net_vhost_vdpa_info = {
         .type = NET_CLIENT_DRIVER_VHOST_VDPA,
         .size = sizeof(VhostVDPAState),
+        .receive = vhost_vdpa_receive,
         .cleanup = vhost_vdpa_cleanup,
         .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
         .has_ufo = vhost_vdpa_has_ufo,

From 0fe7245d8b938f371556c100b0b6ec1d2b41e584 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Mon, 29 Nov 2021 11:08:40 +0800
Subject: [PATCH 4/7] virtio-balloon: process all in sgs for free_page_vq

We only process the first in sg which may lead to the bitmap of the
pages belongs to following sgs were not cleared. This may result more
pages to be migrated. Fixing this by process all in sgs for
free_page_vq.

Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20211129030841.3611-1-jasowang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/virtio-balloon.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index c6962fcbfe..17de2558cb 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -510,6 +510,7 @@ static bool get_free_page_hints(VirtIOBalloon *dev)
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtQueue *vq = dev->free_page_vq;
     bool ret = true;
+    int i;
 
     while (dev->block_iothread) {
         qemu_cond_wait(&dev->free_page_cond, &dev->free_page_lock);
@@ -544,8 +545,10 @@ static bool get_free_page_hints(VirtIOBalloon *dev)
     }
 
     if (elem->in_num && dev->free_page_hint_status == FREE_PAGE_HINT_S_START) {
-        qemu_guest_free_page_hint(elem->in_sg[0].iov_base,
-                                  elem->in_sg[0].iov_len);
+        for (i = 0; i < elem->in_num; i++) {
+            qemu_guest_free_page_hint(elem->in_sg[i].iov_base,
+                                      elem->in_sg[i].iov_len);
+        }
     }
 
 out:

From d3f1f940ebe43403feb1d12e4b5b9236aba50cb9 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Mon, 29 Nov 2021 11:08:41 +0800
Subject: [PATCH 5/7] virtio-balloon: correct used length

Spec said:

"and len the total of bytes written into the buffer."

For inflateq, deflateq and statsq, we don't process in_sg so the used
length should be zero. For free_page_vq, tough the pages could be
changed by the device (in the destination), spec said:

"Note: len is particularly useful for drivers using untrusted buffers:
if a driver does not know exactly how much has been written by the
device, the driver would have to zero the buffer in advance to ensure
no data leakage occurs."

So 0 should be used as well here.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20211129030841.3611-2-jasowang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
---
 hw/virtio/virtio-balloon.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 17de2558cb..9a4f491b54 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -231,7 +231,7 @@ static void balloon_stats_poll_cb(void *opaque)
         return;
     }
 
-    virtqueue_push(s->svq, s->stats_vq_elem, s->stats_vq_offset);
+    virtqueue_push(s->svq, s->stats_vq_elem, 0);
     virtio_notify(vdev, s->svq);
     g_free(s->stats_vq_elem);
     s->stats_vq_elem = NULL;
@@ -438,7 +438,7 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
             memory_region_unref(section.mr);
         }
 
-        virtqueue_push(vq, elem, offset);
+        virtqueue_push(vq, elem, 0);
         virtio_notify(vdev, vq);
         g_free(elem);
         virtio_balloon_pbp_free(&pbp);
@@ -552,7 +552,7 @@ static bool get_free_page_hints(VirtIOBalloon *dev)
     }
 
 out:
-    virtqueue_push(vq, elem, 1);
+    virtqueue_push(vq, elem, 0);
     g_free(elem);
     return ret;
 }

From 0192d6677c383d812fb23f572fda4e449e89d3f1 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Mon, 29 Nov 2021 11:36:18 +0800
Subject: [PATCH 6/7] intel-iommu: ignore leaf SNP bit in scalable mode

When booting with scalable mode, I hit this error:

qemu-system-x86_64: vtd_iova_to_slpte: detected splte reserve non-zero iova=0xfffff002, level=0x1slpte=0x102681803)
qemu-system-x86_64: vtd_iommu_translate: detected translation failure (dev=01:00:00, iova=0xfffff002)
qemu-system-x86_64: New fault is not recorded due to compression of faults

This is because the SNP bit is set for second level page table since
Linux kernel commit 6c00612d0cba1 ("iommu/vt-d: Report right snoop
capability when using FL for IOVA") even if SC is not supported by the
hardware.

To unbreak the guest, ignore the leaf SNP bit for scalable mode
first. In the future we may consider to add SC support.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20211129033618.3857-1-jasowang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
---
 hw/i386/intel_iommu.c          | 6 ++++++
 hw/i386/intel_iommu_internal.h | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 294499ee20..f584449d8d 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3629,6 +3629,12 @@ static void vtd_init(IntelIOMMUState *s)
     vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
                                                          x86_iommu->dt_supported);
 
+    if (s->scalable_mode) {
+        vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
+        vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
+        vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
+    }
+
     if (x86_iommu_ir_supported(x86_iommu)) {
         s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
         if (s->intr_eim == ON_OFF_AUTO_ON) {
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 3d5487fe2c..a6c788049b 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -388,6 +388,8 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO 0xffff0000ffe0fff8
 
 /* Rsvd field masks for spte */
+#define VTD_SPTE_SNP 0x800ULL
+
 #define VTD_SPTE_PAGE_L1_RSVD_MASK(aw, dt_supported) \
         dt_supported ? \
         (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \

From bacf58ca18f06f0b464466bf8c19945f19791feb Mon Sep 17 00:00:00 2001
From: Daniella Lee <daniellalee111@gmail.com>
Date: Fri, 26 Nov 2021 14:13:24 +0800
Subject: [PATCH 7/7] Fix bad overflow check in hw/pci/pcie.c

Orginal qemu commit hash:14d02cfbe4adaeebe7cb833a8cc71191352cf03b

In function pcie_add_capability, an assert contains the
"offset < offset + size" expression.
Both variable offset and variable size are uint16_t,
the comparison is always true due to type promotion.
The next expression may be the same.

It might be like this:
Thread 1 "qemu-system-x86" hit Breakpoint 1, pcie_add_capability (
    dev=0x555557ce5f10, cap_id=1, cap_ver=2 '\002', offset=256, size=72)
    at ../hw/pci/pcie.c:930
930	{
(gdb) n
931	    assert(offset >= PCI_CONFIG_SPACE_SIZE);
(gdb) n
932	    assert(offset < offset + size);
(gdb) p offset
$1 = 256
(gdb) p offset < offset + size
$2 = 1
(gdb) set offset=65533
(gdb) p offset < offset + size
$3 = 1
(gdb) p offset < (uint16_t)(offset + size)
$4 = 0

Signed-off-by: Daniella Lee <daniellalee111@gmail.com>
Message-Id: <20211126061324.47331-1-daniellalee111@gmail.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/pci/pcie.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index c5ed266337..d7d73a31e4 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -929,8 +929,8 @@ void pcie_add_capability(PCIDevice *dev,
                          uint16_t offset, uint16_t size)
 {
     assert(offset >= PCI_CONFIG_SPACE_SIZE);
-    assert(offset < offset + size);
-    assert(offset + size <= PCIE_CONFIG_SPACE_SIZE);
+    assert(offset < (uint16_t)(offset + size));
+    assert((uint16_t)(offset + size) <= PCIE_CONFIG_SPACE_SIZE);
     assert(size >= 8);
     assert(pci_is_express(dev));