From ae440bd14c002f3a5528bd38e8a285ea625c04ca Mon Sep 17 00:00:00 2001
From: Wei Wang <wei.w.wang@intel.com>
Date: Tue, 12 Mar 2019 17:34:40 +0800
Subject: [PATCH 01/26] virtio-balloon: fix a use-after-free case

The elem could theorically contain both outbuf and inbufs. We move the
free operation to the end of this function to avoid using elem->in_sg
while elem has been freed.

Fixes: c13c4153f7
("virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT")
Reported-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
CC: Michael S. Tsirkin <mst@redhat.com>
CC: Dr. David Alan Gilbert <dgilbert@redhat.com>
CC: Juan Quintela <quintela@redhat.com>
CC: Peter Xu <peterx@redhat.com>
Message-Id: <1552383280-4122-1-git-send-email-wei.w.wang@intel.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/virtio-balloon.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index e3a65940ef..b614552352 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -391,6 +391,7 @@ static bool get_free_page_hints(VirtIOBalloon *dev)
     VirtQueueElement *elem;
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtQueue *vq = dev->free_page_vq;
+    bool ret = true;
 
     while (dev->block_iothread) {
         qemu_cond_wait(&dev->free_page_cond, &dev->free_page_lock);
@@ -405,13 +406,12 @@ static bool get_free_page_hints(VirtIOBalloon *dev)
         uint32_t id;
         size_t size = iov_to_buf(elem->out_sg, elem->out_num, 0,
                                  &id, sizeof(id));
-        virtqueue_push(vq, elem, size);
-        g_free(elem);
 
         virtio_tswap32s(vdev, &id);
         if (unlikely(size != sizeof(id))) {
             virtio_error(vdev, "received an incorrect cmd id");
-            return false;
+            ret = false;
+            goto out;
         }
         if (id == dev->free_page_report_cmd_id) {
             dev->free_page_report_status = FREE_PAGE_REPORT_S_START;
@@ -431,11 +431,12 @@ static bool get_free_page_hints(VirtIOBalloon *dev)
             qemu_guest_free_page_hint(elem->in_sg[0].iov_base,
                                       elem->in_sg[0].iov_len);
         }
-        virtqueue_push(vq, elem, 1);
-        g_free(elem);
     }
 
-    return true;
+out:
+    virtqueue_push(vq, elem, 1);
+    g_free(elem);
+    return ret;
 }
 
 static void virtio_ballloon_get_free_page_hints(void *opaque)

From 301cf2a8dd5024aa5bbdc6bd3e121174bbfc2957 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Mar 2019 14:05:59 +1100
Subject: [PATCH 02/26] virtio-balloon: Don't mismatch g_malloc()/free (CID
 1399146)

ed48c59875b6 "virtio-balloon: Safely handle BALLOON_PAGE_SIZE < host
page size" introduced a new temporary data structure which tracks 4kiB
chunks which have been inserted into the balloon by the guest but
don't yet form a full host page which we can discard.

Unfortunately, I had a thinko and allocated that structure with
g_malloc0() but freed it with a plain free() rather than g_free().
This corrects the problem.

Fixes: ed48c59875b6
Reported-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Message-Id: <20190306030601.21986-2-david@gibson.dropbear.id.au>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
---
 hw/virtio/virtio-balloon.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index b614552352..1859724a36 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -82,7 +82,7 @@ static void balloon_inflate_page(VirtIOBalloon *balloon,
         /* We've partially ballooned part of a host page, but now
          * we're trying to balloon part of a different one.  Too hard,
          * give up on the old partial page */
-        free(balloon->pbp);
+        g_free(balloon->pbp);
         balloon->pbp = NULL;
     }
 
@@ -107,7 +107,7 @@ static void balloon_inflate_page(VirtIOBalloon *balloon,
          * has already reported them, and failing to discard a balloon
          * page is not fatal */
 
-        free(balloon->pbp);
+        g_free(balloon->pbp);
         balloon->pbp = NULL;
     }
 }

From b27b3239140470b7d593e3b0b09687bcc6fbf274 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Mar 2019 14:06:00 +1100
Subject: [PATCH 03/26] virtio-balloon: Fix possible guest memory corruption
 with inflates & deflates

This fixes a balloon bug with a nasty consequence - potentially
corrupting guest memory - but which is extremely unlikely to be
triggered in practice.

The balloon always works in 4kiB units, but the host could have a
larger page size on certain platforms.  Since ed48c59 "virtio-balloon:
Safely handle BALLOON_PAGE_SIZE < host page size" we've handled this
by accumulating requests to balloon 4kiB subpages until they formed a
full host page.  Since f6deb6d "virtio-balloon: Remove unnecessary
MADV_WILLNEED on deflate" we essentially ignore deflate requests.

Suppose we have a host with 8kiB pages, and one host page has subpages
A & B.  If we get this sequence of events -
	inflate A
	deflate A
	inflate B
- the current logic will discard the whole host page.  That's
incorrect because the guest has deflated subpage A, and could have
written important data to it.

This patch fixes the problem by adjusting our state information about
partially ballooned host pages when deflate requests are received.

Fixes: ed48c59 "virtio-balloon: Safely handle BALLOON_PAGE_SIZE < host page size"

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Message-Id: <20190306030601.21986-3-david@gibson.dropbear.id.au>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
---
 hw/virtio/virtio-balloon.c | 48 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 1859724a36..77532e80d5 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -112,6 +112,43 @@ static void balloon_inflate_page(VirtIOBalloon *balloon,
     }
 }
 
+static void balloon_deflate_page(VirtIOBalloon *balloon,
+                                 MemoryRegion *mr, hwaddr offset)
+{
+    void *addr = memory_region_get_ram_ptr(mr) + offset;
+    RAMBlock *rb;
+    size_t rb_page_size;
+    ram_addr_t ram_offset, host_page_base;
+
+    /* XXX is there a better way to get to the RAMBlock than via a
+     * host address? */
+    rb = qemu_ram_block_from_host(addr, false, &ram_offset);
+    rb_page_size = qemu_ram_pagesize(rb);
+    host_page_base = ram_offset & ~(rb_page_size - 1);
+
+    if (balloon->pbp
+        && rb == balloon->pbp->rb
+        && host_page_base == balloon->pbp->base) {
+        int subpages = rb_page_size / BALLOON_PAGE_SIZE;
+
+        /*
+         * This means the guest has asked to discard some of the 4kiB
+         * subpages of a host page, but then changed its mind and
+         * asked to keep them after all.  It's exceedingly unlikely
+         * for a guest to do this in practice, but handle it anyway,
+         * since getting it wrong could mean discarding memory the
+         * guest is still using. */
+        bitmap_clear(balloon->pbp->bitmap,
+                     (ram_offset - balloon->pbp->base) / BALLOON_PAGE_SIZE,
+                     subpages);
+
+        if (bitmap_empty(balloon->pbp->bitmap, subpages)) {
+            g_free(balloon->pbp);
+            balloon->pbp = NULL;
+        }
+    }
+}
+
 static const char *balloon_stat_names[] = {
    [VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in",
    [VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out",
@@ -315,8 +352,15 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 
             trace_virtio_balloon_handle_output(memory_region_name(section.mr),
                                                pa);
-            if (!qemu_balloon_is_inhibited() && vq != s->dvq) {
-                balloon_inflate_page(s, section.mr, section.offset_within_region);
+            if (!qemu_balloon_is_inhibited()) {
+                if (vq == s->ivq) {
+                    balloon_inflate_page(s, section.mr,
+                                         section.offset_within_region);
+                } else if (vq == s->dvq) {
+                    balloon_deflate_page(s, section.mr, section.offset_within_region);
+                } else {
+                    g_assert_not_reached();
+                }
             }
             memory_region_unref(section.mr);
         }

From 596546fe9e4d1d1fa6423c300e2a73b6f90baeb0 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 6 Mar 2019 14:06:01 +1100
Subject: [PATCH 04/26] virtio-balloon: Restore MADV_WILLNEED hint on balloon
 deflate

Prior to f6deb6d9 "virtio-balloon: Remove unnecessary MADV_WILLNEED on
deflate", the balloon device issued an madvise() MADV_WILLNEED on
pages removed from the balloon.  That would hint to the host kernel
that the pages were likely to be needed by the guest in the near
future.

It's unclear if this is actually valuable or not, and so f6deb6d9
removed this, essentially ignoring balloon deflate requests.  However,
concerns have been raised that this might cause a performance
regression by causing extra latency for the guest in certain
configurations.

So, until we can get actual benchmark data to see if that's the case,
this restores the old behaviour, issuing a MADV_WILLNEED when a page is
removed from the balloon.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Message-Id: <20190306030601.21986-4-david@gibson.dropbear.id.au>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/virtio-balloon.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 77532e80d5..2112874055 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -119,6 +119,8 @@ static void balloon_deflate_page(VirtIOBalloon *balloon,
     RAMBlock *rb;
     size_t rb_page_size;
     ram_addr_t ram_offset, host_page_base;
+    void *host_addr;
+    int ret;
 
     /* XXX is there a better way to get to the RAMBlock than via a
      * host address? */
@@ -147,6 +149,17 @@ static void balloon_deflate_page(VirtIOBalloon *balloon,
             balloon->pbp = NULL;
         }
     }
+
+    host_addr = (void *)((uintptr_t)addr & ~(rb_page_size - 1));
+
+    /* When a page is deflated, we hint the whole host page it lives
+     * on, since we can't do anything smaller */
+    ret = qemu_madvise(host_addr, rb_page_size, QEMU_MADV_WILLNEED);
+    if (ret != 0) {
+        warn_report("Couldn't MADV_WILLNEED on balloon deflate: %s",
+                    strerror(errno));
+        /* Otherwise ignore, failing to page hint shouldn't be fatal */
+    }
 }
 
 static const char *balloon_stat_names[] = {

From ba275e9d28a2f4572325dceb867edebcff471fea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
Date: Fri, 8 Mar 2019 15:04:43 +0100
Subject: [PATCH 05/26] libvhost-user: fix clang enum-conversion warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that the VhostUserMsg.request field is used for both master &
slave requests, since commit d84599f56c820d8c1ac9928a76500dcdfbbf194d:

contrib/libvhost-user/libvhost-user.c:953:20: error: implicit conversion from enumeration type 'enum VhostUserSlaveRequest' to different enumeration type 'VhostUserRequest' (aka 'enum VhostUserRequest') [-Werror,-Wenum-conversion]
        .request = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG,
                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20190308140454.32437-2-marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 contrib/libvhost-user/libvhost-user.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
index 4aa55b4d2d..c0133b7f3f 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -145,7 +145,7 @@ typedef struct VhostUserVringArea {
 #endif
 
 typedef struct VhostUserMsg {
-    VhostUserRequest request;
+    int request;
 
 #define VHOST_USER_VERSION_MASK     (0x3)
 #define VHOST_USER_REPLY_MASK       (0x1 << 2)

From 482580a658df58f5d9f91a87d957660637d59432 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
Date: Fri, 8 Mar 2019 15:04:44 +0100
Subject: [PATCH 06/26] vhost-user: define conventions for vhost-user backends
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As discussed during "[PATCH v4 00/29] vhost-user for input & GPU"
review, let's define a common set of backend conventions to help with
management layer implementation, and interoperability.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
Message-Id: <20190308140454.32437-3-marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 MAINTAINERS                  |   1 +
 docs/interop/vhost-user.json | 232 +++++++++++++++++++++++++++++++++++
 docs/interop/vhost-user.txt  | 101 ++++++++++++++-
 3 files changed, 332 insertions(+), 2 deletions(-)
 create mode 100644 docs/interop/vhost-user.json

diff --git a/MAINTAINERS b/MAINTAINERS
index d326756079..fd0e274eb0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1453,6 +1453,7 @@ vhost
 M: Michael S. Tsirkin <mst@redhat.com>
 S: Supported
 F: hw/*/*vhost*
+F: docs/interop/vhost-user.json
 F: docs/interop/vhost-user.txt
 F: contrib/vhost-user-*/
 
diff --git a/docs/interop/vhost-user.json b/docs/interop/vhost-user.json
new file mode 100644
index 0000000000..ae88c03117
--- /dev/null
+++ b/docs/interop/vhost-user.json
@@ -0,0 +1,232 @@
+# -*- Mode: Python -*-
+#
+# Copyright (C) 2018 Red Hat, Inc.
+#
+# Authors:
+#  Marc-André Lureau <marcandre.lureau@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later. See the COPYING file in the top-level directory.
+
+##
+# = vhost user backend discovery & capabilities
+##
+
+##
+# @VHostUserBackendType:
+#
+# List the various vhost user backend types.
+#
+# @9p: 9p virtio console
+# @balloon: virtio balloon
+# @block: virtio block
+# @caif: virtio caif
+# @console: virtio console
+# @crypto: virtio crypto
+# @gpu: virtio gpu
+# @input: virtio input
+# @net: virtio net
+# @rng: virtio rng
+# @rpmsg: virtio remote processor messaging
+# @rproc-serial: virtio remoteproc serial link
+# @scsi: virtio scsi
+# @vsock: virtio vsock transport
+#
+# Since: 4.0
+##
+{
+  'enum': 'VHostUserBackendType',
+  'data': [
+      '9p',
+      'balloon',
+      'block',
+      'caif',
+      'console',
+      'crypto',
+      'gpu',
+      'input',
+      'net',
+      'rng',
+      'rpmsg',
+      'rproc-serial',
+      'scsi',
+      'vsock'
+  ]
+}
+
+##
+# @VHostUserBackendInputFeature:
+#
+# List of vhost user "input" features.
+#
+# @evdev-path: The --evdev-path command line option is supported.
+# @no-grab: The --no-grab command line option is supported.
+#
+# Since: 4.0
+##
+{
+  'enum': 'VHostUserBackendInputFeature',
+  'data': [ 'evdev-path', 'no-grab' ]
+}
+
+##
+# @VHostUserBackendCapabilitiesInput:
+#
+# Capabilities reported by vhost user "input" backends
+#
+# @features: list of supported features.
+#
+# Since: 4.0
+##
+{
+  'struct': 'VHostUserBackendCapabilitiesInput',
+  'data': {
+    'features': [ 'VHostUserBackendInputFeature' ]
+  }
+}
+
+##
+# @VHostUserBackendGPUFeature:
+#
+# List of vhost user "gpu" features.
+#
+# @render-node: The --render-node command line option is supported.
+# @virgl: The --virgl command line option is supported.
+#
+# Since: 4.0
+##
+{
+  'enum': 'VHostUserBackendGPUFeature',
+  'data': [ 'render-node', 'virgl' ]
+}
+
+##
+# @VHostUserBackendCapabilitiesGPU:
+#
+# Capabilities reported by vhost user "gpu" backends.
+#
+# @features: list of supported features.
+#
+# Since: 4.0
+##
+{
+  'struct': 'VHostUserBackendCapabilitiesGPU',
+  'data': {
+    'features': [ 'VHostUserBackendGPUFeature' ]
+  }
+}
+
+##
+# @VHostUserBackendCapabilities:
+#
+# Capabilities reported by vhost user backends.
+#
+# @type: The vhost user backend type.
+#
+# Since: 4.0
+##
+{
+  'union': 'VHostUserBackendCapabilities',
+  'base': { 'type': 'VHostUserBackendType' },
+  'discriminator': 'type',
+  'data': {
+    'input': 'VHostUserBackendCapabilitiesInput',
+    'gpu': 'VHostUserBackendCapabilitiesGPU'
+  }
+}
+
+##
+# @VhostUserBackend:
+#
+# Describes a vhost user backend to management software.
+#
+# It is possible for multiple @VhostUserBackend elements to match the
+# search criteria of management software. Applications thus need rules
+# to pick one of the many matches, and users need the ability to
+# override distro defaults.
+#
+# It is recommended to create vhost user backend JSON files (each
+# containing a single @VhostUserBackend root element) with a
+# double-digit prefix, for example "50-qemu-gpu.json",
+# "50-crosvm-gpu.json", etc, so they can be sorted in predictable
+# order. The backend JSON files should be searched for in three
+# directories:
+#
+#   - /usr/share/qemu/vhost-user -- populated by distro-provided
+#                                   packages (XDG_DATA_DIRS covers
+#                                   /usr/share by default),
+#
+#   - /etc/qemu/vhost-user -- exclusively for sysadmins' local additions,
+#
+#   - $XDG_CONFIG_HOME/qemu/vhost-user -- exclusively for per-user local
+#                                         additions (XDG_CONFIG_HOME
+#                                         defaults to $HOME/.config).
+#
+# Top-down, the list of directories goes from general to specific.
+#
+# Management software should build a list of files from all three
+# locations, then sort the list by filename (i.e., basename
+# component). Management software should choose the first JSON file on
+# the sorted list that matches the search criteria. If a more specific
+# directory has a file with same name as a less specific directory,
+# then the file in the more specific directory takes effect. If the
+# more specific file is zero length, it hides the less specific one.
+#
+# For example, if a distro ships
+#
+#   - /usr/share/qemu/vhost-user/50-qemu-gpu.json
+#
+#   - /usr/share/qemu/vhost-user/50-crosvm-gpu.json
+#
+# then the sysadmin can prevent the default QEMU being used at all with
+#
+#   $ touch /etc/qemu/vhost-user/50-qemu-gpu.json
+#
+# The sysadmin can replace/alter the distro default OVMF with
+#
+#   $ vim /etc/qemu/vhost-user/50-qemu-gpu.json
+#
+# or they can provide a parallel QEMU GPU with higher priority
+#
+#   $ vim /etc/qemu/vhost-user/10-qemu-gpu.json
+#
+# or they can provide a parallel OVMF with lower priority
+#
+#   $ vim /etc/qemu/vhost-user/99-qemu-gpu.json
+#
+# @type: The vhost user backend type.
+#
+# @description: Provides a human-readable description of the backend.
+#               Management software may or may not display @description.
+#
+# @binary: Absolute path to the backend binary.
+#
+# @tags: An optional list of auxiliary strings associated with the
+#        backend for which @description is not appropriate, due to the
+#        latter's possible exposure to the end-user. @tags serves
+#        development and debugging purposes only, and management
+#        software shall explicitly ignore it.
+#
+# Since: 4.0
+#
+# Example:
+#
+# {
+#   "description": "QEMU vhost-user-gpu",
+#   "type": "gpu",
+#   "binary": "/usr/libexec/qemu/vhost-user-gpu",
+#   "tags": [
+#     "CONFIG_OPENGL_DMABUF=y"
+#   ]
+# }
+#
+##
+{
+  'struct' : 'VhostUserBackend',
+  'data'   : {
+    'description': 'str',
+    'type': 'VHostUserBackendType',
+    'binary': 'str',
+    '*tags': [ 'str' ]
+  }
+}
diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
index c2194711d9..9ee2a60cfb 100644
--- a/docs/interop/vhost-user.txt
+++ b/docs/interop/vhost-user.txt
@@ -17,8 +17,13 @@ The protocol defines 2 sides of the communication, master and slave. Master is
 the application that shares its virtqueues, in our case QEMU. Slave is the
 consumer of the virtqueues.
 
-In the current implementation QEMU is the Master, and the Slave is intended to
-be a software Ethernet switch running in user space, such as Snabbswitch.
+In the current implementation QEMU is the Master, and the Slave is the
+external process consuming the virtio queues, for example a software
+Ethernet switch running in user space, such as Snabbswitch, or a block
+device backend processing read & write to a virtual disk. In order to
+facilitate interoperability between various backend implementations,
+it is recommended to follow the "Backend program conventions"
+described in this document.
 
 Master and slave can be either a client (i.e. connecting) or server (listening)
 in the socket communication.
@@ -835,3 +840,95 @@ resilient for selective requests.
 For the message types that already solicit a reply from the client, the
 presence of VHOST_USER_PROTOCOL_F_REPLY_ACK or need_reply bit being set brings
 no behavioural change. (See the 'Communication' section for details.)
+
+Backend program conventions
+---------------------------
+
+vhost-user backends can provide various devices & services and may
+need to be configured manually depending on the use case. However, it
+is a good idea to follow the conventions listed here when
+possible. Users, QEMU or libvirt, can then rely on some common
+behaviour to avoid heterogenous configuration and management of the
+backend programs and facilitate interoperability.
+
+Each backend installed on a host system should come with at least one
+JSON file that conforms to the vhost-user.json schema. Each file
+informs the management applications about the backend type, and binary
+location. In addition, it defines rules for management apps for
+picking the highest priority backend when multiple match the search
+criteria (see @VhostUserBackend documentation in the schema file).
+
+If the backend is not capable of enabling a requested feature on the
+host (such as 3D acceleration with virgl), or the initialization
+failed, the backend should fail to start early and exit with a status
+!= 0. It may also print a message to stderr for further details.
+
+The backend program must not daemonize itself, but it may be
+daemonized by the management layer. It may also have a restricted
+access to the system.
+
+File descriptors 0, 1 and 2 will exist, and have regular
+stdin/stdout/stderr usage (they may have been redirected to /dev/null
+by the management layer, or to a log handler).
+
+The backend program must end (as quickly and cleanly as possible) when
+the SIGTERM signal is received. Eventually, it may receive SIGKILL by
+the management layer after a few seconds.
+
+The following command line options have an expected behaviour. They
+are mandatory, unless explicitly said differently:
+
+* --socket-path=PATH
+
+This option specify the location of the vhost-user Unix domain socket.
+It is incompatible with --fd.
+
+* --fd=FDNUM
+
+When this argument is given, the backend program is started with the
+vhost-user socket as file descriptor FDNUM. It is incompatible with
+--socket-path.
+
+* --print-capabilities
+
+Output to stdout the backend capabilities in JSON format, and then
+exit successfully. Other options and arguments should be ignored, and
+the backend program should not perform its normal function.  The
+capabilities can be reported dynamically depending on the host
+capabilities.
+
+The JSON output is described in the vhost-user.json schema, by
+@VHostUserBackendCapabilities.  Example:
+{
+  "type": "foo",
+  "features": [
+    "feature-a",
+    "feature-b"
+  ]
+}
+
+vhost-user-input
+----------------
+
+Command line options:
+
+* --evdev-path=PATH (optional)
+
+Specify the linux input device.
+
+* --no-grab (optional)
+
+Do no request exclusive access to the input device.
+
+vhost-user-gpu
+--------------
+
+Command line options:
+
+* --render-node=PATH (optional)
+
+Specify the GPU DRM render node.
+
+* --virgl (optional)
+
+Enable virgl rendering support.

From 0b99f22461e59ec7a31c75ebc4c057a45dd9e9a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
Date: Fri, 8 Mar 2019 15:04:45 +0100
Subject: [PATCH 07/26] vhost-user: simplify vhost_user_init/vhost_user_cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Take a VhostUserState* that can be pre-allocated, and initialize it
with the associated chardev.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Tiwei Bie <tiwei.bie@intel.com>
Message-Id: <20190308140454.32437-4-marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 backends/cryptodev-vhost-user.c     | 18 ++++--------------
 hw/block/vhost-user-blk.c           | 22 ++++------------------
 hw/scsi/vhost-user-scsi.c           | 20 ++++----------------
 hw/virtio/vhost-stub.c              |  4 ++--
 hw/virtio/vhost-user.c              | 16 ++++++++++++----
 include/hw/virtio/vhost-user-blk.h  |  2 +-
 include/hw/virtio/vhost-user-scsi.h |  2 +-
 include/hw/virtio/vhost-user.h      |  2 +-
 net/vhost-user.c                    | 13 ++++---------
 9 files changed, 33 insertions(+), 66 deletions(-)

diff --git a/backends/cryptodev-vhost-user.c b/backends/cryptodev-vhost-user.c
index d539f14d59..1052a5d0e9 100644
--- a/backends/cryptodev-vhost-user.c
+++ b/backends/cryptodev-vhost-user.c
@@ -47,7 +47,7 @@
 typedef struct CryptoDevBackendVhostUser {
     CryptoDevBackend parent_obj;
 
-    VhostUserState *vhost_user;
+    VhostUserState vhost_user;
     CharBackend chr;
     char *chr_name;
     bool opened;
@@ -104,7 +104,7 @@ cryptodev_vhost_user_start(int queues,
             continue;
         }
 
-        options.opaque = s->vhost_user;
+        options.opaque = &s->vhost_user;
         options.backend_type = VHOST_BACKEND_TYPE_USER;
         options.cc = b->conf.peers.ccs[i];
         s->vhost_crypto[i] = cryptodev_vhost_init(&options);
@@ -182,7 +182,6 @@ static void cryptodev_vhost_user_init(
     size_t i;
     Error *local_err = NULL;
     Chardev *chr;
-    VhostUserState *user;
     CryptoDevBackendClient *cc;
     CryptoDevBackendVhostUser *s =
                       CRYPTODEV_BACKEND_VHOST_USER(backend);
@@ -213,15 +212,10 @@ static void cryptodev_vhost_user_init(
         }
     }
 
-    user = vhost_user_init();
-    if (!user) {
-        error_setg(errp, "Failed to init vhost_user");
+    if (!vhost_user_init(&s->vhost_user, &s->chr, errp)) {
         return;
     }
 
-    user->chr = &s->chr;
-    s->vhost_user = user;
-
     qemu_chr_fe_set_handlers(&s->chr, NULL, NULL,
                      cryptodev_vhost_user_event, NULL, s, NULL, true);
 
@@ -307,11 +301,7 @@ static void cryptodev_vhost_user_cleanup(
         }
     }
 
-    if (s->vhost_user) {
-        vhost_user_cleanup(s->vhost_user);
-        g_free(s->vhost_user);
-        s->vhost_user = NULL;
-    }
+    vhost_user_cleanup(&s->vhost_user);
 }
 
 static void cryptodev_vhost_user_set_chardev(Object *obj,
diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 44ac814016..767c734a81 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -253,7 +253,6 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VHostUserBlk *s = VHOST_USER_BLK(vdev);
-    VhostUserState *user;
     struct vhost_virtqueue *vqs = NULL;
     int i, ret;
 
@@ -272,15 +271,10 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
         return;
     }
 
-    user = vhost_user_init();
-    if (!user) {
-        error_setg(errp, "vhost-user-blk: failed to init vhost_user");
+    if (!vhost_user_init(&s->vhost_user, &s->chardev, errp)) {
         return;
     }
 
-    user->chr = &s->chardev;
-    s->vhost_user = user;
-
     virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
                 sizeof(struct virtio_blk_config));
 
@@ -297,7 +291,7 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
 
     vhost_dev_set_config_notifier(&s->dev, &blk_ops);
 
-    ret = vhost_dev_init(&s->dev, s->vhost_user, VHOST_BACKEND_TYPE_USER, 0);
+    ret = vhost_dev_init(&s->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0);
     if (ret < 0) {
         error_setg(errp, "vhost-user-blk: vhost initialization failed: %s",
                    strerror(-ret));
@@ -322,10 +316,7 @@ vhost_err:
 virtio_err:
     g_free(vqs);
     virtio_cleanup(vdev);
-
-    vhost_user_cleanup(user);
-    g_free(user);
-    s->vhost_user = NULL;
+    vhost_user_cleanup(&s->vhost_user);
 }
 
 static void vhost_user_blk_device_unrealize(DeviceState *dev, Error **errp)
@@ -338,12 +329,7 @@ static void vhost_user_blk_device_unrealize(DeviceState *dev, Error **errp)
     vhost_dev_cleanup(&s->dev);
     g_free(vqs);
     virtio_cleanup(vdev);
-
-    if (s->vhost_user) {
-        vhost_user_cleanup(s->vhost_user);
-        g_free(s->vhost_user);
-        s->vhost_user = NULL;
-    }
+    vhost_user_cleanup(&s->vhost_user);
 }
 
 static void vhost_user_blk_instance_init(Object *obj)
diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
index 6728878a52..8b1e6876db 100644
--- a/hw/scsi/vhost-user-scsi.c
+++ b/hw/scsi/vhost-user-scsi.c
@@ -69,7 +69,6 @@ static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
     VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
     VHostUserSCSI *s = VHOST_USER_SCSI(dev);
     VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
-    VhostUserState *user;
     Error *err = NULL;
     int ret;
 
@@ -86,30 +85,24 @@ static void vhost_user_scsi_realize(DeviceState *dev, Error **errp)
         return;
     }
 
-    user = vhost_user_init();
-    if (!user) {
-        error_setg(errp, "vhost-user-scsi: failed to init vhost_user");
+    if (!vhost_user_init(&s->vhost_user, &vs->conf.chardev, errp)) {
         return;
     }
-    user->chr = &vs->conf.chardev;
 
     vsc->dev.nvqs = 2 + vs->conf.num_queues;
     vsc->dev.vqs = g_new(struct vhost_virtqueue, vsc->dev.nvqs);
     vsc->dev.vq_index = 0;
     vsc->dev.backend_features = 0;
 
-    ret = vhost_dev_init(&vsc->dev, user,
+    ret = vhost_dev_init(&vsc->dev, &s->vhost_user,
                          VHOST_BACKEND_TYPE_USER, 0);
     if (ret < 0) {
         error_setg(errp, "vhost-user-scsi: vhost initialization failed: %s",
                    strerror(-ret));
-        vhost_user_cleanup(user);
-        g_free(user);
+        vhost_user_cleanup(&s->vhost_user);
         return;
     }
 
-    s->vhost_user = user;
-
     /* Channel and lun both are 0 for bootable vhost-user-scsi disk */
     vsc->channel = 0;
     vsc->lun = 0;
@@ -130,12 +123,7 @@ static void vhost_user_scsi_unrealize(DeviceState *dev, Error **errp)
     g_free(vqs);
 
     virtio_scsi_common_unrealize(dev, errp);
-
-    if (s->vhost_user) {
-        vhost_user_cleanup(s->vhost_user);
-        g_free(s->vhost_user);
-        s->vhost_user = NULL;
-    }
+    vhost_user_cleanup(&s->vhost_user);
 }
 
 static Property vhost_user_scsi_properties[] = {
diff --git a/hw/virtio/vhost-stub.c b/hw/virtio/vhost-stub.c
index 049089b5e2..c175148fce 100644
--- a/hw/virtio/vhost-stub.c
+++ b/hw/virtio/vhost-stub.c
@@ -7,9 +7,9 @@ bool vhost_has_free_slot(void)
     return true;
 }
 
-VhostUserState *vhost_user_init(void)
+bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp)
 {
-    return NULL;
+    return false;
 }
 
 void vhost_user_cleanup(VhostUserState *user)
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 0d6c64e5ca..0164060b2b 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -1750,17 +1750,24 @@ static bool vhost_user_mem_section_filter(struct vhost_dev *dev,
     return result;
 }
 
-VhostUserState *vhost_user_init(void)
+bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp)
 {
-    VhostUserState *user = g_new0(struct VhostUserState, 1);
-
-    return user;
+    if (user->chr) {
+        error_setg(errp, "Cannot initialize vhost-user state");
+        return false;
+    }
+    user->chr = chr;
+    return true;
 }
 
 void vhost_user_cleanup(VhostUserState *user)
 {
     int i;
 
+    if (!user->chr) {
+        return;
+    }
+
     for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
         if (user->notifier[i].addr) {
             object_unparent(OBJECT(&user->notifier[i].mr));
@@ -1768,6 +1775,7 @@ void vhost_user_cleanup(VhostUserState *user)
             user->notifier[i].addr = NULL;
         }
     }
+    user->chr = NULL;
 }
 
 const VhostOps user_ops = {
diff --git a/include/hw/virtio/vhost-user-blk.h b/include/hw/virtio/vhost-user-blk.h
index d52944aeeb..a8a106eecb 100644
--- a/include/hw/virtio/vhost-user-blk.h
+++ b/include/hw/virtio/vhost-user-blk.h
@@ -36,7 +36,7 @@ typedef struct VHostUserBlk {
     uint32_t queue_size;
     uint32_t config_wce;
     struct vhost_dev dev;
-    VhostUserState *vhost_user;
+    VhostUserState vhost_user;
 } VHostUserBlk;
 
 #endif
diff --git a/include/hw/virtio/vhost-user-scsi.h b/include/hw/virtio/vhost-user-scsi.h
index e429cacd8e..738f9288bd 100644
--- a/include/hw/virtio/vhost-user-scsi.h
+++ b/include/hw/virtio/vhost-user-scsi.h
@@ -30,7 +30,7 @@
 
 typedef struct VHostUserSCSI {
     VHostSCSICommon parent_obj;
-    VhostUserState *vhost_user;
+    VhostUserState vhost_user;
 } VHostUserSCSI;
 
 #endif /* VHOST_USER_SCSI_H */
diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
index fd660393a0..811e325f42 100644
--- a/include/hw/virtio/vhost-user.h
+++ b/include/hw/virtio/vhost-user.h
@@ -22,7 +22,7 @@ typedef struct VhostUserState {
     VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
 } VhostUserState;
 
-VhostUserState *vhost_user_init(void);
+bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp);
 void vhost_user_cleanup(VhostUserState *user);
 
 #endif
diff --git a/net/vhost-user.c b/net/vhost-user.c
index cd9659df87..5a26a24708 100644
--- a/net/vhost-user.c
+++ b/net/vhost-user.c
@@ -304,19 +304,14 @@ static int net_vhost_user_init(NetClientState *peer, const char *device,
 {
     Error *err = NULL;
     NetClientState *nc, *nc0 = NULL;
-    VhostUserState *user = NULL;
     NetVhostUserState *s = NULL;
+    VhostUserState *user;
     int i;
 
     assert(name);
     assert(queues > 0);
 
-    user = vhost_user_init();
-    if (!user) {
-        error_report("failed to init vhost_user");
-        goto err;
-    }
-
+    user = g_new0(struct VhostUserState, 1);
     for (i = 0; i < queues; i++) {
         nc = qemu_new_net_client(&net_vhost_user_info, peer, device, name);
         snprintf(nc->info_str, sizeof(nc->info_str), "vhost-user%d to %s",
@@ -325,11 +320,11 @@ static int net_vhost_user_init(NetClientState *peer, const char *device,
         if (!nc0) {
             nc0 = nc;
             s = DO_UPCAST(NetVhostUserState, nc, nc);
-            if (!qemu_chr_fe_init(&s->chr, chr, &err)) {
+            if (!qemu_chr_fe_init(&s->chr, chr, &err) ||
+                !vhost_user_init(user, &s->chr, &err)) {
                 error_report_err(err);
                 goto err;
             }
-            user->chr = &s->chr;
         }
         s = DO_UPCAST(NetVhostUserState, nc, nc);
         s->vhost_user = user;

From 6079865526474dea7d8680ed2cf0b6d15ac6e99f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
Date: Fri, 8 Mar 2019 15:04:46 +0100
Subject: [PATCH 08/26] libvhost-user: exit by default on VHOST_USER_NONE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit 2566378d6d13bf4d28c7770bdbda5f7682594bbe, libvhost-user
no longer panics on disconnect (rc == 0), and instead silently ignores
an invalid VHOST_USER_NONE message.

Without extra work from the API user, this will simply busy-loop on
HUP events. The obvious thing to do is to exit(0) instead, while
additional or different work can be done by overriding
iface->process_msg().

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Jens Freimann <jfreimann@redhat.com>
Message-Id: <20190308140454.32437-5-marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 contrib/libvhost-user/libvhost-user.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index 3f14b4138b..fcf5014240 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -1285,7 +1285,8 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
     case VHOST_USER_SET_CONFIG:
         return vu_set_config(dev, vmsg);
     case VHOST_USER_NONE:
-        break;
+        /* if you need processing before exit, override iface->process_msg */
+        exit(0);
     case VHOST_USER_POSTCOPY_ADVISE:
         return vu_set_postcopy_advise(dev, vmsg);
     case VHOST_USER_POSTCOPY_LISTEN:

From 917d7dd72a774d1f82700a284adb4ea632692638 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
Date: Fri, 8 Mar 2019 15:04:47 +0100
Subject: [PATCH 09/26] vhost-user: wrap some read/write with retry handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20190308140454.32437-6-marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/vhost-user.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 0164060b2b..3b09c40578 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -968,7 +968,10 @@ static void slave_read(void *opaque)
     iov.iov_base = &hdr;
     iov.iov_len = VHOST_USER_HDR_SIZE;
 
-    size = recvmsg(u->slave_fd, &msgh, 0);
+    do {
+        size = recvmsg(u->slave_fd, &msgh, 0);
+    } while (size < 0 && (errno == EINTR || errno == EAGAIN));
+
     if (size != VHOST_USER_HDR_SIZE) {
         error_report("Failed to read from slave.");
         goto err;
@@ -997,7 +1000,10 @@ static void slave_read(void *opaque)
     }
 
     /* Read payload */
-    size = read(u->slave_fd, &payload, hdr.size);
+    do {
+        size = read(u->slave_fd, &payload, hdr.size);
+    } while (size < 0 && (errno == EINTR || errno == EAGAIN));
+
     if (size != hdr.size) {
         error_report("Failed to read payload from slave.");
         goto err;
@@ -1045,7 +1051,10 @@ static void slave_read(void *opaque)
         iovec[1].iov_base = &payload;
         iovec[1].iov_len = hdr.size;
 
-        size = writev(u->slave_fd, iovec, ARRAY_SIZE(iovec));
+        do {
+            size = writev(u->slave_fd, iovec, ARRAY_SIZE(iovec));
+        } while (size < 0 && (errno == EINTR || errno == EAGAIN));
+
         if (size != VHOST_USER_HDR_SIZE + hdr.size) {
             error_report("Failed to send msg reply to slave.");
             goto err;

From 9af84c02e2321e7e73234940cba9ce521dfc9423 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
Date: Fri, 8 Mar 2019 15:04:49 +0100
Subject: [PATCH 10/26] vhost-user: split vhost_user_read()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Split vhost_user_read(), so only header can be read with
vhost_user_read_header().

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
Message-Id: <20190308140454.32437-8-marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/vhost-user.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 3b09c40578..5df73405bc 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -214,7 +214,7 @@ static bool ioeventfd_enabled(void)
     return !kvm_enabled() || kvm_eventfds_enabled();
 }
 
-static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
+static int vhost_user_read_header(struct vhost_dev *dev, VhostUserMsg *msg)
 {
     struct vhost_user *u = dev->opaque;
     CharBackend *chr = u->user->chr;
@@ -225,7 +225,7 @@ static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
     if (r != size) {
         error_report("Failed to read msg header. Read %d instead of %d."
                      " Original request %d.", r, size, msg->hdr.request);
-        goto fail;
+        return -1;
     }
 
     /* validate received flags */
@@ -233,7 +233,21 @@ static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
         error_report("Failed to read msg header."
                 " Flags 0x%x instead of 0x%x.", msg->hdr.flags,
                 VHOST_USER_REPLY_MASK | VHOST_USER_VERSION);
-        goto fail;
+        return -1;
+    }
+
+    return 0;
+}
+
+static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
+{
+    struct vhost_user *u = dev->opaque;
+    CharBackend *chr = u->user->chr;
+    uint8_t *p = (uint8_t *) msg;
+    int r, size;
+
+    if (vhost_user_read_header(dev, msg) < 0) {
+        return -1;
     }
 
     /* validate message size is sane */
@@ -241,7 +255,7 @@ static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
         error_report("Failed to read msg header."
                 " Size %d exceeds the maximum %zu.", msg->hdr.size,
                 VHOST_USER_PAYLOAD_SIZE);
-        goto fail;
+        return -1;
     }
 
     if (msg->hdr.size) {
@@ -251,14 +265,11 @@ static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
         if (r != size) {
             error_report("Failed to read msg payload."
                          " Read %d instead of %d.", r, msg->hdr.size);
-            goto fail;
+            return -1;
         }
     }
 
     return 0;
-
-fail:
-    return -1;
 }
 
 static int process_message_reply(struct vhost_dev *dev,

From 922ef483ec812730f87102b372691c510313fcea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
Date: Fri, 8 Mar 2019 15:04:51 +0100
Subject: [PATCH 11/26] libvhost-user-glib: export vug_source_new()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Simplify the creation of FD sources for other users. This is just
convenience to avoid duplicating similar code elsewhere.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20190308140454.32437-10-marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 contrib/libvhost-user/libvhost-user-glib.c | 11 ++++++-----
 contrib/libvhost-user/libvhost-user-glib.h |  3 +++
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/contrib/libvhost-user/libvhost-user-glib.c b/contrib/libvhost-user/libvhost-user-glib.c
index 545f089587..42660a1b36 100644
--- a/contrib/libvhost-user/libvhost-user-glib.c
+++ b/contrib/libvhost-user/libvhost-user-glib.c
@@ -68,15 +68,16 @@ static GSourceFuncs vug_src_funcs = {
     NULL
 };
 
-static GSource *
-vug_source_new(VuDev *dev, int fd, GIOCondition cond,
+GSource *
+vug_source_new(VugDev *gdev, int fd, GIOCondition cond,
                vu_watch_cb vu_cb, gpointer data)
 {
+    VuDev *dev = &gdev->parent;
     GSource *gsrc;
     VugSrc *src;
     guint id;
 
-    g_assert(dev);
+    g_assert(gdev);
     g_assert(fd >= 0);
     g_assert(vu_cb);
 
@@ -106,7 +107,7 @@ set_watch(VuDev *vu_dev, int fd, int vu_evt, vu_watch_cb cb, void *pvt)
     g_assert(cb);
 
     dev = container_of(vu_dev, VugDev, parent);
-    src = vug_source_new(vu_dev, fd, vu_evt, cb, pvt);
+    src = vug_source_new(dev, fd, vu_evt, cb, pvt);
     g_hash_table_replace(dev->fdmap, GINT_TO_POINTER(fd), src);
 }
 
@@ -141,7 +142,7 @@ vug_init(VugDev *dev, int socket,
     dev->fdmap = g_hash_table_new_full(NULL, NULL, NULL,
                                        (GDestroyNotify) g_source_destroy);
 
-    dev->src = vug_source_new(&dev->parent, socket, G_IO_IN, vug_watch, NULL);
+    dev->src = vug_source_new(dev, socket, G_IO_IN, vug_watch, NULL);
 }
 
 void
diff --git a/contrib/libvhost-user/libvhost-user-glib.h b/contrib/libvhost-user/libvhost-user-glib.h
index 6b2110b94c..d3200f3afc 100644
--- a/contrib/libvhost-user/libvhost-user-glib.h
+++ b/contrib/libvhost-user/libvhost-user-glib.h
@@ -29,4 +29,7 @@ void vug_init(VugDev *dev, int socket,
               vu_panic_cb panic, const VuDevIface *iface);
 void vug_deinit(VugDev *dev);
 
+GSource *vug_source_new(VugDev *dev, int fd, GIOCondition cond,
+                        vu_watch_cb vu_cb, gpointer data);
+
 #endif /* LIBVHOST_USER_GLIB_H */

From b13919ab64e712158a031bf4ea48e92b20ed8ef5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
Date: Fri, 8 Mar 2019 15:04:52 +0100
Subject: [PATCH 12/26] libvhost-user: add vu_queue_unpop()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

vhost-user-input will make use of this function to undo some queue pop
in case the virtio queue does not have enough room.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20190308140454.32437-11-marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 contrib/libvhost-user/libvhost-user.c | 16 ++++++++++++++++
 contrib/libvhost-user/libvhost-user.h | 14 ++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index fcf5014240..df72d3e440 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -1966,6 +1966,22 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
     return elem;
 }
 
+static void
+vu_queue_detach_element(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
+                        size_t len)
+{
+    vq->inuse--;
+    /* unmap, when DMA support is added */
+}
+
+void
+vu_queue_unpop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
+               size_t len)
+{
+    vq->last_avail_idx--;
+    vu_queue_detach_element(dev, vq, elem, len);
+}
+
 bool
 vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
 {
diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
index c0133b7f3f..3de8414898 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -458,6 +458,20 @@ void vu_queue_notify(VuDev *dev, VuVirtq *vq);
  */
 void *vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz);
 
+
+/**
+ * vu_queue_unpop:
+ * @dev: a VuDev context
+ * @vq: a VuVirtq queue
+ * @elem: The #VuVirtqElement
+ * @len: number of bytes written
+ *
+ * Pretend the most recent element wasn't popped from the virtqueue.  The next
+ * call to vu_queue_pop() will refetch the element.
+ */
+void vu_queue_unpop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
+                    size_t len);
+
 /**
  * vu_queue_rewind:
  * @dev: a VuDev context

From fb43cf739e1a74178212bfcd4313b598109d2310 Mon Sep 17 00:00:00 2001
From: "Liu, Yi L" <yi.l.liu@intel.com>
Date: Tue, 5 Mar 2019 10:34:53 +0800
Subject: [PATCH 13/26] intel_iommu: scalable mode emulation

Intel(R) VT-d 3.0 spec introduces scalable mode address translation to
replace extended context mode. This patch extends current emulator to
support Scalable Mode which includes root table, context table and new
pasid table format change. Now intel_iommu emulates both legacy mode
and scalable mode (with legacy-equivalent capability set).

The key points are below:
1. Extend root table operations to support both legacy mode and scalable
   mode.
2. Extend context table operations to support both legacy mode and
   scalable mode.
3. Add pasid tabled operations to support scalable mode.

Signed-off-by: Liu, Yi L <yi.l.liu@intel.com>
[Yi Sun is co-developer to contribute much to refine the whole commit.]
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Message-Id: <1551753295-30167-2-git-send-email-yi.y.sun@linux.intel.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
---
 hw/i386/intel_iommu.c          | 498 ++++++++++++++++++++++++++-------
 hw/i386/intel_iommu_internal.h |  41 ++-
 hw/i386/trace-events           |   2 +-
 include/hw/i386/intel_iommu.h  |  24 +-
 4 files changed, 466 insertions(+), 99 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index ee22e754f0..ea9ea93e42 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -37,6 +37,27 @@
 #include "kvm_i386.h"
 #include "trace.h"
 
+/* context entry operations */
+#define VTD_CE_GET_RID2PASID(ce) \
+    ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK)
+#define VTD_CE_GET_PASID_DIR_TABLE(ce) \
+    ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK)
+
+/* pe operations */
+#define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
+#define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
+#define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write) {\
+    if (ret_fr) {                                                             \
+        ret_fr = -ret_fr;                                                     \
+        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {                   \
+            trace_vtd_fault_disabled();                                       \
+        } else {                                                              \
+            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);      \
+        }                                                                     \
+        goto error;                                                           \
+    }                                                                         \
+}
+
 static void vtd_address_space_refresh_all(IntelIOMMUState *s);
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
 
@@ -512,9 +533,15 @@ static void vtd_generate_completion_event(IntelIOMMUState *s)
     }
 }
 
-static inline bool vtd_root_entry_present(VTDRootEntry *root)
+static inline bool vtd_root_entry_present(IntelIOMMUState *s,
+                                          VTDRootEntry *re,
+                                          uint8_t devfn)
 {
-    return root->val & VTD_ROOT_ENTRY_P;
+    if (s->root_scalable && devfn > UINT8_MAX / 2) {
+        return re->hi & VTD_ROOT_ENTRY_P;
+    }
+
+    return re->lo & VTD_ROOT_ENTRY_P;
 }
 
 static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index,
@@ -524,10 +551,11 @@ static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index,
 
     addr = s->root + index * sizeof(*re);
     if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) {
-        re->val = 0;
+        re->lo = 0;
         return -VTD_FR_ROOT_TABLE_INV;
     }
-    re->val = le64_to_cpu(re->val);
+    re->lo = le64_to_cpu(re->lo);
+    re->hi = le64_to_cpu(re->hi);
     return 0;
 }
 
@@ -536,18 +564,35 @@ static inline bool vtd_ce_present(VTDContextEntry *context)
     return context->lo & VTD_CONTEXT_ENTRY_P;
 }
 
-static int vtd_get_context_entry_from_root(VTDRootEntry *root, uint8_t index,
+static int vtd_get_context_entry_from_root(IntelIOMMUState *s,
+                                           VTDRootEntry *re,
+                                           uint8_t index,
                                            VTDContextEntry *ce)
 {
-    dma_addr_t addr;
+    dma_addr_t addr, ce_size;
 
     /* we have checked that root entry is present */
-    addr = (root->val & VTD_ROOT_ENTRY_CTP) + index * sizeof(*ce);
-    if (dma_memory_read(&address_space_memory, addr, ce, sizeof(*ce))) {
+    ce_size = s->root_scalable ? VTD_CTX_ENTRY_SCALABLE_SIZE :
+              VTD_CTX_ENTRY_LEGACY_SIZE;
+
+    if (s->root_scalable && index > UINT8_MAX / 2) {
+        index = index & (~VTD_DEVFN_CHECK_MASK);
+        addr = re->hi & VTD_ROOT_ENTRY_CTP;
+    } else {
+        addr = re->lo & VTD_ROOT_ENTRY_CTP;
+    }
+
+    addr = addr + index * ce_size;
+    if (dma_memory_read(&address_space_memory, addr, ce, ce_size)) {
         return -VTD_FR_CONTEXT_TABLE_INV;
     }
+
     ce->lo = le64_to_cpu(ce->lo);
     ce->hi = le64_to_cpu(ce->hi);
+    if (ce_size == VTD_CTX_ENTRY_SCALABLE_SIZE) {
+        ce->val[2] = le64_to_cpu(ce->val[2]);
+        ce->val[3] = le64_to_cpu(ce->val[3]);
+    }
     return 0;
 }
 
@@ -600,6 +645,144 @@ static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level)
            (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
 }
 
+/* Return true if check passed, otherwise false */
+static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu,
+                                     VTDPASIDEntry *pe)
+{
+    switch (VTD_PE_GET_TYPE(pe)) {
+    case VTD_SM_PASID_ENTRY_FLT:
+    case VTD_SM_PASID_ENTRY_SLT:
+    case VTD_SM_PASID_ENTRY_NESTED:
+        break;
+    case VTD_SM_PASID_ENTRY_PT:
+        if (!x86_iommu->pt_supported) {
+            return false;
+        }
+        break;
+    default:
+        /* Unknwon type */
+        return false;
+    }
+    return true;
+}
+
+static int vtd_get_pasid_dire(dma_addr_t pasid_dir_base,
+                              uint32_t pasid,
+                              VTDPASIDDirEntry *pdire)
+{
+    uint32_t index;
+    dma_addr_t addr, entry_size;
+
+    index = VTD_PASID_DIR_INDEX(pasid);
+    entry_size = VTD_PASID_DIR_ENTRY_SIZE;
+    addr = pasid_dir_base + index * entry_size;
+    if (dma_memory_read(&address_space_memory, addr, pdire, entry_size)) {
+        return -VTD_FR_PASID_TABLE_INV;
+    }
+
+    return 0;
+}
+
+static int vtd_get_pasid_entry(IntelIOMMUState *s,
+                               uint32_t pasid,
+                               VTDPASIDDirEntry *pdire,
+                               VTDPASIDEntry *pe)
+{
+    uint32_t index;
+    dma_addr_t addr, entry_size;
+    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
+    index = VTD_PASID_TABLE_INDEX(pasid);
+    entry_size = VTD_PASID_ENTRY_SIZE;
+    addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK;
+    addr = addr + index * entry_size;
+    if (dma_memory_read(&address_space_memory, addr, pe, entry_size)) {
+        return -VTD_FR_PASID_TABLE_INV;
+    }
+
+    /* Do translation type check */
+    if (!vtd_pe_type_check(x86_iommu, pe)) {
+        return -VTD_FR_PASID_TABLE_INV;
+    }
+
+    if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) {
+        return -VTD_FR_PASID_TABLE_INV;
+    }
+
+    return 0;
+}
+
+static int vtd_get_pasid_entry_from_pasid(IntelIOMMUState *s,
+                                          dma_addr_t pasid_dir_base,
+                                          uint32_t pasid,
+                                          VTDPASIDEntry *pe)
+{
+    int ret;
+    VTDPASIDDirEntry pdire;
+
+    ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire);
+    if (ret) {
+        return ret;
+    }
+
+    ret = vtd_get_pasid_entry(s, pasid, &pdire, pe);
+    if (ret) {
+        return ret;
+    }
+
+    return ret;
+}
+
+static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s,
+                                      VTDContextEntry *ce,
+                                      VTDPASIDEntry *pe)
+{
+    uint32_t pasid;
+    dma_addr_t pasid_dir_base;
+    int ret = 0;
+
+    pasid = VTD_CE_GET_RID2PASID(ce);
+    pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
+    ret = vtd_get_pasid_entry_from_pasid(s, pasid_dir_base, pasid, pe);
+
+    return ret;
+}
+
+static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s,
+                                VTDContextEntry *ce,
+                                bool *pe_fpd_set)
+{
+    int ret;
+    uint32_t pasid;
+    dma_addr_t pasid_dir_base;
+    VTDPASIDDirEntry pdire;
+    VTDPASIDEntry pe;
+
+    pasid = VTD_CE_GET_RID2PASID(ce);
+    pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
+
+    ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire);
+    if (ret) {
+        return ret;
+    }
+
+    if (pdire.val & VTD_PASID_DIR_FPD) {
+        *pe_fpd_set = true;
+        return 0;
+    }
+
+    ret = vtd_get_pasid_entry(s, pasid, &pdire, &pe);
+    if (ret) {
+        return ret;
+    }
+
+    if (pe.val[0] & VTD_PASID_ENTRY_FPD) {
+        *pe_fpd_set = true;
+    }
+
+    return 0;
+}
+
 /* Get the page-table level that hardware should use for the second-level
  * page-table walk from the Address Width field of context-entry.
  */
@@ -608,17 +791,43 @@ static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce)
     return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW);
 }
 
+static uint32_t vtd_get_iova_level(IntelIOMMUState *s,
+                                   VTDContextEntry *ce)
+{
+    VTDPASIDEntry pe;
+
+    if (s->root_scalable) {
+        vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+        return VTD_PE_GET_LEVEL(&pe);
+    }
+
+    return vtd_ce_get_level(ce);
+}
+
 static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce)
 {
     return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
 }
 
+static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s,
+                                  VTDContextEntry *ce)
+{
+    VTDPASIDEntry pe;
+
+    if (s->root_scalable) {
+        vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+        return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9;
+    }
+
+    return vtd_ce_get_agaw(ce);
+}
+
 static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce)
 {
     return ce->lo & VTD_CONTEXT_ENTRY_TT;
 }
 
-/* Return true if check passed, otherwise false */
+/* Only for Legacy Mode. Return true if check passed, otherwise false */
 static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu,
                                      VTDContextEntry *ce)
 {
@@ -639,7 +848,7 @@ static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu,
         }
         break;
     default:
-        /* Unknwon type */
+        /* Unknown type */
         error_report_once("%s: unknown ce type: %"PRIu32, __func__,
                           vtd_ce_get_type(ce));
         return false;
@@ -647,21 +856,36 @@ static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu,
     return true;
 }
 
-static inline uint64_t vtd_iova_limit(VTDContextEntry *ce, uint8_t aw)
+static inline uint64_t vtd_iova_limit(IntelIOMMUState *s,
+                                      VTDContextEntry *ce, uint8_t aw)
 {
-    uint32_t ce_agaw = vtd_ce_get_agaw(ce);
+    uint32_t ce_agaw = vtd_get_iova_agaw(s, ce);
     return 1ULL << MIN(ce_agaw, aw);
 }
 
 /* Return true if IOVA passes range check, otherwise false. */
-static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce,
+static inline bool vtd_iova_range_check(IntelIOMMUState *s,
+                                        uint64_t iova, VTDContextEntry *ce,
                                         uint8_t aw)
 {
     /*
      * Check if @iova is above 2^X-1, where X is the minimum of MGAW
      * in CAP_REG and AW in context-entry.
      */
-    return !(iova & ~(vtd_iova_limit(ce, aw) - 1));
+    return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1));
+}
+
+static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
+                                          VTDContextEntry *ce)
+{
+    VTDPASIDEntry pe;
+
+    if (s->root_scalable) {
+        vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+        return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
+    }
+
+    return vtd_ce_get_slpt_base(ce);
 }
 
 /*
@@ -707,17 +931,18 @@ static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
 /* Given the @iova, get relevant @slptep. @slpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
  */
-static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
+static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce,
+                             uint64_t iova, bool is_write,
                              uint64_t *slptep, uint32_t *slpte_level,
                              bool *reads, bool *writes, uint8_t aw_bits)
 {
-    dma_addr_t addr = vtd_ce_get_slpt_base(ce);
-    uint32_t level = vtd_ce_get_level(ce);
+    dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
+    uint32_t level = vtd_get_iova_level(s, ce);
     uint32_t offset;
     uint64_t slpte;
     uint64_t access_right_check;
 
-    if (!vtd_iova_range_check(iova, ce, aw_bits)) {
+    if (!vtd_iova_range_check(s, iova, ce, aw_bits)) {
         error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")",
                           __func__, iova);
         return -VTD_FR_ADDR_BEYOND_MGAW;
@@ -733,7 +958,7 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
         if (slpte == (uint64_t)-1) {
             error_report_once("%s: detected read error on DMAR slpte "
                               "(iova=0x%" PRIx64 ")", __func__, iova);
-            if (level == vtd_ce_get_level(ce)) {
+            if (level == vtd_get_iova_level(s, ce)) {
                 /* Invalid programming of context-entry */
                 return -VTD_FR_CONTEXT_ENTRY_INV;
             } else {
@@ -962,29 +1187,96 @@ next:
 /**
  * vtd_page_walk - walk specific IOVA range, and call the hook
  *
+ * @s: intel iommu state
  * @ce: context entry to walk upon
  * @start: IOVA address to start the walk
  * @end: IOVA range end address (start <= addr < end)
  * @info: page walking information struct
  */
-static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
+static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce,
+                         uint64_t start, uint64_t end,
                          vtd_page_walk_info *info)
 {
-    dma_addr_t addr = vtd_ce_get_slpt_base(ce);
-    uint32_t level = vtd_ce_get_level(ce);
+    dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
+    uint32_t level = vtd_get_iova_level(s, ce);
 
-    if (!vtd_iova_range_check(start, ce, info->aw)) {
+    if (!vtd_iova_range_check(s, start, ce, info->aw)) {
         return -VTD_FR_ADDR_BEYOND_MGAW;
     }
 
-    if (!vtd_iova_range_check(end, ce, info->aw)) {
+    if (!vtd_iova_range_check(s, end, ce, info->aw)) {
         /* Fix end so that it reaches the maximum */
-        end = vtd_iova_limit(ce, info->aw);
+        end = vtd_iova_limit(s, ce, info->aw);
     }
 
     return vtd_page_walk_level(addr, start, end, level, true, true, info);
 }
 
+static int vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s,
+                                          VTDRootEntry *re)
+{
+    /* Legacy Mode reserved bits check */
+    if (!s->root_scalable &&
+        (re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits))))
+        goto rsvd_err;
+
+    /* Scalable Mode reserved bits check */
+    if (s->root_scalable &&
+        ((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) ||
+         (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits))))
+        goto rsvd_err;
+
+    return 0;
+
+rsvd_err:
+    error_report_once("%s: invalid root entry: hi=0x%"PRIx64
+                      ", lo=0x%"PRIx64,
+                      __func__, re->hi, re->lo);
+    return -VTD_FR_ROOT_ENTRY_RSVD;
+}
+
+static inline int vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s,
+                                                    VTDContextEntry *ce)
+{
+    if (!s->root_scalable &&
+        (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI ||
+         ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) {
+        error_report_once("%s: invalid context entry: hi=%"PRIx64
+                          ", lo=%"PRIx64" (reserved nonzero)",
+                          __func__, ce->hi, ce->lo);
+        return -VTD_FR_CONTEXT_ENTRY_RSVD;
+    }
+
+    if (s->root_scalable &&
+        (ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) ||
+         ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 ||
+         ce->val[2] ||
+         ce->val[3])) {
+        error_report_once("%s: invalid context entry: val[3]=%"PRIx64
+                          ", val[2]=%"PRIx64
+                          ", val[1]=%"PRIx64
+                          ", val[0]=%"PRIx64" (reserved nonzero)",
+                          __func__, ce->val[3], ce->val[2],
+                          ce->val[1], ce->val[0]);
+        return -VTD_FR_CONTEXT_ENTRY_RSVD;
+    }
+
+    return 0;
+}
+
+static int vtd_ce_rid2pasid_check(IntelIOMMUState *s,
+                                  VTDContextEntry *ce)
+{
+    VTDPASIDEntry pe;
+
+    /*
+     * Make sure in Scalable Mode, a present context entry
+     * has valid rid2pasid setting, which includes valid
+     * rid2pasid field and corresponding pasid entry setting
+     */
+    return vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+}
+
 /* Map a device to its corresponding domain (context-entry) */
 static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
                                     uint8_t devfn, VTDContextEntry *ce)
@@ -998,20 +1290,18 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
         return ret_fr;
     }
 
-    if (!vtd_root_entry_present(&re)) {
+    if (!vtd_root_entry_present(s, &re, devfn)) {
         /* Not error - it's okay we don't have root entry. */
         trace_vtd_re_not_present(bus_num);
         return -VTD_FR_ROOT_ENTRY_P;
     }
 
-    if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD(s->aw_bits))) {
-        error_report_once("%s: invalid root entry: rsvd=0x%"PRIx64
-                          ", val=0x%"PRIx64" (reserved nonzero)",
-                          __func__, re.rsvd, re.val);
-        return -VTD_FR_ROOT_ENTRY_RSVD;
+    ret_fr = vtd_root_entry_rsvd_bits_check(s, &re);
+    if (ret_fr) {
+        return ret_fr;
     }
 
-    ret_fr = vtd_get_context_entry_from_root(&re, devfn, ce);
+    ret_fr = vtd_get_context_entry_from_root(s, &re, devfn, ce);
     if (ret_fr) {
         return ret_fr;
     }
@@ -1022,26 +1312,38 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
         return -VTD_FR_CONTEXT_ENTRY_P;
     }
 
-    if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
-               (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) {
-        error_report_once("%s: invalid context entry: hi=%"PRIx64
-                          ", lo=%"PRIx64" (reserved nonzero)",
-                          __func__, ce->hi, ce->lo);
-        return -VTD_FR_CONTEXT_ENTRY_RSVD;
+    ret_fr = vtd_context_entry_rsvd_bits_check(s, ce);
+    if (ret_fr) {
+        return ret_fr;
     }
 
     /* Check if the programming of context-entry is valid */
-    if (!vtd_is_level_supported(s, vtd_ce_get_level(ce))) {
+    if (!s->root_scalable &&
+        !vtd_is_level_supported(s, vtd_ce_get_level(ce))) {
         error_report_once("%s: invalid context entry: hi=%"PRIx64
                           ", lo=%"PRIx64" (level %d not supported)",
-                          __func__, ce->hi, ce->lo, vtd_ce_get_level(ce));
+                          __func__, ce->hi, ce->lo,
+                          vtd_ce_get_level(ce));
         return -VTD_FR_CONTEXT_ENTRY_INV;
     }
 
-    /* Do translation type check */
-    if (!vtd_ce_type_check(x86_iommu, ce)) {
-        /* Errors dumped in vtd_ce_type_check() */
-        return -VTD_FR_CONTEXT_ENTRY_INV;
+    if (!s->root_scalable) {
+        /* Do translation type check */
+        if (!vtd_ce_type_check(x86_iommu, ce)) {
+            /* Errors dumped in vtd_ce_type_check() */
+            return -VTD_FR_CONTEXT_ENTRY_INV;
+        }
+    } else {
+        /*
+         * Check if the programming of context-entry.rid2pasid
+         * and corresponding pasid setting is valid, and thus
+         * avoids to check pasid entry fetching result in future
+         * helper function calling.
+         */
+        ret_fr = vtd_ce_rid2pasid_check(s, ce);
+        if (ret_fr) {
+            return ret_fr;
+        }
     }
 
     return 0;
@@ -1054,6 +1356,19 @@ static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry,
     return 0;
 }
 
+static uint16_t vtd_get_domain_id(IntelIOMMUState *s,
+                                  VTDContextEntry *ce)
+{
+    VTDPASIDEntry pe;
+
+    if (s->root_scalable) {
+        vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+        return VTD_SM_PASID_ENTRY_DID(pe.val[1]);
+    }
+
+    return VTD_CONTEXT_ENTRY_DID(ce->hi);
+}
+
 static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
                                             VTDContextEntry *ce,
                                             hwaddr addr, hwaddr size)
@@ -1065,10 +1380,10 @@ static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
         .notify_unmap = true,
         .aw = s->aw_bits,
         .as = vtd_as,
-        .domain_id = VTD_CONTEXT_ENTRY_DID(ce->hi),
+        .domain_id = vtd_get_domain_id(s, ce),
     };
 
-    return vtd_page_walk(ce, addr, addr + size, &info);
+    return vtd_page_walk(s, ce, addr, addr + size, &info);
 }
 
 static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
@@ -1103,35 +1418,24 @@ static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
 }
 
 /*
- * Fetch translation type for specific device. Returns <0 if error
- * happens, otherwise return the shifted type to check against
- * VTD_CONTEXT_TT_*.
+ * Check if specific device is configed to bypass address
+ * translation for DMA requests. In Scalable Mode, bypass
+ * 1st-level translation or 2nd-level translation, it depends
+ * on PGTT setting.
  */
-static int vtd_dev_get_trans_type(VTDAddressSpace *as)
+static bool vtd_dev_pt_enabled(VTDAddressSpace *as)
 {
     IntelIOMMUState *s;
     VTDContextEntry ce;
-    int ret;
-
-    s = as->iommu_state;
-
-    ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
-                                   as->devfn, &ce);
-    if (ret) {
-        return ret;
-    }
-
-    return vtd_ce_get_type(&ce);
-}
-
-static bool vtd_dev_pt_enabled(VTDAddressSpace *as)
-{
+    VTDPASIDEntry pe;
     int ret;
 
     assert(as);
 
-    ret = vtd_dev_get_trans_type(as);
-    if (ret < 0) {
+    s = as->iommu_state;
+    ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
+                                   as->devfn, &ce);
+    if (ret) {
         /*
          * Possibly failed to parse the context entry for some reason
          * (e.g., during init, or any guest configuration errors on
@@ -1141,7 +1445,17 @@ static bool vtd_dev_pt_enabled(VTDAddressSpace *as)
         return false;
     }
 
-    return ret == VTD_CONTEXT_TT_PASS_THROUGH;
+    if (s->root_scalable) {
+        ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe);
+        if (ret) {
+            error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32,
+                              __func__, ret);
+            return false;
+        }
+        return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT);
+    }
+
+    return (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH);
 }
 
 /* Return whether the device is using IOMMU translation. */
@@ -1221,6 +1535,7 @@ static const bool vtd_qualified_faults[] = {
     [VTD_FR_ROOT_ENTRY_RSVD] = false,
     [VTD_FR_PAGING_ENTRY_RSVD] = true,
     [VTD_FR_CONTEXT_ENTRY_TT] = true,
+    [VTD_FR_PASID_TABLE_INV] = false,
     [VTD_FR_RESERVED_ERR] = false,
     [VTD_FR_MAX] = false,
 };
@@ -1322,18 +1637,17 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
                                cc_entry->context_cache_gen);
         ce = cc_entry->context_entry;
         is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
+        if (!is_fpd_set && s->root_scalable) {
+            ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
+            VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
+        }
     } else {
         ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
         is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
-        if (ret_fr) {
-            ret_fr = -ret_fr;
-            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
-                trace_vtd_fault_disabled();
-            } else {
-                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
-            }
-            goto error;
+        if (!ret_fr && !is_fpd_set && s->root_scalable) {
+            ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
         }
+        VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
         /* Update context-cache */
         trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo,
                                   cc_entry->context_cache_gen,
@@ -1367,21 +1681,13 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
         return true;
     }
 
-    ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level,
+    ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level,
                                &reads, &writes, s->aw_bits);
-    if (ret_fr) {
-        ret_fr = -ret_fr;
-        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
-            trace_vtd_fault_disabled();
-        } else {
-            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
-        }
-        goto error;
-    }
+    VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
 
     page_mask = vtd_slpt_level_page_mask(level);
     access_flags = IOMMU_ACCESS_FLAG(reads, writes);
-    vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte,
+    vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte,
                      access_flags, level);
 out:
     vtd_iommu_unlock(s);
@@ -1573,7 +1879,7 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
         if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
                                       vtd_as->devfn, &ce) &&
-            domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
+            domain_id == vtd_get_domain_id(s, &ce)) {
             vtd_sync_shadow_page_table(vtd_as);
         }
     }
@@ -1591,7 +1897,7 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
     QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
         ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
                                        vtd_as->devfn, &ce);
-        if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
+        if (!ret && domain_id == vtd_get_domain_id(s, &ce)) {
             if (vtd_as_has_map_notifier(vtd_as)) {
                 /*
                  * As long as we have MAP notifications registered in
@@ -2629,6 +2935,7 @@ static const VMStateDescription vtd_vmstate = {
         VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE),
         VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState),
         VMSTATE_BOOL(root_extended, IntelIOMMUState),
+        VMSTATE_BOOL(root_scalable, IntelIOMMUState),
         VMSTATE_BOOL(dmar_enabled, IntelIOMMUState),
         VMSTATE_BOOL(qi_enabled, IntelIOMMUState),
         VMSTATE_BOOL(intr_enabled, IntelIOMMUState),
@@ -3098,9 +3405,11 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
     vtd_address_space_unmap(vtd_as, n);
 
     if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
-        trace_vtd_replay_ce_valid(bus_n, PCI_SLOT(vtd_as->devfn),
+        trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" :
+                                  "legacy mode",
+                                  bus_n, PCI_SLOT(vtd_as->devfn),
                                   PCI_FUNC(vtd_as->devfn),
-                                  VTD_CONTEXT_ENTRY_DID(ce.hi),
+                                  vtd_get_domain_id(s, &ce),
                                   ce.hi, ce.lo);
         if (vtd_as_has_map_notifier(vtd_as)) {
             /* This is required only for MAP typed notifiers */
@@ -3110,10 +3419,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
                 .notify_unmap = false,
                 .aw = s->aw_bits,
                 .as = vtd_as,
-                .domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi),
+                .domain_id = vtd_get_domain_id(s, &ce),
             };
 
-            vtd_page_walk(&ce, 0, ~0ULL, &info);
+            vtd_page_walk(s, &ce, 0, ~0ULL, &info);
         }
     } else {
         trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
@@ -3137,6 +3446,7 @@ static void vtd_init(IntelIOMMUState *s)
 
     s->root = 0;
     s->root_extended = false;
+    s->root_scalable = false;
     s->dmar_enabled = false;
     s->intr_enabled = false;
     s->iq_head = 0;
@@ -3199,7 +3509,7 @@ static void vtd_init(IntelIOMMUState *s)
     vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0);
     vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL);
     vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0);
-    vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffff000ULL, 0);
+    vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffffc00ULL, 0);
     vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0);
     vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL);
 
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 00e9edbc66..fe72bc38f0 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -172,6 +172,7 @@
 
 /* RTADDR_REG */
 #define VTD_RTADDR_RTT              (1ULL << 11)
+#define VTD_RTADDR_SMT              (1ULL << 10)
 #define VTD_RTADDR_ADDR_MASK(aw)    (VTD_HAW_MASK(aw) ^ 0xfffULL)
 
 /* IRTA_REG */
@@ -294,6 +295,8 @@ typedef enum VTDFaultReason {
                                   * request while disabled */
     VTD_FR_IR_SID_ERR = 0x26,   /* Invalid Source-ID */
 
+    VTD_FR_PASID_TABLE_INV = 0x58,  /*Invalid PASID table entry */
+
     /* This is not a normal fault reason. We use this to indicate some faults
      * that are not referenced by the VT-d specification.
      * Fault event with such reason should not be recorded.
@@ -411,8 +414,8 @@ typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
 #define VTD_PAGE_MASK_1G            (~((1ULL << VTD_PAGE_SHIFT_1G) - 1))
 
 struct VTDRootEntry {
-    uint64_t val;
-    uint64_t rsvd;
+    uint64_t lo;
+    uint64_t hi;
 };
 typedef struct VTDRootEntry VTDRootEntry;
 
@@ -423,6 +426,8 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_ROOT_ENTRY_NR           (VTD_PAGE_SIZE / sizeof(VTDRootEntry))
 #define VTD_ROOT_ENTRY_RSVD(aw)     (0xffeULL | ~VTD_HAW_MASK(aw))
 
+#define VTD_DEVFN_CHECK_MASK        0x80
+
 /* Masks for struct VTDContextEntry */
 /* lo */
 #define VTD_CONTEXT_ENTRY_P         (1ULL << 0)
@@ -441,6 +446,38 @@ typedef struct VTDRootEntry VTDRootEntry;
 
 #define VTD_CONTEXT_ENTRY_NR        (VTD_PAGE_SIZE / sizeof(VTDContextEntry))
 
+#define VTD_CTX_ENTRY_LEGACY_SIZE     16
+#define VTD_CTX_ENTRY_SCALABLE_SIZE   32
+
+#define VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK 0xfffff
+#define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw)  (0x1e0ULL | ~VTD_HAW_MASK(aw))
+#define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1      0xffffffffffe00000ULL
+
+/* PASID Table Related Definitions */
+#define VTD_PASID_DIR_BASE_ADDR_MASK  (~0xfffULL)
+#define VTD_PASID_TABLE_BASE_ADDR_MASK (~0xfffULL)
+#define VTD_PASID_DIR_ENTRY_SIZE      8
+#define VTD_PASID_ENTRY_SIZE          64
+#define VTD_PASID_DIR_BITS_MASK       (0x3fffULL)
+#define VTD_PASID_DIR_INDEX(pasid)    (((pasid) >> 6) & VTD_PASID_DIR_BITS_MASK)
+#define VTD_PASID_DIR_FPD             (1ULL << 1) /* Fault Processing Disable */
+#define VTD_PASID_TABLE_BITS_MASK     (0x3fULL)
+#define VTD_PASID_TABLE_INDEX(pasid)  ((pasid) & VTD_PASID_TABLE_BITS_MASK)
+#define VTD_PASID_ENTRY_FPD           (1ULL << 1) /* Fault Processing Disable */
+
+/* PASID Granular Translation Type Mask */
+#define VTD_SM_PASID_ENTRY_PGTT        (7ULL << 6)
+#define VTD_SM_PASID_ENTRY_FLT         (1ULL << 6)
+#define VTD_SM_PASID_ENTRY_SLT         (2ULL << 6)
+#define VTD_SM_PASID_ENTRY_NESTED      (3ULL << 6)
+#define VTD_SM_PASID_ENTRY_PT          (4ULL << 6)
+
+#define VTD_SM_PASID_ENTRY_AW          7ULL /* Adjusted guest-address-width */
+#define VTD_SM_PASID_ENTRY_DID(val)    ((val) & VTD_DOMAIN_ID_MASK)
+
+/* Second Level Page Translation Pointer*/
+#define VTD_SM_PASID_ENTRY_SLPTPTR     (~0xfffULL)
+
 /* Paging Structure common */
 #define VTD_SL_PT_PAGE_SIZE_MASK    (1ULL << 7)
 /* Bits to decide the offset for each level */
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 77244fc384..cae1b76fde 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -30,7 +30,7 @@ vtd_iotlb_cc_hit(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32
 vtd_iotlb_cc_update(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen1, uint32_t gen2) "IOTLB context update bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32
 vtd_iotlb_reset(const char *reason) "IOTLB reset (reason: %s)"
 vtd_fault_disabled(void) "Fault processing disabled for context entry"
-vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64
+vtd_replay_ce_valid(const char *mode, uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "%s: replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64
 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
 vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index a321cc9691..72c5ca6a59 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -66,11 +66,20 @@ typedef struct VTDIOTLBEntry VTDIOTLBEntry;
 typedef struct VTDBus VTDBus;
 typedef union VTD_IR_TableEntry VTD_IR_TableEntry;
 typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress;
+typedef struct VTDPASIDDirEntry VTDPASIDDirEntry;
+typedef struct VTDPASIDEntry VTDPASIDEntry;
 
 /* Context-Entry */
 struct VTDContextEntry {
-    uint64_t lo;
-    uint64_t hi;
+    union {
+        struct {
+            uint64_t lo;
+            uint64_t hi;
+        };
+        struct {
+            uint64_t val[4];
+        };
+    };
 };
 
 struct VTDContextCacheEntry {
@@ -81,6 +90,16 @@ struct VTDContextCacheEntry {
     struct VTDContextEntry context_entry;
 };
 
+/* PASID Directory Entry */
+struct VTDPASIDDirEntry {
+    uint64_t val;
+};
+
+/* PASID Table Entry */
+struct VTDPASIDEntry {
+    uint64_t val[8];
+};
+
 struct VTDAddressSpace {
     PCIBus *bus;
     uint8_t devfn;
@@ -212,6 +231,7 @@ struct IntelIOMMUState {
 
     dma_addr_t root;                /* Current root table pointer */
     bool root_extended;             /* Type of root table (extended or not) */
+    bool root_scalable;             /* Type of root table (scalable or not) */
     bool dmar_enabled;              /* Set if DMA remapping is enabled */
 
     uint16_t iq_head;               /* Current invalidation queue head */

From c0c1d351849bc249304e0c505e1597e570d1e777 Mon Sep 17 00:00:00 2001
From: "Liu, Yi L" <yi.l.liu@intel.com>
Date: Tue, 5 Mar 2019 10:34:54 +0800
Subject: [PATCH 14/26] intel_iommu: add 256 bits qi_desc support

Per Intel(R) VT-d 3.0, the qi_desc is 256 bits in Scalable
Mode. This patch adds emulation of 256bits qi_desc.

Signed-off-by: Liu, Yi L <yi.l.liu@intel.com>
[Yi Sun is co-developer to rebase and refine the patch.]
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Message-Id: <1551753295-30167-3-git-send-email-yi.y.sun@linux.intel.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/i386/intel_iommu.c          | 40 ++++++++++++++++++++++++----------
 hw/i386/intel_iommu_internal.h |  9 +++++++-
 include/hw/i386/intel_iommu.h  |  1 +
 3 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index ea9ea93e42..06b48b3172 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2005,7 +2005,7 @@ static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
     if (en) {
         s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits);
         /* 2^(x+8) entries */
-        s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8);
+        s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0));
         s->qi_enabled = true;
         trace_vtd_inv_qi_setup(s->iq, s->iq_size);
         /* Ok - report back to driver */
@@ -2172,19 +2172,24 @@ static void vtd_handle_iotlb_write(IntelIOMMUState *s)
 }
 
 /* Fetch an Invalidation Descriptor from the Invalidation Queue */
-static bool vtd_get_inv_desc(dma_addr_t base_addr, uint32_t offset,
+static bool vtd_get_inv_desc(IntelIOMMUState *s,
                              VTDInvDesc *inv_desc)
 {
-    dma_addr_t addr = base_addr + offset * sizeof(*inv_desc);
-    if (dma_memory_read(&address_space_memory, addr, inv_desc,
-        sizeof(*inv_desc))) {
-        error_report_once("Read INV DESC failed");
-        inv_desc->lo = 0;
-        inv_desc->hi = 0;
+    dma_addr_t base_addr = s->iq;
+    uint32_t offset = s->iq_head;
+    uint32_t dw = s->iq_dw ? 32 : 16;
+    dma_addr_t addr = base_addr + offset * dw;
+
+    if (dma_memory_read(&address_space_memory, addr, inv_desc, dw)) {
+        error_report_once("Read INV DESC failed.");
         return false;
     }
     inv_desc->lo = le64_to_cpu(inv_desc->lo);
     inv_desc->hi = le64_to_cpu(inv_desc->hi);
+    if (dw == 32) {
+        inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]);
+        inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]);
+    }
     return true;
 }
 
@@ -2390,10 +2395,11 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
     uint8_t desc_type;
 
     trace_vtd_inv_qi_head(s->iq_head);
-    if (!vtd_get_inv_desc(s->iq, s->iq_head, &inv_desc)) {
+    if (!vtd_get_inv_desc(s, &inv_desc)) {
         s->iq_last_desc_type = VTD_INV_DESC_NONE;
         return false;
     }
+
     desc_type = inv_desc.lo & VTD_INV_DESC_TYPE;
     /* FIXME: should update at first or at last? */
     s->iq_last_desc_type = desc_type;
@@ -2478,7 +2484,12 @@ static void vtd_handle_iqt_write(IntelIOMMUState *s)
 {
     uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG);
 
-    s->iq_tail = VTD_IQT_QT(val);
+    if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) {
+        error_report_once("%s: RSV bit is set: val=0x%"PRIx64,
+                          __func__, val);
+        return;
+    }
+    s->iq_tail = VTD_IQT_QT(s->iq_dw, val);
     trace_vtd_inv_qi_tail(s->iq_tail);
 
     if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
@@ -2747,6 +2758,12 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
         } else {
             vtd_set_quad(s, addr, val);
         }
+        if (s->ecap & VTD_ECAP_SMTS &&
+            val & VTD_IQA_DW_MASK) {
+            s->iq_dw = true;
+        } else {
+            s->iq_dw = false;
+        }
         break;
 
     case DMAR_IQA_REG_HI:
@@ -3455,6 +3472,7 @@ static void vtd_init(IntelIOMMUState *s)
     s->iq_size = 0;
     s->qi_enabled = false;
     s->iq_last_desc_type = VTD_INV_DESC_NONE;
+    s->iq_dw = false;
     s->next_frcd_reg = 0;
     s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
              VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
@@ -3532,7 +3550,7 @@ static void vtd_init(IntelIOMMUState *s)
 
     vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0);
     vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0);
-    vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff007ULL, 0);
+    vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff807ULL, 0);
     vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL);
     vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0);
     vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0);
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index fe72bc38f0..016fa4c7c5 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -190,6 +190,7 @@
 #define VTD_ECAP_EIM                (1ULL << 4)
 #define VTD_ECAP_PT                 (1ULL << 6)
 #define VTD_ECAP_MHMV               (15ULL << 20)
+#define VTD_ECAP_SMTS               (1ULL << 43)
 
 /* CAP_REG */
 /* (offset >> 4) << 24 */
@@ -218,11 +219,14 @@
 #define VTD_CAP_SAGAW_48bit         (0x4ULL << VTD_CAP_SAGAW_SHIFT)
 
 /* IQT_REG */
-#define VTD_IQT_QT(val)             (((val) >> 4) & 0x7fffULL)
+#define VTD_IQT_QT(dw_bit, val)     (dw_bit ? (((val) >> 5) & 0x3fffULL) : \
+                                     (((val) >> 4) & 0x7fffULL))
+#define VTD_IQT_QT_256_RSV_BIT      0x10
 
 /* IQA_REG */
 #define VTD_IQA_IQA_MASK(aw)        (VTD_HAW_MASK(aw) ^ 0xfffULL)
 #define VTD_IQA_QS                  0x7ULL
+#define VTD_IQA_DW_MASK             0x800
 
 /* IQH_REG */
 #define VTD_IQH_QH_SHIFT            4
@@ -324,6 +328,9 @@ union VTDInvDesc {
         uint64_t lo;
         uint64_t hi;
     };
+    struct {
+        uint64_t val[4];
+    };
     union {
         VTDInvDescIEC iec;
     };
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 72c5ca6a59..2877c94ca2 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -238,6 +238,7 @@ struct IntelIOMMUState {
     uint16_t iq_tail;               /* Current invalidation queue tail */
     dma_addr_t iq;                  /* Current invalidation queue pointer */
     uint16_t iq_size;               /* IQ Size in number of entries */
+    bool iq_dw;                     /* IQ descriptor width 256bit or not */
     bool qi_enabled;                /* Set if the QI is enabled */
     uint8_t iq_last_desc_type;      /* The type of last completed descriptor */
 

From 4a4f219e8a109b0859b23b1f9c8ad8bfbcb59ee1 Mon Sep 17 00:00:00 2001
From: Yi Sun <yi.y.sun@linux.intel.com>
Date: Tue, 5 Mar 2019 10:34:55 +0800
Subject: [PATCH 15/26] intel_iommu: add scalable-mode option to make scalable
 mode work

This patch adds an option to provide flexibility for user to expose
Scalable Mode to guest. User could expose Scalable Mode to guest by
the config as below:

"-device intel-iommu,caching-mode=on,scalable-mode=on"

The Linux iommu driver has supported scalable mode. Please refer below
patch set:

    https://www.spinics.net/lists/kernel/msg2985279.html

Signed-off-by: Liu, Yi L <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Message-Id: <1551753295-30167-4-git-send-email-yi.y.sun@linux.intel.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/i386/intel_iommu.c          | 25 +++++++++++++++++++++++++
 hw/i386/intel_iommu_internal.h |  4 ++++
 include/hw/i386/intel_iommu.h  |  3 ++-
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 06b48b3172..b90de6c664 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1710,6 +1710,9 @@ static void vtd_root_table_setup(IntelIOMMUState *s)
 {
     s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
     s->root_extended = s->root & VTD_RTADDR_RTT;
+    if (s->scalable_mode) {
+        s->root_scalable = s->root & VTD_RTADDR_SMT;
+    }
     s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);
 
     trace_vtd_reg_dmar_root(s->root, s->root_extended);
@@ -2419,6 +2422,17 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
         }
         break;
 
+    /*
+     * TODO: the entity of below two cases will be implemented in future series.
+     * To make guest (which integrates scalable mode support patch set in
+     * iommu driver) work, just return true is enough so far.
+     */
+    case VTD_INV_DESC_PC:
+        break;
+
+    case VTD_INV_DESC_PIOTLB:
+        break;
+
     case VTD_INV_DESC_WAIT:
         trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo);
         if (!vtd_process_wait_desc(s, &inv_desc)) {
@@ -2983,6 +2997,7 @@ static Property vtd_properties[] = {
     DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
                       VTD_HOST_ADDRESS_WIDTH),
     DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
+    DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
     DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
     DEFINE_PROP_END_OF_LIST(),
 };
@@ -3518,6 +3533,11 @@ static void vtd_init(IntelIOMMUState *s)
         s->cap |= VTD_CAP_CM;
     }
 
+    /* TODO: read cap/ecap from host to decide which cap to be exposed. */
+    if (s->scalable_mode) {
+        s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
+    }
+
     vtd_reset_caches(s);
 
     /* Define registers with default values and bit semantics */
@@ -3629,6 +3649,11 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
         return false;
     }
 
+    if (s->scalable_mode && !s->dma_drain) {
+        error_setg(errp, "Need to set dma_drain for scalable mode");
+        return false;
+    }
+
     return true;
 }
 
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 016fa4c7c5..1160618177 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -190,7 +190,9 @@
 #define VTD_ECAP_EIM                (1ULL << 4)
 #define VTD_ECAP_PT                 (1ULL << 6)
 #define VTD_ECAP_MHMV               (15ULL << 20)
+#define VTD_ECAP_SRS                (1ULL << 31)
 #define VTD_ECAP_SMTS               (1ULL << 43)
+#define VTD_ECAP_SLTS               (1ULL << 46)
 
 /* CAP_REG */
 /* (offset >> 4) << 24 */
@@ -345,6 +347,8 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_IEC                0x4 /* Interrupt Entry Cache
                                                Invalidate Descriptor */
 #define VTD_INV_DESC_WAIT               0x5 /* Invalidation Wait Descriptor */
+#define VTD_INV_DESC_PIOTLB             0x6 /* PASID-IOTLB Invalidate Desc */
+#define VTD_INV_DESC_PC                 0x7 /* PASID-cache Invalidate Desc */
 #define VTD_INV_DESC_NONE               0   /* Not an Invalidate Descriptor */
 
 /* Masks for Invalidation Wait Descriptor*/
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 2877c94ca2..c11e3d5b34 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -227,7 +227,8 @@ struct IntelIOMMUState {
     uint8_t womask[DMAR_REG_SIZE];  /* WO (write only - read returns 0) */
     uint32_t version;
 
-    bool caching_mode;          /* RO - is cap CM enabled? */
+    bool caching_mode;              /* RO - is cap CM enabled? */
+    bool scalable_mode;             /* RO - is Scalable Mode supported? */
 
     dma_addr_t root;                /* Current root table pointer */
     bool root_extended;             /* Type of root table (extended or not) */

From b096c11458e58ddcebf8e6ae239a3c608de1ef78 Mon Sep 17 00:00:00 2001
From: Wei Yang <richardw.yang@linux.intel.com>
Date: Wed, 27 Feb 2019 15:50:58 +0800
Subject: [PATCH 16/26] nvdimm: fix typo in nvdimm_build_nvdimm_devices
 argument

>From dsm_dma_arrea to dsm_dma_area.

Signed-off-by: Wei Yang <richardw.yang@linux.intel.com>
Message-Id: <20190227075101.6263-2-richardw.yang@linux.intel.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
---
 hw/acpi/nvdimm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index f73cfb9d90..280bfb9d78 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -1260,7 +1260,7 @@ static void nvdimm_build_nvdimm_devices(Aml *root_dev, uint32_t ram_slots)
 }
 
 static void nvdimm_build_ssdt(GArray *table_offsets, GArray *table_data,
-                              BIOSLinker *linker, GArray *dsm_dma_arrea,
+                              BIOSLinker *linker, GArray *dsm_dma_area,
                               uint32_t ram_slots)
 {
     Aml *ssdt, *sb_scope, *dev;
@@ -1307,7 +1307,7 @@ static void nvdimm_build_ssdt(GArray *table_offsets, GArray *table_data,
                                                NVDIMM_ACPI_MEM_ADDR);
 
     bios_linker_loader_alloc(linker,
-                             NVDIMM_DSM_MEM_FILE, dsm_dma_arrea,
+                             NVDIMM_DSM_MEM_FILE, dsm_dma_area,
                              sizeof(NvdimmDsmIn), false /* high memory */);
     bios_linker_loader_add_pointer(linker,
         ACPI_BUILD_TABLE_FILE, mem_addr_offset, sizeof(uint32_t),

From ac265cacdd117ae965515df41bb4452478d22179 Mon Sep 17 00:00:00 2001
From: Wei Yang <richardw.yang@linux.intel.com>
Date: Wed, 27 Feb 2019 15:50:59 +0800
Subject: [PATCH 17/26] nvdimm: use *function* directly instead of allocating
 it again

At the beginning or nvdimm_build_common_dsm(), variable *function* is
already allocated for Arg2.

This patch reuse variable *function* instead of allocating it again.

Signed-off-by: Wei Yang <richardw.yang@linux.intel.com>
Message-Id: <20190227075101.6263-3-richardw.yang@linux.intel.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
---
 hw/acpi/nvdimm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 280bfb9d78..3e73ed456f 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -1086,7 +1086,7 @@ static void nvdimm_build_common_dsm(Aml *dev)
      */
     aml_append(method, aml_store(handle, aml_name(NVDIMM_DSM_HANDLE)));
     aml_append(method, aml_store(aml_arg(1), aml_name(NVDIMM_DSM_REVISION)));
-    aml_append(method, aml_store(aml_arg(2), aml_name(NVDIMM_DSM_FUNCTION)));
+    aml_append(method, aml_store(function, aml_name(NVDIMM_DSM_FUNCTION)));
 
     /*
      * The fourth parameter (Arg3) of _DSM is a package which contains

From 1b8fff57584d04a94cc77b8d3f535d35f5827a5e Mon Sep 17 00:00:00 2001
From: Wei Yang <richardw.yang@linux.intel.com>
Date: Wed, 27 Feb 2019 15:51:00 +0800
Subject: [PATCH 18/26] nvdimm: use NVDIMM_ACPI_IO_LEN for the proper IO size

The IO range is defined to 4 bytes with NVDIMM_ACPI_IO_LEN, so it is
more proper to use this macro instead of calculating it by sizeof.

Signed-off-by: Wei Yang <richardw.yang@linux.intel.com>
Message-Id: <20190227075101.6263-4-richardw.yang@linux.intel.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
---
 hw/acpi/nvdimm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 3e73ed456f..9fdad6dc3f 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -992,7 +992,7 @@ static void nvdimm_build_common_dsm(Aml *dev)
     field = aml_field(NVDIMM_DSM_IOPORT, AML_DWORD_ACC, AML_NOLOCK,
                       AML_PRESERVE);
     aml_append(field, aml_named_field(NVDIMM_DSM_NOTIFY,
-               sizeof(uint32_t) * BITS_PER_BYTE));
+               NVDIMM_ACPI_IO_LEN * BITS_PER_BYTE));
     aml_append(method, field);
 
     /*

From 5ad204bf2a06bf29234df9e6f1ef6c566771b40c Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@baidu.com>
Date: Thu, 28 Feb 2019 16:53:49 +0800
Subject: [PATCH 19/26] vhost-user: Support transferring inflight buffer
 between qemu and backend

This patch introduces two new messages VHOST_USER_GET_INFLIGHT_FD
and VHOST_USER_SET_INFLIGHT_FD to support transferring a shared
buffer between qemu and backend.

Firstly, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the
shared buffer from backend. Then qemu should send it back
through VHOST_USER_SET_INFLIGHT_FD each time we start vhost-user.

This shared buffer is used to track inflight I/O by backend.
Qemu should retrieve a new one when vm reset.

Signed-off-by: Xie Yongji <xieyongji@baidu.com>
Signed-off-by: Chai Wen <chaiwen@baidu.com>
Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
Message-Id: <20190228085355.9614-2-xieyongji@baidu.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 docs/interop/vhost-user.txt       | 285 ++++++++++++++++++++++++++++++
 hw/virtio/vhost-user.c            | 107 +++++++++++
 hw/virtio/vhost.c                 |  96 ++++++++++
 include/hw/virtio/vhost-backend.h |  10 ++
 include/hw/virtio/vhost.h         |  18 ++
 5 files changed, 516 insertions(+)

diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
index 9ee2a60cfb..4dbd530cb9 100644
--- a/docs/interop/vhost-user.txt
+++ b/docs/interop/vhost-user.txt
@@ -147,6 +147,17 @@ Depending on the request type, payload can be:
    Offset: a 64-bit offset of this area from the start of the
        supplied file descriptor
 
+ * Inflight description
+   -----------------------------------------------------
+   | mmap size | mmap offset | num queues | queue size |
+   -----------------------------------------------------
+
+   mmap size: a 64-bit size of area to track inflight I/O
+   mmap offset: a 64-bit offset of this area from the start
+                of the supplied file descriptor
+   num queues: a 16-bit number of virtqueues
+   queue size: a 16-bit size of virtqueues
+
 In QEMU the vhost-user message is implemented with the following struct:
 
 typedef struct VhostUserMsg {
@@ -162,6 +173,7 @@ typedef struct VhostUserMsg {
         struct vhost_iotlb_msg iotlb;
         VhostUserConfig config;
         VhostUserVringArea area;
+        VhostUserInflight inflight;
     };
 } QEMU_PACKED VhostUserMsg;
 
@@ -180,6 +192,7 @@ the ones that do:
  * VHOST_USER_GET_PROTOCOL_FEATURES
  * VHOST_USER_GET_VRING_BASE
  * VHOST_USER_SET_LOG_BASE (if VHOST_USER_PROTOCOL_F_LOG_SHMFD)
+ * VHOST_USER_GET_INFLIGHT_FD (if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)
 
 [ Also see the section on REPLY_ACK protocol extension. ]
 
@@ -193,6 +206,7 @@ in the ancillary data:
  * VHOST_USER_SET_VRING_CALL
  * VHOST_USER_SET_VRING_ERR
  * VHOST_USER_SET_SLAVE_REQ_FD
+ * VHOST_USER_SET_INFLIGHT_FD (if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)
 
 If Master is unable to send the full message or receives a wrong reply it will
 close the connection. An optional reconnection mechanism can be implemented.
@@ -387,6 +401,256 @@ If VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD protocol feature is negotiated,
 slave can send file descriptors (at most 8 descriptors in each message)
 to master via ancillary data using this fd communication channel.
 
+Inflight I/O tracking
+---------------------
+
+To support reconnecting after restart or crash, slave may need to resubmit
+inflight I/Os. If virtqueue is processed in order, we can easily achieve
+that by getting the inflight descriptors from descriptor table (split virtqueue)
+or descriptor ring (packed virtqueue). However, it can't work when we process
+descriptors out-of-order because some entries which store the information of
+inflight descriptors in available ring (split virtqueue) or descriptor
+ring (packed virtqueue) might be overrided by new entries. To solve this
+problem, slave need to allocate an extra buffer to store this information of inflight
+descriptors and share it with master for persistent. VHOST_USER_GET_INFLIGHT_FD and
+VHOST_USER_SET_INFLIGHT_FD are used to transfer this buffer between master
+and slave. And the format of this buffer is described below:
+
+-------------------------------------------------------
+| queue0 region | queue1 region | ... | queueN region |
+-------------------------------------------------------
+
+N is the number of available virtqueues. Slave could get it from num queues
+field of VhostUserInflight.
+
+For split virtqueue, queue region can be implemented as:
+
+typedef struct DescStateSplit {
+    /* Indicate whether this descriptor is inflight or not.
+     * Only available for head-descriptor. */
+    uint8_t inflight;
+
+    /* Padding */
+    uint8_t padding[5];
+
+    /* Maintain a list for the last batch of used descriptors.
+     * Only available when batching is used for submitting */
+    uint16_t next;
+
+    /* Used to preserve the order of fetching available descriptors.
+     * Only available for head-descriptor. */
+    uint64_t counter;
+} DescStateSplit;
+
+typedef struct QueueRegionSplit {
+    /* The feature flags of this region. Now it's initialized to 0. */
+    uint64_t features;
+
+    /* The version of this region. It's 1 currently.
+     * Zero value indicates an uninitialized buffer */
+    uint16_t version;
+
+    /* The size of DescStateSplit array. It's equal to the virtqueue
+     * size. Slave could get it from queue size field of VhostUserInflight. */
+    uint16_t desc_num;
+
+    /* The head of list that track the last batch of used descriptors. */
+    uint16_t last_batch_head;
+
+    /* Store the idx value of used ring */
+    uint16_t used_idx;
+
+    /* Used to track the state of each descriptor in descriptor table */
+    DescStateSplit desc[0];
+} QueueRegionSplit;
+
+To track inflight I/O, the queue region should be processed as follows:
+
+When receiving available buffers from the driver:
+
+    1. Get the next available head-descriptor index from available ring, i
+
+    2. Set desc[i].counter to the value of global counter
+
+    3. Increase global counter by 1
+
+    4. Set desc[i].inflight to 1
+
+When supplying used buffers to the driver:
+
+    1. Get corresponding used head-descriptor index, i
+
+    2. Set desc[i].next to last_batch_head
+
+    3. Set last_batch_head to i
+
+    4. Steps 1,2,3 may be performed repeatedly if batching is possible
+
+    5. Increase the idx value of used ring by the size of the batch
+
+    6. Set the inflight field of each DescStateSplit entry in the batch to 0
+
+    7. Set used_idx to the idx value of used ring
+
+When reconnecting:
+
+    1. If the value of used_idx does not match the idx value of used ring (means
+    the inflight field of DescStateSplit entries in last batch may be incorrect),
+
+        (a) Subtract the value of used_idx from the idx value of used ring to get
+        last batch size of DescStateSplit entries
+
+        (b) Set the inflight field of each DescStateSplit entry to 0 in last batch
+        list which starts from last_batch_head
+
+        (c) Set used_idx to the idx value of used ring
+
+    2. Resubmit inflight DescStateSplit entries in order of their counter value
+
+For packed virtqueue, queue region can be implemented as:
+
+typedef struct DescStatePacked {
+    /* Indicate whether this descriptor is inflight or not.
+     * Only available for head-descriptor. */
+    uint8_t inflight;
+
+    /* Padding */
+    uint8_t padding;
+
+    /* Link to the next free entry */
+    uint16_t next;
+
+    /* Link to the last entry of descriptor list.
+     * Only available for head-descriptor. */
+    uint16_t last;
+
+    /* The length of descriptor list.
+     * Only available for head-descriptor. */
+    uint16_t num;
+
+    /* Used to preserve the order of fetching available descriptors.
+     * Only available for head-descriptor. */
+    uint64_t counter;
+
+    /* The buffer id */
+    uint16_t id;
+
+    /* The descriptor flags */
+    uint16_t flags;
+
+    /* The buffer length */
+    uint32_t len;
+
+    /* The buffer address */
+    uint64_t addr;
+} DescStatePacked;
+
+typedef struct QueueRegionPacked {
+    /* The feature flags of this region. Now it's initialized to 0. */
+    uint64_t features;
+
+    /* The version of this region. It's 1 currently.
+     * Zero value indicates an uninitialized buffer */
+    uint16_t version;
+
+    /* The size of DescStatePacked array. It's equal to the virtqueue
+     * size. Slave could get it from queue size field of VhostUserInflight. */
+    uint16_t desc_num;
+
+    /* The head of free DescStatePacked entry list */
+    uint16_t free_head;
+
+    /* The old head of free DescStatePacked entry list */
+    uint16_t old_free_head;
+
+    /* The used index of descriptor ring */
+    uint16_t used_idx;
+
+    /* The old used index of descriptor ring */
+    uint16_t old_used_idx;
+
+    /* Device ring wrap counter */
+    uint8_t used_wrap_counter;
+
+    /* The old device ring wrap counter */
+    uint8_t old_used_wrap_counter;
+
+    /* Padding */
+    uint8_t padding[7];
+
+    /* Used to track the state of each descriptor fetched from descriptor ring */
+    DescStatePacked desc[0];
+} QueueRegionPacked;
+
+To track inflight I/O, the queue region should be processed as follows:
+
+When receiving available buffers from the driver:
+
+    1. Get the next available descriptor entry from descriptor ring, d
+
+    2. If d is head descriptor,
+
+        (a) Set desc[old_free_head].num to 0
+
+        (b) Set desc[old_free_head].counter to the value of global counter
+
+        (c) Increase global counter by 1
+
+        (d) Set desc[old_free_head].inflight to 1
+
+    3. If d is last descriptor, set desc[old_free_head].last to free_head
+
+    4. Increase desc[old_free_head].num by 1
+
+    5. Set desc[free_head].addr, desc[free_head].len, desc[free_head].flags,
+    desc[free_head].id to d.addr, d.len, d.flags, d.id
+
+    6. Set free_head to desc[free_head].next
+
+    7. If d is last descriptor, set old_free_head to free_head
+
+When supplying used buffers to the driver:
+
+    1. Get corresponding used head-descriptor entry from descriptor ring, d
+
+    2. Get corresponding DescStatePacked entry, e
+
+    3. Set desc[e.last].next to free_head
+
+    4. Set free_head to the index of e
+
+    5. Steps 1,2,3,4 may be performed repeatedly if batching is possible
+
+    6. Increase used_idx by the size of the batch and update used_wrap_counter if needed
+
+    7. Update d.flags
+
+    8. Set the inflight field of each head DescStatePacked entry in the batch to 0
+
+    9. Set old_free_head, old_used_idx, old_used_wrap_counter to free_head, used_idx,
+    used_wrap_counter
+
+When reconnecting:
+
+    1. If used_idx does not match old_used_idx (means the inflight field of DescStatePacked
+    entries in last batch may be incorrect),
+
+        (a) Get the next descriptor ring entry through old_used_idx, d
+
+        (b) Use old_used_wrap_counter to calculate the available flags
+
+        (c) If d.flags is not equal to the calculated flags value (means slave has
+        submitted the buffer to guest driver before crash, so it has to commit the
+        in-progres update), set old_free_head, old_used_idx, old_used_wrap_counter
+        to free_head, used_idx, used_wrap_counter
+
+    2. Set free_head, used_idx, used_wrap_counter to old_free_head, old_used_idx,
+    old_used_wrap_counter (roll back any in-progress update)
+
+    3. Set the inflight field of each DescStatePacked entry in free list to 0
+
+    4. Resubmit inflight DescStatePacked entries in order of their counter value
+
 Protocol features
 -----------------
 
@@ -402,6 +666,7 @@ Protocol features
 #define VHOST_USER_PROTOCOL_F_CONFIG         9
 #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD  10
 #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  11
+#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
 
 Master message types
 --------------------
@@ -766,6 +1031,26 @@ Master message types
       was previously sent.
       The value returned is an error indication; 0 is success.
 
+ * VHOST_USER_GET_INFLIGHT_FD
+      Id: 31
+      Equivalent ioctl: N/A
+      Master payload: inflight description
+
+      When VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD protocol feature has been
+      successfully negotiated, this message is submitted by master to get
+      a shared buffer from slave. The shared buffer will be used to track
+      inflight I/O by slave. QEMU should retrieve a new one when vm reset.
+
+ * VHOST_USER_SET_INFLIGHT_FD
+      Id: 32
+      Equivalent ioctl: N/A
+      Master payload: inflight description
+
+      When VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD protocol feature has been
+      successfully negotiated, this message is submitted by master to send
+      the shared inflight buffer back to slave so that slave could get
+      inflight I/O after a crash or restart.
+
 Slave message types
 -------------------
 
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 5df73405bc..553319c7ac 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -56,6 +56,7 @@ enum VhostUserProtocolFeature {
     VHOST_USER_PROTOCOL_F_CONFIG = 9,
     VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
     VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
+    VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
     VHOST_USER_PROTOCOL_F_MAX
 };
 
@@ -93,6 +94,8 @@ typedef enum VhostUserRequest {
     VHOST_USER_POSTCOPY_ADVISE  = 28,
     VHOST_USER_POSTCOPY_LISTEN  = 29,
     VHOST_USER_POSTCOPY_END     = 30,
+    VHOST_USER_GET_INFLIGHT_FD = 31,
+    VHOST_USER_SET_INFLIGHT_FD = 32,
     VHOST_USER_MAX
 } VhostUserRequest;
 
@@ -151,6 +154,13 @@ typedef struct VhostUserVringArea {
     uint64_t offset;
 } VhostUserVringArea;
 
+typedef struct VhostUserInflight {
+    uint64_t mmap_size;
+    uint64_t mmap_offset;
+    uint16_t num_queues;
+    uint16_t queue_size;
+} VhostUserInflight;
+
 typedef struct {
     VhostUserRequest request;
 
@@ -173,6 +183,7 @@ typedef union {
         VhostUserConfig config;
         VhostUserCryptoSession session;
         VhostUserVringArea area;
+        VhostUserInflight inflight;
 } VhostUserPayload;
 
 typedef struct VhostUserMsg {
@@ -1770,6 +1781,100 @@ static bool vhost_user_mem_section_filter(struct vhost_dev *dev,
     return result;
 }
 
+static int vhost_user_get_inflight_fd(struct vhost_dev *dev,
+                                      uint16_t queue_size,
+                                      struct vhost_inflight *inflight)
+{
+    void *addr;
+    int fd;
+    struct vhost_user *u = dev->opaque;
+    CharBackend *chr = u->user->chr;
+    VhostUserMsg msg = {
+        .hdr.request = VHOST_USER_GET_INFLIGHT_FD,
+        .hdr.flags = VHOST_USER_VERSION,
+        .payload.inflight.num_queues = dev->nvqs,
+        .payload.inflight.queue_size = queue_size,
+        .hdr.size = sizeof(msg.payload.inflight),
+    };
+
+    if (!virtio_has_feature(dev->protocol_features,
+                            VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+        return 0;
+    }
+
+    if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
+        return -1;
+    }
+
+    if (vhost_user_read(dev, &msg) < 0) {
+        return -1;
+    }
+
+    if (msg.hdr.request != VHOST_USER_GET_INFLIGHT_FD) {
+        error_report("Received unexpected msg type. "
+                     "Expected %d received %d",
+                     VHOST_USER_GET_INFLIGHT_FD, msg.hdr.request);
+        return -1;
+    }
+
+    if (msg.hdr.size != sizeof(msg.payload.inflight)) {
+        error_report("Received bad msg size.");
+        return -1;
+    }
+
+    if (!msg.payload.inflight.mmap_size) {
+        return 0;
+    }
+
+    fd = qemu_chr_fe_get_msgfd(chr);
+    if (fd < 0) {
+        error_report("Failed to get mem fd");
+        return -1;
+    }
+
+    addr = mmap(0, msg.payload.inflight.mmap_size, PROT_READ | PROT_WRITE,
+                MAP_SHARED, fd, msg.payload.inflight.mmap_offset);
+
+    if (addr == MAP_FAILED) {
+        error_report("Failed to mmap mem fd");
+        close(fd);
+        return -1;
+    }
+
+    inflight->addr = addr;
+    inflight->fd = fd;
+    inflight->size = msg.payload.inflight.mmap_size;
+    inflight->offset = msg.payload.inflight.mmap_offset;
+    inflight->queue_size = queue_size;
+
+    return 0;
+}
+
+static int vhost_user_set_inflight_fd(struct vhost_dev *dev,
+                                      struct vhost_inflight *inflight)
+{
+    VhostUserMsg msg = {
+        .hdr.request = VHOST_USER_SET_INFLIGHT_FD,
+        .hdr.flags = VHOST_USER_VERSION,
+        .payload.inflight.mmap_size = inflight->size,
+        .payload.inflight.mmap_offset = inflight->offset,
+        .payload.inflight.num_queues = dev->nvqs,
+        .payload.inflight.queue_size = inflight->queue_size,
+        .hdr.size = sizeof(msg.payload.inflight),
+    };
+
+    if (!virtio_has_feature(dev->protocol_features,
+                            VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+        return 0;
+    }
+
+    if (vhost_user_write(dev, &msg, &inflight->fd, 1) < 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
 bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp)
 {
     if (user->chr) {
@@ -1829,4 +1934,6 @@ const VhostOps user_ops = {
         .vhost_crypto_create_session = vhost_user_crypto_create_session,
         .vhost_crypto_close_session = vhost_user_crypto_close_session,
         .vhost_backend_mem_section_filter = vhost_user_mem_section_filter,
+        .vhost_get_inflight_fd = vhost_user_get_inflight_fd,
+        .vhost_set_inflight_fd = vhost_user_set_inflight_fd,
 };
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 311432f190..7f61018f2a 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1481,6 +1481,102 @@ void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
     hdev->config_ops = ops;
 }
 
+void vhost_dev_free_inflight(struct vhost_inflight *inflight)
+{
+    if (inflight->addr) {
+        qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
+        inflight->addr = NULL;
+        inflight->fd = -1;
+    }
+}
+
+static int vhost_dev_resize_inflight(struct vhost_inflight *inflight,
+                                     uint64_t new_size)
+{
+    Error *err = NULL;
+    int fd = -1;
+    void *addr = qemu_memfd_alloc("vhost-inflight", new_size,
+                                  F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
+                                  &fd, &err);
+
+    if (err) {
+        error_report_err(err);
+        return -1;
+    }
+
+    vhost_dev_free_inflight(inflight);
+    inflight->offset = 0;
+    inflight->addr = addr;
+    inflight->fd = fd;
+    inflight->size = new_size;
+
+    return 0;
+}
+
+void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f)
+{
+    if (inflight->addr) {
+        qemu_put_be64(f, inflight->size);
+        qemu_put_be16(f, inflight->queue_size);
+        qemu_put_buffer(f, inflight->addr, inflight->size);
+    } else {
+        qemu_put_be64(f, 0);
+    }
+}
+
+int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f)
+{
+    uint64_t size;
+
+    size = qemu_get_be64(f);
+    if (!size) {
+        return 0;
+    }
+
+    if (inflight->size != size) {
+        if (vhost_dev_resize_inflight(inflight, size)) {
+            return -1;
+        }
+    }
+    inflight->queue_size = qemu_get_be16(f);
+
+    qemu_get_buffer(f, inflight->addr, size);
+
+    return 0;
+}
+
+int vhost_dev_set_inflight(struct vhost_dev *dev,
+                           struct vhost_inflight *inflight)
+{
+    int r;
+
+    if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
+        r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
+        if (r) {
+            VHOST_OPS_DEBUG("vhost_set_inflight_fd failed");
+            return -errno;
+        }
+    }
+
+    return 0;
+}
+
+int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
+                           struct vhost_inflight *inflight)
+{
+    int r;
+
+    if (dev->vhost_ops->vhost_get_inflight_fd) {
+        r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
+        if (r) {
+            VHOST_OPS_DEBUG("vhost_get_inflight_fd failed");
+            return -errno;
+        }
+    }
+
+    return 0;
+}
+
 /* Host notifiers must be enabled at this point. */
 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
 {
diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h
index 81283ec50f..d6632a18e6 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -25,6 +25,7 @@ typedef enum VhostSetConfigType {
     VHOST_SET_CONFIG_TYPE_MIGRATION = 1,
 } VhostSetConfigType;
 
+struct vhost_inflight;
 struct vhost_dev;
 struct vhost_log;
 struct vhost_memory;
@@ -104,6 +105,13 @@ typedef int (*vhost_crypto_close_session_op)(struct vhost_dev *dev,
 typedef bool (*vhost_backend_mem_section_filter_op)(struct vhost_dev *dev,
                                                 MemoryRegionSection *section);
 
+typedef int (*vhost_get_inflight_fd_op)(struct vhost_dev *dev,
+                                        uint16_t queue_size,
+                                        struct vhost_inflight *inflight);
+
+typedef int (*vhost_set_inflight_fd_op)(struct vhost_dev *dev,
+                                        struct vhost_inflight *inflight);
+
 typedef struct VhostOps {
     VhostBackendType backend_type;
     vhost_backend_init vhost_backend_init;
@@ -142,6 +150,8 @@ typedef struct VhostOps {
     vhost_crypto_create_session_op vhost_crypto_create_session;
     vhost_crypto_close_session_op vhost_crypto_close_session;
     vhost_backend_mem_section_filter_op vhost_backend_mem_section_filter;
+    vhost_get_inflight_fd_op vhost_get_inflight_fd;
+    vhost_set_inflight_fd_op vhost_set_inflight_fd;
 } VhostOps;
 
 extern const VhostOps user_ops;
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index a7f449fa87..619498c8f4 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -7,6 +7,15 @@
 #include "exec/memory.h"
 
 /* Generic structures common for any vhost based device. */
+
+struct vhost_inflight {
+    int fd;
+    void *addr;
+    uint64_t size;
+    uint64_t offset;
+    uint16_t queue_size;
+};
+
 struct vhost_virtqueue {
     int kick;
     int call;
@@ -120,4 +129,13 @@ int vhost_dev_set_config(struct vhost_dev *dev, const uint8_t *data,
  */
 void vhost_dev_set_config_notifier(struct vhost_dev *dev,
                                    const VhostDevConfigOps *ops);
+
+void vhost_dev_reset_inflight(struct vhost_inflight *inflight);
+void vhost_dev_free_inflight(struct vhost_inflight *inflight);
+void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f);
+int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f);
+int vhost_dev_set_inflight(struct vhost_dev *dev,
+                           struct vhost_inflight *inflight);
+int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
+                           struct vhost_inflight *inflight);
 #endif

From 792468cc6f15716fa37e2f0afa387b58bf9d9d05 Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@baidu.com>
Date: Thu, 28 Feb 2019 16:53:50 +0800
Subject: [PATCH 20/26] libvhost-user: Remove unnecessary FD flag check for
 event file descriptors

The vu_check_queue_msg_file() has checked the FD flag. So let's
delete the redundant check after it.

Signed-off-by: Xie Yongji <xieyongji@baidu.com>
Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
Message-Id: <20190228085355.9614-3-xieyongji@baidu.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 contrib/libvhost-user/libvhost-user.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index df72d3e440..219a1fd06f 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -907,10 +907,8 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
         dev->vq[index].kick_fd = -1;
     }
 
-    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
-        dev->vq[index].kick_fd = vmsg->fds[0];
-        DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
-    }
+    dev->vq[index].kick_fd = vmsg->fds[0];
+    DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
 
     dev->vq[index].started = true;
     if (dev->iface->queue_set_started) {
@@ -995,9 +993,7 @@ vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
         dev->vq[index].call_fd = -1;
     }
 
-    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
-        dev->vq[index].call_fd = vmsg->fds[0];
-    }
+    dev->vq[index].call_fd = vmsg->fds[0];
 
     DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
 
@@ -1020,9 +1016,7 @@ vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg)
         dev->vq[index].err_fd = -1;
     }
 
-    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
-        dev->vq[index].err_fd = vmsg->fds[0];
-    }
+    dev->vq[index].err_fd = vmsg->fds[0];
 
     return false;
 }

From f7671f3d49418f570a86de6b54795edcd5cb68dd Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@baidu.com>
Date: Thu, 28 Feb 2019 16:53:51 +0800
Subject: [PATCH 21/26] libvhost-user: Introduce vu_queue_map_desc()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce vu_queue_map_desc() which should be
independent with vu_queue_pop();

Signed-off-by: Xie Yongji <xieyongji@baidu.com>
Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20190228085355.9614-4-xieyongji@baidu.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 contrib/libvhost-user/libvhost-user.c | 88 ++++++++++++++++-----------
 1 file changed, 51 insertions(+), 37 deletions(-)

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index 219a1fd06f..ddd15d79cf 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -1848,49 +1848,20 @@ virtqueue_alloc_element(size_t sz,
     return elem;
 }
 
-void *
-vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
+static void *
+vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
 {
-    unsigned int i, head, max, desc_len;
+    struct vring_desc *desc = vq->vring.desc;
     uint64_t desc_addr, read_len;
+    unsigned int desc_len;
+    unsigned int max = vq->vring.num;
+    unsigned int i = idx;
     VuVirtqElement *elem;
-    unsigned out_num, in_num;
+    unsigned int out_num = 0, in_num = 0;
     struct iovec iov[VIRTQUEUE_MAX_SIZE];
     struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
-    struct vring_desc *desc;
     int rc;
 
-    if (unlikely(dev->broken) ||
-        unlikely(!vq->vring.avail)) {
-        return NULL;
-    }
-
-    if (vu_queue_empty(dev, vq)) {
-        return NULL;
-    }
-    /* Needed after virtio_queue_empty(), see comment in
-     * virtqueue_num_heads(). */
-    smp_rmb();
-
-    /* When we start there are none of either input nor output. */
-    out_num = in_num = 0;
-
-    max = vq->vring.num;
-    if (vq->inuse >= vq->vring.num) {
-        vu_panic(dev, "Virtqueue size exceeded");
-        return NULL;
-    }
-
-    if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
-        return NULL;
-    }
-
-    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
-        vring_set_avail_event(vq, vq->last_avail_idx);
-    }
-
-    i = head;
-    desc = vq->vring.desc;
     if (desc[i].flags & VRING_DESC_F_INDIRECT) {
         if (desc[i].len % sizeof(struct vring_desc)) {
             vu_panic(dev, "Invalid size for indirect buffer table");
@@ -1942,12 +1913,13 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
     } while (rc == VIRTQUEUE_READ_DESC_MORE);
 
     if (rc == VIRTQUEUE_READ_DESC_ERROR) {
+        vu_panic(dev, "read descriptor error");
         return NULL;
     }
 
     /* Now copy what we have collected and mapped */
     elem = virtqueue_alloc_element(sz, out_num, in_num);
-    elem->index = head;
+    elem->index = idx;
     for (i = 0; i < out_num; i++) {
         elem->out_sg[i] = iov[i];
     }
@@ -1955,6 +1927,48 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
         elem->in_sg[i] = iov[out_num + i];
     }
 
+    return elem;
+}
+
+void *
+vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
+{
+    unsigned int head;
+    VuVirtqElement *elem;
+
+    if (unlikely(dev->broken) ||
+        unlikely(!vq->vring.avail)) {
+        return NULL;
+    }
+
+    if (vu_queue_empty(dev, vq)) {
+        return NULL;
+    }
+    /*
+     * Needed after virtio_queue_empty(), see comment in
+     * virtqueue_num_heads().
+     */
+    smp_rmb();
+
+    if (vq->inuse >= vq->vring.num) {
+        vu_panic(dev, "Virtqueue size exceeded");
+        return NULL;
+    }
+
+    if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
+        return NULL;
+    }
+
+    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
+        vring_set_avail_event(vq, vq->last_avail_idx);
+    }
+
+    elem = vu_queue_map_desc(dev, vq, head, sz);
+
+    if (!elem) {
+        return NULL;
+    }
+
     vq->inuse++;
 
     return elem;

From 5f9ff1eff38ab341fb38ce32d4ae95e33f878781 Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@baidu.com>
Date: Thu, 28 Feb 2019 16:53:52 +0800
Subject: [PATCH 22/26] libvhost-user: Support tracking inflight I/O in shared
 memory

This patch adds support for VHOST_USER_GET_INFLIGHT_FD and
VHOST_USER_SET_INFLIGHT_FD message to set/get shared buffer
to/from qemu. Then backend can track inflight I/O in this buffer.

Signed-off-by: Xie Yongji <xieyongji@baidu.com>
Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
Message-Id: <20190228085355.9614-5-xieyongji@baidu.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 Makefile                              |   2 +-
 contrib/libvhost-user/libvhost-user.c | 349 ++++++++++++++++++++++++--
 contrib/libvhost-user/libvhost-user.h |  70 ++++++
 3 files changed, 400 insertions(+), 21 deletions(-)

diff --git a/Makefile b/Makefile
index 6ccb8639b0..abd78a9826 100644
--- a/Makefile
+++ b/Makefile
@@ -497,7 +497,7 @@ Makefile: $(version-obj-y)
 # Build libraries
 
 libqemuutil.a: $(util-obj-y) $(trace-obj-y) $(stub-obj-y)
-libvhost-user.a: $(libvhost-user-obj-y)
+libvhost-user.a: $(libvhost-user-obj-y) $(util-obj-y) $(stub-obj-y)
 
 ######################################################################
 
diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index ddd15d79cf..e08d6c7b97 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -41,6 +41,8 @@
 #endif
 
 #include "qemu/atomic.h"
+#include "qemu/osdep.h"
+#include "qemu/memfd.h"
 
 #include "libvhost-user.h"
 
@@ -53,6 +55,18 @@
             _min1 < _min2 ? _min1 : _min2; })
 #endif
 
+/* Round number down to multiple */
+#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
+
+/* Round number up to multiple */
+#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
+
+/* Align each region to cache line size in inflight buffer */
+#define INFLIGHT_ALIGNMENT 64
+
+/* The version of inflight buffer */
+#define INFLIGHT_VERSION 1
+
 #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
 
 /* The version of the protocol we support */
@@ -66,6 +80,20 @@
         }                                       \
     } while (0)
 
+static inline
+bool has_feature(uint64_t features, unsigned int fbit)
+{
+    assert(fbit < 64);
+    return !!(features & (1ULL << fbit));
+}
+
+static inline
+bool vu_has_feature(VuDev *dev,
+                    unsigned int fbit)
+{
+    return has_feature(dev->features, fbit);
+}
+
 static const char *
 vu_request_to_string(unsigned int req)
 {
@@ -100,6 +128,8 @@ vu_request_to_string(unsigned int req)
         REQ(VHOST_USER_POSTCOPY_ADVISE),
         REQ(VHOST_USER_POSTCOPY_LISTEN),
         REQ(VHOST_USER_POSTCOPY_END),
+        REQ(VHOST_USER_GET_INFLIGHT_FD),
+        REQ(VHOST_USER_SET_INFLIGHT_FD),
         REQ(VHOST_USER_MAX),
     };
 #undef REQ
@@ -890,6 +920,91 @@ vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
     return true;
 }
 
+static int
+inflight_desc_compare(const void *a, const void *b)
+{
+    VuVirtqInflightDesc *desc0 = (VuVirtqInflightDesc *)a,
+                        *desc1 = (VuVirtqInflightDesc *)b;
+
+    if (desc1->counter > desc0->counter &&
+        (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
+        return 1;
+    }
+
+    return -1;
+}
+
+static int
+vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
+{
+    int i = 0;
+
+    if (!has_feature(dev->protocol_features,
+        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+        return 0;
+    }
+
+    if (unlikely(!vq->inflight)) {
+        return -1;
+    }
+
+    if (unlikely(!vq->inflight->version)) {
+        /* initialize the buffer */
+        vq->inflight->version = INFLIGHT_VERSION;
+        return 0;
+    }
+
+    vq->used_idx = vq->vring.used->idx;
+    vq->resubmit_num = 0;
+    vq->resubmit_list = NULL;
+    vq->counter = 0;
+
+    if (unlikely(vq->inflight->used_idx != vq->used_idx)) {
+        vq->inflight->desc[vq->inflight->last_batch_head].inflight = 0;
+
+        barrier();
+
+        vq->inflight->used_idx = vq->used_idx;
+    }
+
+    for (i = 0; i < vq->inflight->desc_num; i++) {
+        if (vq->inflight->desc[i].inflight == 1) {
+            vq->inuse++;
+        }
+    }
+
+    vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
+
+    if (vq->inuse) {
+        vq->resubmit_list = malloc(sizeof(VuVirtqInflightDesc) * vq->inuse);
+        if (!vq->resubmit_list) {
+            return -1;
+        }
+
+        for (i = 0; i < vq->inflight->desc_num; i++) {
+            if (vq->inflight->desc[i].inflight) {
+                vq->resubmit_list[vq->resubmit_num].index = i;
+                vq->resubmit_list[vq->resubmit_num].counter =
+                                        vq->inflight->desc[i].counter;
+                vq->resubmit_num++;
+            }
+        }
+
+        if (vq->resubmit_num > 1) {
+            qsort(vq->resubmit_list, vq->resubmit_num,
+                  sizeof(VuVirtqInflightDesc), inflight_desc_compare);
+        }
+        vq->counter = vq->resubmit_list[0].counter + 1;
+    }
+
+    /* in case of I/O hang after reconnecting */
+    if (eventfd_write(vq->kick_fd, 1)) {
+        return -1;
+    }
+
+    return 0;
+}
+
 static bool
 vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
 {
@@ -923,6 +1038,10 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
                dev->vq[index].kick_fd, index);
     }
 
+    if (vu_check_queue_inflights(dev, &dev->vq[index])) {
+        vu_panic(dev, "Failed to check inflights for vq: %d\n", index);
+    }
+
     return false;
 }
 
@@ -995,6 +1114,11 @@ vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
 
     dev->vq[index].call_fd = vmsg->fds[0];
 
+    /* in case of I/O hang after reconnecting */
+    if (eventfd_write(vmsg->fds[0], 1)) {
+        return -1;
+    }
+
     DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
 
     return false;
@@ -1209,6 +1333,116 @@ vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
     return true;
 }
 
+static inline uint64_t
+vu_inflight_queue_size(uint16_t queue_size)
+{
+    return ALIGN_UP(sizeof(VuDescStateSplit) * queue_size +
+           sizeof(uint16_t), INFLIGHT_ALIGNMENT);
+}
+
+static bool
+vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int fd;
+    void *addr;
+    uint64_t mmap_size;
+    uint16_t num_queues, queue_size;
+
+    if (vmsg->size != sizeof(vmsg->payload.inflight)) {
+        vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size);
+        vmsg->payload.inflight.mmap_size = 0;
+        return true;
+    }
+
+    num_queues = vmsg->payload.inflight.num_queues;
+    queue_size = vmsg->payload.inflight.queue_size;
+
+    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
+    DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size);
+
+    mmap_size = vu_inflight_queue_size(queue_size) * num_queues;
+
+    addr = qemu_memfd_alloc("vhost-inflight", mmap_size,
+                            F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
+                            &fd, NULL);
+
+    if (!addr) {
+        vu_panic(dev, "Failed to alloc vhost inflight area");
+        vmsg->payload.inflight.mmap_size = 0;
+        return true;
+    }
+
+    memset(addr, 0, mmap_size);
+
+    dev->inflight_info.addr = addr;
+    dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size;
+    dev->inflight_info.fd = vmsg->fds[0] = fd;
+    vmsg->fd_num = 1;
+    vmsg->payload.inflight.mmap_offset = 0;
+
+    DPRINT("send inflight mmap_size: %"PRId64"\n",
+           vmsg->payload.inflight.mmap_size);
+    DPRINT("send inflight mmap offset: %"PRId64"\n",
+           vmsg->payload.inflight.mmap_offset);
+
+    return true;
+}
+
+static bool
+vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int fd, i;
+    uint64_t mmap_size, mmap_offset;
+    uint16_t num_queues, queue_size;
+    void *rc;
+
+    if (vmsg->fd_num != 1 ||
+        vmsg->size != sizeof(vmsg->payload.inflight)) {
+        vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d",
+                 vmsg->size, vmsg->fd_num);
+        return false;
+    }
+
+    fd = vmsg->fds[0];
+    mmap_size = vmsg->payload.inflight.mmap_size;
+    mmap_offset = vmsg->payload.inflight.mmap_offset;
+    num_queues = vmsg->payload.inflight.num_queues;
+    queue_size = vmsg->payload.inflight.queue_size;
+
+    DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size);
+    DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset);
+    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
+    DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size);
+
+    rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+              fd, mmap_offset);
+
+    if (rc == MAP_FAILED) {
+        vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno));
+        return false;
+    }
+
+    if (dev->inflight_info.fd) {
+        close(dev->inflight_info.fd);
+    }
+
+    if (dev->inflight_info.addr) {
+        munmap(dev->inflight_info.addr, dev->inflight_info.size);
+    }
+
+    dev->inflight_info.fd = fd;
+    dev->inflight_info.addr = rc;
+    dev->inflight_info.size = mmap_size;
+
+    for (i = 0; i < num_queues; i++) {
+        dev->vq[i].inflight = (VuVirtqInflight *)rc;
+        dev->vq[i].inflight->desc_num = queue_size;
+        rc = (void *)((char *)rc + vu_inflight_queue_size(queue_size));
+    }
+
+    return false;
+}
+
 static bool
 vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
 {
@@ -1287,6 +1521,10 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
         return vu_set_postcopy_listen(dev, vmsg);
     case VHOST_USER_POSTCOPY_END:
         return vu_set_postcopy_end(dev, vmsg);
+    case VHOST_USER_GET_INFLIGHT_FD:
+        return vu_get_inflight_fd(dev, vmsg);
+    case VHOST_USER_SET_INFLIGHT_FD:
+        return vu_set_inflight_fd(dev, vmsg);
     default:
         vmsg_close_fds(vmsg);
         vu_panic(dev, "Unhandled request: %d", vmsg->request);
@@ -1354,8 +1592,24 @@ vu_deinit(VuDev *dev)
             close(vq->err_fd);
             vq->err_fd = -1;
         }
+
+        if (vq->resubmit_list) {
+            free(vq->resubmit_list);
+            vq->resubmit_list = NULL;
+        }
+
+        vq->inflight = NULL;
     }
 
+    if (dev->inflight_info.addr) {
+        munmap(dev->inflight_info.addr, dev->inflight_info.size);
+        dev->inflight_info.addr = NULL;
+    }
+
+    if (dev->inflight_info.fd > 0) {
+        close(dev->inflight_info.fd);
+        dev->inflight_info.fd = -1;
+    }
 
     vu_close_log(dev);
     if (dev->slave_fd != -1) {
@@ -1682,20 +1936,6 @@ vu_queue_empty(VuDev *dev, VuVirtq *vq)
     return vring_avail_idx(vq) == vq->last_avail_idx;
 }
 
-static inline
-bool has_feature(uint64_t features, unsigned int fbit)
-{
-    assert(fbit < 64);
-    return !!(features & (1ULL << fbit));
-}
-
-static inline
-bool vu_has_feature(VuDev *dev,
-                    unsigned int fbit)
-{
-    return has_feature(dev->features, fbit);
-}
-
 static bool
 vring_notify(VuDev *dev, VuVirtq *vq)
 {
@@ -1824,12 +2064,6 @@ virtqueue_map_desc(VuDev *dev,
     *p_num_sg = num_sg;
 }
 
-/* Round number down to multiple */
-#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
-
-/* Round number up to multiple */
-#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
-
 static void *
 virtqueue_alloc_element(size_t sz,
                                      unsigned out_num, unsigned in_num)
@@ -1930,9 +2164,68 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
     return elem;
 }
 
+static int
+vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
+{
+    if (!has_feature(dev->protocol_features,
+        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+        return 0;
+    }
+
+    if (unlikely(!vq->inflight)) {
+        return -1;
+    }
+
+    vq->inflight->desc[desc_idx].counter = vq->counter++;
+    vq->inflight->desc[desc_idx].inflight = 1;
+
+    return 0;
+}
+
+static int
+vu_queue_inflight_pre_put(VuDev *dev, VuVirtq *vq, int desc_idx)
+{
+    if (!has_feature(dev->protocol_features,
+        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+        return 0;
+    }
+
+    if (unlikely(!vq->inflight)) {
+        return -1;
+    }
+
+    vq->inflight->last_batch_head = desc_idx;
+
+    return 0;
+}
+
+static int
+vu_queue_inflight_post_put(VuDev *dev, VuVirtq *vq, int desc_idx)
+{
+    if (!has_feature(dev->protocol_features,
+        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+        return 0;
+    }
+
+    if (unlikely(!vq->inflight)) {
+        return -1;
+    }
+
+    barrier();
+
+    vq->inflight->desc[desc_idx].inflight = 0;
+
+    barrier();
+
+    vq->inflight->used_idx = vq->used_idx;
+
+    return 0;
+}
+
 void *
 vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
 {
+    int i;
     unsigned int head;
     VuVirtqElement *elem;
 
@@ -1941,6 +2234,18 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
         return NULL;
     }
 
+    if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
+        i = (--vq->resubmit_num);
+        elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz);
+
+        if (!vq->resubmit_num) {
+            free(vq->resubmit_list);
+            vq->resubmit_list = NULL;
+        }
+
+        return elem;
+    }
+
     if (vu_queue_empty(dev, vq)) {
         return NULL;
     }
@@ -1971,6 +2276,8 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
 
     vq->inuse++;
 
+    vu_queue_inflight_get(dev, vq, head);
+
     return elem;
 }
 
@@ -2131,5 +2438,7 @@ vu_queue_push(VuDev *dev, VuVirtq *vq,
               const VuVirtqElement *elem, unsigned int len)
 {
     vu_queue_fill(dev, vq, elem, len, 0);
+    vu_queue_inflight_pre_put(dev, vq, elem->index);
     vu_queue_flush(dev, vq, 1);
+    vu_queue_inflight_post_put(dev, vq, elem->index);
 }
diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
index 3de8414898..414ceb0a2f 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -53,6 +53,7 @@ enum VhostUserProtocolFeature {
     VHOST_USER_PROTOCOL_F_CONFIG = 9,
     VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
     VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
+    VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
 
     VHOST_USER_PROTOCOL_F_MAX
 };
@@ -91,6 +92,8 @@ typedef enum VhostUserRequest {
     VHOST_USER_POSTCOPY_ADVISE  = 28,
     VHOST_USER_POSTCOPY_LISTEN  = 29,
     VHOST_USER_POSTCOPY_END     = 30,
+    VHOST_USER_GET_INFLIGHT_FD = 31,
+    VHOST_USER_SET_INFLIGHT_FD = 32,
     VHOST_USER_MAX
 } VhostUserRequest;
 
@@ -138,6 +141,13 @@ typedef struct VhostUserVringArea {
     uint64_t offset;
 } VhostUserVringArea;
 
+typedef struct VhostUserInflight {
+    uint64_t mmap_size;
+    uint64_t mmap_offset;
+    uint16_t num_queues;
+    uint16_t queue_size;
+} VhostUserInflight;
+
 #if defined(_WIN32)
 # define VU_PACKED __attribute__((gcc_struct, packed))
 #else
@@ -163,6 +173,7 @@ typedef struct VhostUserMsg {
         VhostUserLog log;
         VhostUserConfig config;
         VhostUserVringArea area;
+        VhostUserInflight inflight;
     } payload;
 
     int fds[VHOST_MEMORY_MAX_NREGIONS];
@@ -234,9 +245,61 @@ typedef struct VuRing {
     uint32_t flags;
 } VuRing;
 
+typedef struct VuDescStateSplit {
+    /* Indicate whether this descriptor is inflight or not.
+     * Only available for head-descriptor. */
+    uint8_t inflight;
+
+    /* Padding */
+    uint8_t padding[5];
+
+    /* Maintain a list for the last batch of used descriptors.
+     * Only available when batching is used for submitting */
+    uint16_t next;
+
+    /* Used to preserve the order of fetching available descriptors.
+     * Only available for head-descriptor. */
+    uint64_t counter;
+} VuDescStateSplit;
+
+typedef struct VuVirtqInflight {
+    /* The feature flags of this region. Now it's initialized to 0. */
+    uint64_t features;
+
+    /* The version of this region. It's 1 currently.
+     * Zero value indicates a vm reset happened. */
+    uint16_t version;
+
+    /* The size of VuDescStateSplit array. It's equal to the virtqueue
+     * size. Slave could get it from queue size field of VhostUserInflight. */
+    uint16_t desc_num;
+
+    /* The head of list that track the last batch of used descriptors. */
+    uint16_t last_batch_head;
+
+    /* Storing the idx value of used ring */
+    uint16_t used_idx;
+
+    /* Used to track the state of each descriptor in descriptor table */
+    VuDescStateSplit desc[0];
+} VuVirtqInflight;
+
+typedef struct VuVirtqInflightDesc {
+    uint16_t index;
+    uint64_t counter;
+} VuVirtqInflightDesc;
+
 typedef struct VuVirtq {
     VuRing vring;
 
+    VuVirtqInflight *inflight;
+
+    VuVirtqInflightDesc *resubmit_list;
+
+    uint16_t resubmit_num;
+
+    uint64_t counter;
+
     /* Next head to pop */
     uint16_t last_avail_idx;
 
@@ -279,11 +342,18 @@ typedef void (*vu_set_watch_cb) (VuDev *dev, int fd, int condition,
                                  vu_watch_cb cb, void *data);
 typedef void (*vu_remove_watch_cb) (VuDev *dev, int fd);
 
+typedef struct VuDevInflightInfo {
+    int fd;
+    void *addr;
+    uint64_t size;
+} VuDevInflightInfo;
+
 struct VuDev {
     int sock;
     uint32_t nregions;
     VuDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
     VuVirtq vq[VHOST_MAX_NR_VIRTQUEUE];
+    VuDevInflightInfo inflight_info;
     int log_call_fd;
     int slave_fd;
     uint64_t log_size;

From a1fe0b8f273c2272a7964655548179478b9d6a91 Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@baidu.com>
Date: Thu, 28 Feb 2019 16:53:53 +0800
Subject: [PATCH 23/26] vhost-user-blk: Add support to get/set inflight buffer

This patch adds support for vhost-user-blk device to get/set
inflight buffer from/to backend.

Signed-off-by: Xie Yongji <xieyongji@baidu.com>
Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
Message-Id: <20190228085355.9614-6-xieyongji@baidu.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/block/vhost-user-blk.c          | 28 ++++++++++++++++++++++++++++
 include/hw/virtio/vhost-user-blk.h |  1 +
 2 files changed, 29 insertions(+)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 767c734a81..28b81368f7 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -128,6 +128,21 @@ static void vhost_user_blk_start(VirtIODevice *vdev)
     }
 
     s->dev.acked_features = vdev->guest_features;
+
+    if (!s->inflight->addr) {
+        ret = vhost_dev_get_inflight(&s->dev, s->queue_size, s->inflight);
+        if (ret < 0) {
+            error_report("Error get inflight: %d", -ret);
+            goto err_guest_notifiers;
+        }
+    }
+
+    ret = vhost_dev_set_inflight(&s->dev, s->inflight);
+    if (ret < 0) {
+        error_report("Error set inflight: %d", -ret);
+        goto err_guest_notifiers;
+    }
+
     ret = vhost_dev_start(&s->dev, vdev);
     if (ret < 0) {
         error_report("Error starting vhost: %d", -ret);
@@ -249,6 +264,13 @@ static void vhost_user_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
     }
 }
 
+static void vhost_user_blk_reset(VirtIODevice *vdev)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+
+    vhost_dev_free_inflight(s->inflight);
+}
+
 static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
@@ -283,6 +305,8 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
                          vhost_user_blk_handle_output);
     }
 
+    s->inflight = g_new0(struct vhost_inflight, 1);
+
     s->dev.nvqs = s->num_queues;
     s->dev.vqs = g_new(struct vhost_virtqueue, s->dev.nvqs);
     s->dev.vq_index = 0;
@@ -315,6 +339,7 @@ vhost_err:
     vhost_dev_cleanup(&s->dev);
 virtio_err:
     g_free(vqs);
+    g_free(s->inflight);
     virtio_cleanup(vdev);
     vhost_user_cleanup(&s->vhost_user);
 }
@@ -327,7 +352,9 @@ static void vhost_user_blk_device_unrealize(DeviceState *dev, Error **errp)
 
     vhost_user_blk_set_status(vdev, 0);
     vhost_dev_cleanup(&s->dev);
+    vhost_dev_free_inflight(s->inflight);
     g_free(vqs);
+    g_free(s->inflight);
     virtio_cleanup(vdev);
     vhost_user_cleanup(&s->vhost_user);
 }
@@ -372,6 +399,7 @@ static void vhost_user_blk_class_init(ObjectClass *klass, void *data)
     vdc->set_config = vhost_user_blk_set_config;
     vdc->get_features = vhost_user_blk_get_features;
     vdc->set_status = vhost_user_blk_set_status;
+    vdc->reset = vhost_user_blk_reset;
 }
 
 static const TypeInfo vhost_user_blk_info = {
diff --git a/include/hw/virtio/vhost-user-blk.h b/include/hw/virtio/vhost-user-blk.h
index a8a106eecb..68634bee61 100644
--- a/include/hw/virtio/vhost-user-blk.h
+++ b/include/hw/virtio/vhost-user-blk.h
@@ -36,6 +36,7 @@ typedef struct VHostUserBlk {
     uint32_t queue_size;
     uint32_t config_wce;
     struct vhost_dev dev;
+    struct vhost_inflight *inflight;
     VhostUserState vhost_user;
 } VHostUserBlk;
 

From db891a9ba3b5aef52c2735f9b907d4d3678a86f8 Mon Sep 17 00:00:00 2001
From: Knut Omang <knut.omang@oracle.com>
Date: Thu, 21 Feb 2019 19:13:22 +0100
Subject: [PATCH 24/26] pcie: Add a simple PCIe ACS (Access Control Services)
 helper function

Implementing an ACS capability on downstream ports and multifunction
endpoints indicates isolation and IOMMU visibility to a finer
granularity. This creates smaller IOMMU groups in the guest and thus
more flexibility in assigning endpoints to guest userspace or an L2
guest.

Signed-off-by: Knut Omang <knut.omang@oracle.com>
Message-Id: <07489975121696f5573b0a92baaf3486ef51e35d.1550768238.git-series.knut.omang@oracle.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/pci/pcie.c              | 38 ++++++++++++++++++++++++++++++++++++++
 include/hw/pci/pcie.h      |  6 ++++++
 include/hw/pci/pcie_regs.h |  4 ++++
 3 files changed, 48 insertions(+)

diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index 640f678773..cf1ca30f93 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -914,3 +914,41 @@ void pcie_ats_init(PCIDevice *dev, uint16_t offset)
 
     pci_set_word(dev->wmask + dev->exp.ats_cap + PCI_ATS_CTRL, 0x800f);
 }
+
+/* ACS (Access Control Services) */
+void pcie_acs_init(PCIDevice *dev, uint16_t offset)
+{
+    bool is_downstream = pci_is_express_downstream_port(dev);
+    uint16_t cap_bits = 0;
+
+    /* For endpoints, only multifunction devs may have an ACS capability: */
+    assert(is_downstream ||
+           (dev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) ||
+           PCI_FUNC(dev->devfn));
+
+    pcie_add_capability(dev, PCI_EXT_CAP_ID_ACS, PCI_ACS_VER, offset,
+                        PCI_ACS_SIZEOF);
+    dev->exp.acs_cap = offset;
+
+    if (is_downstream) {
+        /*
+         * Downstream ports must implement SV, TB, RR, CR, UF, and DT (with
+         * caveats on the latter four that we ignore for simplicity).
+         * Endpoints may also implement a subset of ACS capabilities,
+         * but these are optional if the endpoint does not support
+         * peer-to-peer between functions and thus omitted here.
+         */
+        cap_bits = PCI_ACS_SV | PCI_ACS_TB | PCI_ACS_RR |
+            PCI_ACS_CR | PCI_ACS_UF | PCI_ACS_DT;
+    }
+
+    pci_set_word(dev->config + offset + PCI_ACS_CAP, cap_bits);
+    pci_set_word(dev->wmask + offset + PCI_ACS_CTRL, cap_bits);
+}
+
+void pcie_acs_reset(PCIDevice *dev)
+{
+    if (dev->exp.acs_cap) {
+        pci_set_word(dev->config + dev->exp.acs_cap + PCI_ACS_CTRL, 0);
+    }
+}
diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
index 5b82a0d244..e30334d74d 100644
--- a/include/hw/pci/pcie.h
+++ b/include/hw/pci/pcie.h
@@ -79,6 +79,9 @@ struct PCIExpressDevice {
 
     /* Offset of ATS capability in config space */
     uint16_t ats_cap;
+
+    /* ACS */
+    uint16_t acs_cap;
 };
 
 #define COMPAT_PROP_PCP "power_controller_present"
@@ -128,6 +131,9 @@ void pcie_add_capability(PCIDevice *dev,
                          uint16_t offset, uint16_t size);
 void pcie_sync_bridge_lnk(PCIDevice *dev);
 
+void pcie_acs_init(PCIDevice *dev, uint16_t offset);
+void pcie_acs_reset(PCIDevice *dev);
+
 void pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn);
 void pcie_dev_ser_num_init(PCIDevice *dev, uint16_t offset, uint64_t ser_num);
 void pcie_ats_init(PCIDevice *dev, uint16_t offset);
diff --git a/include/hw/pci/pcie_regs.h b/include/hw/pci/pcie_regs.h
index ad4e7808b8..1db86b0ec4 100644
--- a/include/hw/pci/pcie_regs.h
+++ b/include/hw/pci/pcie_regs.h
@@ -175,4 +175,8 @@ typedef enum PCIExpLinkWidth {
                                          PCI_ERR_COR_INTERNAL |         \
                                          PCI_ERR_COR_HL_OVERFLOW)
 
+/* ACS */
+#define PCI_ACS_VER                     0x1
+#define PCI_ACS_SIZEOF                  8
+
 #endif /* QEMU_PCIE_REGS_H */

From e07fb4b50b629141eb1517002ccfa070dbdc1ea7 Mon Sep 17 00:00:00 2001
From: Knut Omang <knut.omang@oracle.com>
Date: Thu, 21 Feb 2019 19:13:23 +0100
Subject: [PATCH 25/26] gen_pcie_root_port: Add ACS (Access Control Services)
 capability

Claim ACS support in the generic PCIe root port to allow
passthrough of individual functions of a device to different
guests (in a nested virt.setting) with VFIO.
Without this patch, all functions of a device, such as all VFs of
an SR/IOV device, will end up in the same IOMMU group.
A similar situation occurs on Windows with Hyper-V.

In the single function device case, it also has a small cosmetic
benefit in that the root port itself is not grouped with
the device. VFIO handles that situation in that binding rules
only apply to endpoints, so it does not limit passthrough in
those cases.

Signed-off-by: Knut Omang <knut.omang@oracle.com>
Reviewed-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Message-Id: <319460b483f566dd57487eb3dd340ed4c10aa53c.1550768238.git-series.knut.omang@oracle.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/pci-bridge/gen_pcie_root_port.c | 4 ++++
 hw/pci-bridge/pcie_root_port.c     | 4 ++++
 include/hw/pci/pcie_port.h         | 1 +
 3 files changed, 9 insertions(+)

diff --git a/hw/pci-bridge/gen_pcie_root_port.c b/hw/pci-bridge/gen_pcie_root_port.c
index 9766edb445..26bda73eae 100644
--- a/hw/pci-bridge/gen_pcie_root_port.c
+++ b/hw/pci-bridge/gen_pcie_root_port.c
@@ -20,6 +20,9 @@
         OBJECT_CHECK(GenPCIERootPort, (obj), TYPE_GEN_PCIE_ROOT_PORT)
 
 #define GEN_PCIE_ROOT_PORT_AER_OFFSET           0x100
+#define GEN_PCIE_ROOT_PORT_ACS_OFFSET \
+        (GEN_PCIE_ROOT_PORT_AER_OFFSET + PCI_ERR_SIZEOF)
+
 #define GEN_PCIE_ROOT_PORT_MSIX_NR_VECTOR       1
 
 typedef struct GenPCIERootPort {
@@ -149,6 +152,7 @@ static void gen_rp_dev_class_init(ObjectClass *klass, void *data)
     rpc->interrupts_init = gen_rp_interrupts_init;
     rpc->interrupts_uninit = gen_rp_interrupts_uninit;
     rpc->aer_offset = GEN_PCIE_ROOT_PORT_AER_OFFSET;
+    rpc->acs_offset = GEN_PCIE_ROOT_PORT_ACS_OFFSET;
 }
 
 static const TypeInfo gen_rp_dev_info = {
diff --git a/hw/pci-bridge/pcie_root_port.c b/hw/pci-bridge/pcie_root_port.c
index 34ad76743c..e94d918b6d 100644
--- a/hw/pci-bridge/pcie_root_port.c
+++ b/hw/pci-bridge/pcie_root_port.c
@@ -47,6 +47,7 @@ static void rp_reset(DeviceState *qdev)
     pcie_cap_deverr_reset(d);
     pcie_cap_slot_reset(d);
     pcie_cap_arifwd_reset(d);
+    pcie_acs_reset(d);
     pcie_aer_root_reset(d);
     pci_bridge_reset(qdev);
     pci_bridge_disable_base_limit(d);
@@ -106,6 +107,9 @@ static void rp_realize(PCIDevice *d, Error **errp)
     pcie_aer_root_init(d);
     rp_aer_vector_update(d);
 
+    if (rpc->acs_offset) {
+        pcie_acs_init(d, rpc->acs_offset);
+    }
     return;
 
 err:
diff --git a/include/hw/pci/pcie_port.h b/include/hw/pci/pcie_port.h
index df242a0caf..09586f4641 100644
--- a/include/hw/pci/pcie_port.h
+++ b/include/hw/pci/pcie_port.h
@@ -78,6 +78,7 @@ typedef struct PCIERootPortClass {
     int exp_offset;
     int aer_offset;
     int ssvid_offset;
+    int acs_offset;    /* If nonzero, optional ACS capability offset */
     int ssid;
 } PCIERootPortClass;
 

From 9040e6dfa8c3fed87695a3de555d2c775727bb51 Mon Sep 17 00:00:00 2001
From: Wei Yang <richardw.yang@linux.intel.com>
Date: Fri, 1 Mar 2019 11:35:48 +0800
Subject: [PATCH 26/26] i386, acpi: check acpi_memory_hotplug capacity in
 pre_plug

Currently we do device realization like below:

   hotplug_handler_pre_plug()
   dc->realize()
   hotplug_handler_plug()

Before we do device realization and plug, we should allocate necessary
resources and check if memory-hotplug-support property is enabled.

At the piix4 and ich9, the memory-hotplug-support property is checked at
plug stage. This means that device has been realized and mapped into guest
address space 'pc_dimm_plug()' by the time acpi plug handler is called,
where it might fail and crash QEMU due to reaching g_assert_not_reached()
(piix4) or error_abort (ich9).

Fix it by checking if memory hotplug is enabled at pre_plug stage
where we can gracefully abort hotplug request.

Signed-off-by: Wei Yang <richardw.yang@linux.intel.com>
CC: Igor Mammedov <imammedo@redhat.com>
CC: Eric Blake <eblake@redhat.com>
Signed-off-by: Wei Yang <richardw.yang@linux.intel.com>

Message-Id: <20190301033548.6691-1-richardw.yang@linux.intel.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/acpi/ich9.c         | 15 +++++++++++++--
 hw/acpi/piix4.c        | 13 ++++++++++---
 hw/i386/pc.c           |  2 ++
 hw/isa/lpc_ich9.c      |  1 +
 include/hw/acpi/ich9.h |  2 ++
 5 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c
index c5d8646abc..e53dfe1ee3 100644
--- a/hw/acpi/ich9.c
+++ b/hw/acpi/ich9.c
@@ -483,13 +483,24 @@ void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm, Error **errp)
                              NULL);
 }
 
+void ich9_pm_device_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
+                                Error **errp)
+{
+    ICH9LPCState *lpc = ICH9_LPC_DEVICE(hotplug_dev);
+
+    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) &&
+        !lpc->pm.acpi_memory_hotplug.is_enabled)
+        error_setg(errp,
+                   "memory hotplug is not enabled: %s.memory-hotplug-support "
+                   "is not set", object_get_typename(OBJECT(lpc)));
+}
+
 void ich9_pm_device_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
                             Error **errp)
 {
     ICH9LPCState *lpc = ICH9_LPC_DEVICE(hotplug_dev);
 
-    if (lpc->pm.acpi_memory_hotplug.is_enabled &&
-        object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
         if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) {
             nvdimm_acpi_plug_cb(hotplug_dev, dev);
         } else {
diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c
index 7b98121070..9c079d6834 100644
--- a/hw/acpi/piix4.c
+++ b/hw/acpi/piix4.c
@@ -380,9 +380,17 @@ static void piix4_pm_powerdown_req(Notifier *n, void *opaque)
 static void piix4_device_pre_plug_cb(HotplugHandler *hotplug_dev,
                                     DeviceState *dev, Error **errp)
 {
+    PIIX4PMState *s = PIIX4_PM(hotplug_dev);
+
     if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
         acpi_pcihp_device_pre_plug_cb(hotplug_dev, dev, errp);
-    } else if (!object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) &&
+    } else if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+        if (!s->acpi_memory_hotplug.is_enabled) {
+            error_setg(errp,
+                "memory hotplug is not enabled: %s.memory-hotplug-support "
+                "is not set", object_get_typename(OBJECT(s)));
+        }
+    } else if (
                !object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
         error_setg(errp, "acpi: device pre plug request for not supported"
                    " device type: %s", object_get_typename(OBJECT(dev)));
@@ -394,8 +402,7 @@ static void piix4_device_plug_cb(HotplugHandler *hotplug_dev,
 {
     PIIX4PMState *s = PIIX4_PM(hotplug_dev);
 
-    if (s->acpi_memory_hotplug.is_enabled &&
-        object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
         if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) {
             nvdimm_acpi_plug_cb(hotplug_dev, dev);
         } else {
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 1cdaff5f4d..6077d27361 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -2090,6 +2090,8 @@ static void pc_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
         return;
     }
 
+    hotplug_handler_pre_plug(pcms->acpi_dev, dev, errp);
+
     if (is_nvdimm && !ms->nvdimms_state->is_enabled) {
         error_setg(errp, "nvdimm is not enabled: missing 'nvdimm' in '-M'");
         return;
diff --git a/hw/isa/lpc_ich9.c b/hw/isa/lpc_ich9.c
index e692b9fdc1..ac44aa53be 100644
--- a/hw/isa/lpc_ich9.c
+++ b/hw/isa/lpc_ich9.c
@@ -805,6 +805,7 @@ static void ich9_lpc_class_init(ObjectClass *klass, void *data)
      * pc_q35_init()
      */
     dc->user_creatable = false;
+    hc->pre_plug = ich9_pm_device_pre_plug_cb;
     hc->plug = ich9_pm_device_plug_cb;
     hc->unplug_request = ich9_pm_device_unplug_request_cb;
     hc->unplug = ich9_pm_device_unplug_cb;
diff --git a/include/hw/acpi/ich9.h b/include/hw/acpi/ich9.h
index 59aeb06393..41568d1837 100644
--- a/include/hw/acpi/ich9.h
+++ b/include/hw/acpi/ich9.h
@@ -74,6 +74,8 @@ extern const VMStateDescription vmstate_ich9_pm;
 
 void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm, Error **errp);
 
+void ich9_pm_device_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
+                                Error **errp);
 void ich9_pm_device_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
                             Error **errp);
 void ich9_pm_device_unplug_request_cb(HotplugHandler *hotplug_dev,