From 573581b18dfd458ddac22f832bfb3f6fc9b585dc Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Wed, 14 Feb 2024 03:27:51 -0800 Subject: [PATCH 01/68] vdpa: add back vhost_vdpa_net_first_nc_vdpa MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous commits had it removed. Now adding it back because this function will be needed by future patches. Message-Id: <1707910082-10243-2-git-send-email-si-wei.liu@oracle.com> Reviewed-by: Eugenio Pérez Signed-off-by: Si-Wei Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- net/vhost-vdpa.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index e6bdb4562d..1d9496cdd9 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -287,6 +287,16 @@ static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf, return size; } + +/** From any vdpa net client, get the netclient of the first queue pair */ +static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s) +{ + NICState *nic = qemu_get_nic(s->nc.peer); + NetClientState *nc0 = qemu_get_peer(nic->ncs, 0); + + return DO_UPCAST(VhostVDPAState, nc, nc0); +} + static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable) { struct vhost_vdpa *v = &s->vhost_vdpa; @@ -511,7 +521,7 @@ dma_map_err: static int vhost_vdpa_net_cvq_start(NetClientState *nc) { - VhostVDPAState *s; + VhostVDPAState *s, *s0; struct vhost_vdpa *v; int64_t cvq_group; int r; @@ -522,7 +532,8 @@ static int vhost_vdpa_net_cvq_start(NetClientState *nc) s = DO_UPCAST(VhostVDPAState, nc, nc); v = &s->vhost_vdpa; - v->shadow_vqs_enabled = v->shared->shadow_data; + s0 = vhost_vdpa_net_first_nc_vdpa(s); + v->shadow_vqs_enabled = s0->vhost_vdpa.shadow_vqs_enabled; s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID; if (v->shared->shadow_data) { From c812b0655f8ccd1def48f14b89cec07e8fb68d83 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Wed, 14 Feb 2024 03:27:53 -0800 Subject: [PATCH 02/68] vdpa: factor out vhost_vdpa_last_dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generalize duplicated condition check for the last vq of vdpa device to a common function. Message-Id: <1707910082-10243-4-git-send-email-si-wei.liu@oracle.com> Reviewed-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Si-Wei Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-vdpa.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index ddae494ca8..8bd62e8dd2 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -555,6 +555,11 @@ static bool vhost_vdpa_first_dev(struct vhost_dev *dev) return v->index == 0; } +static bool vhost_vdpa_last_dev(struct vhost_dev *dev) +{ + return dev->vq_index + dev->nvqs == dev->vq_index_end; +} + static int vhost_vdpa_get_dev_features(struct vhost_dev *dev, uint64_t *features) { @@ -1315,7 +1320,7 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); } - if (dev->vq_index + dev->nvqs != dev->vq_index_end) { + if (!vhost_vdpa_last_dev(dev)) { return 0; } @@ -1337,7 +1342,7 @@ static void vhost_vdpa_reset_status(struct vhost_dev *dev) { struct vhost_vdpa *v = dev->opaque; - if (dev->vq_index + dev->nvqs != dev->vq_index_end) { + if (!vhost_vdpa_last_dev(dev)) { return; } From 77c3a336a44272e8a6e9b18c6b765f08aa84151f Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Wed, 14 Feb 2024 03:27:54 -0800 Subject: [PATCH 03/68] vdpa: factor out vhost_vdpa_net_get_nc_vdpa MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce new API. No functional change on existing API. Message-Id: <1707910082-10243-5-git-send-email-si-wei.liu@oracle.com> Reviewed-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Si-Wei Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- net/vhost-vdpa.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 1d9496cdd9..85efda9e67 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -288,13 +288,18 @@ static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf, } -/** From any vdpa net client, get the netclient of the first queue pair */ -static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s) +/** From any vdpa net client, get the netclient of the i-th queue pair */ +static VhostVDPAState *vhost_vdpa_net_get_nc_vdpa(VhostVDPAState *s, int i) { NICState *nic = qemu_get_nic(s->nc.peer); - NetClientState *nc0 = qemu_get_peer(nic->ncs, 0); + NetClientState *nc_i = qemu_get_peer(nic->ncs, i); - return DO_UPCAST(VhostVDPAState, nc, nc0); + return DO_UPCAST(VhostVDPAState, nc, nc_i); +} + +static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s) +{ + return vhost_vdpa_net_get_nc_vdpa(s, 0); } static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable) From 62845d3296ab7565e66f6e1f7bcfedb877f6fe7b Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Wed, 14 Feb 2024 03:27:55 -0800 Subject: [PATCH 04/68] vdpa: add vhost_vdpa_set_address_space_id trace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For better debuggability and observability. Message-Id: <1707910082-10243-6-git-send-email-si-wei.liu@oracle.com> Reviewed-by: Eugenio Pérez Signed-off-by: Si-Wei Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- net/trace-events | 3 +++ net/vhost-vdpa.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/net/trace-events b/net/trace-events index 823a071bdc..aab666a6a0 100644 --- a/net/trace-events +++ b/net/trace-events @@ -23,3 +23,6 @@ colo_compare_tcp_info(const char *pkt, uint32_t seq, uint32_t ack, int hdlen, in # filter-rewriter.c colo_filter_rewriter_pkt_info(const char *func, const char *src, const char *dst, uint32_t seq, uint32_t ack, uint32_t flag) "%s: src/dst: %s/%s p: seq/ack=%u/%u flags=0x%x" colo_filter_rewriter_conn_offset(uint32_t offset) ": offset=%u" + +# vhost-vdpa.c +vhost_vdpa_set_address_space_id(void *v, unsigned vq_group, unsigned asid_num) "vhost_vdpa: %p vq_group: %u asid: %u" diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 85efda9e67..9e8aded41d 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -29,6 +29,7 @@ #include "migration/migration.h" #include "migration/misc.h" #include "hw/virtio/vhost.h" +#include "trace.h" /* Todo:need to add the multiqueue support here */ typedef struct VhostVDPAState { @@ -460,6 +461,8 @@ static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v, }; int r; + trace_vhost_vdpa_set_address_space_id(v, vq_group, asid_num); + r = ioctl(v->shared->device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid); if (unlikely(r < 0)) { error_report("Can't set vq group %u asid %u, errno=%d (%s)", From 6ec0a7467828f228e00ec83978fb5267f81079e0 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Wed, 14 Feb 2024 03:27:56 -0800 Subject: [PATCH 05/68] vdpa: add vhost_vdpa_get_vring_base trace for svq mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For better debuggability and observability. Message-Id: <1707910082-10243-7-git-send-email-si-wei.liu@oracle.com> Reviewed-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Si-Wei Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/trace-events | 2 +- hw/virtio/vhost-vdpa.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 77905d1994..28d6d78380 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -58,7 +58,7 @@ vhost_vdpa_set_log_base(void *dev, uint64_t base, unsigned long long size, int r vhost_vdpa_set_vring_addr(void *dev, unsigned int index, unsigned int flags, uint64_t desc_user_addr, uint64_t used_user_addr, uint64_t avail_user_addr, uint64_t log_guest_addr) "dev: %p index: %u flags: 0x%x desc_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" log_guest_addr: 0x%"PRIx64 vhost_vdpa_set_vring_num(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u" vhost_vdpa_set_vring_base(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u" -vhost_vdpa_get_vring_base(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u" +vhost_vdpa_get_vring_base(void *dev, unsigned int index, unsigned int num, bool svq) "dev: %p index: %u num: %u svq: %d" vhost_vdpa_set_vring_kick(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d" vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d" vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64 diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 8bd62e8dd2..c7271093dd 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -1412,6 +1412,7 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, if (v->shadow_vqs_enabled) { ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index); + trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, true); return 0; } @@ -1424,7 +1425,7 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, } ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); - trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num); + trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, false); return ret; } From 19a060bce17316d9ff7d8b3637fb391010be8144 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Wed, 14 Feb 2024 03:27:57 -0800 Subject: [PATCH 06/68] vdpa: add vhost_vdpa_set_dev_vring_base trace for svq mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For better debuggability and observability. Message-Id: <1707910082-10243-8-git-send-email-si-wei.liu@oracle.com> Reviewed-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Si-Wei Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/trace-events | 2 +- hw/virtio/vhost-vdpa.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 28d6d78380..20577aa584 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -57,7 +57,7 @@ vhost_vdpa_dev_start(void *dev, bool started) "dev: %p started: %d" vhost_vdpa_set_log_base(void *dev, uint64_t base, unsigned long long size, int refcnt, int fd, void *log) "dev: %p base: 0x%"PRIx64" size: %llu refcnt: %d fd: %d log: %p" vhost_vdpa_set_vring_addr(void *dev, unsigned int index, unsigned int flags, uint64_t desc_user_addr, uint64_t used_user_addr, uint64_t avail_user_addr, uint64_t log_guest_addr) "dev: %p index: %u flags: 0x%x desc_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" log_guest_addr: 0x%"PRIx64 vhost_vdpa_set_vring_num(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u" -vhost_vdpa_set_vring_base(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u" +vhost_vdpa_set_dev_vring_base(void *dev, unsigned int index, unsigned int num, bool svq) "dev: %p index: %u num: %u svq: %d" vhost_vdpa_get_vring_base(void *dev, unsigned int index, unsigned int num, bool svq) "dev: %p index: %u num: %u svq: %d" vhost_vdpa_set_vring_kick(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d" vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d" diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index c7271093dd..fc84cf6ec6 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -970,7 +970,10 @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev, struct vhost_vring_state *ring) { - trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); + struct vhost_vdpa *v = dev->opaque; + + trace_vhost_vdpa_set_dev_vring_base(dev, ring->index, ring->num, + v->shadow_vqs_enabled); return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); } From faed74468fe4ade9503025094f8a03673c8bd416 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Wed, 14 Feb 2024 03:27:58 -0800 Subject: [PATCH 07/68] vdpa: add trace events for vhost_vdpa_net_load_cmd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For better debuggability and observability. Message-Id: <1707910082-10243-9-git-send-email-si-wei.liu@oracle.com> Reviewed-by: Eugenio Pérez Signed-off-by: Si-Wei Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- net/trace-events | 2 ++ net/vhost-vdpa.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/net/trace-events b/net/trace-events index aab666a6a0..88f56f2428 100644 --- a/net/trace-events +++ b/net/trace-events @@ -26,3 +26,5 @@ colo_filter_rewriter_conn_offset(uint32_t offset) ": offset=%u" # vhost-vdpa.c vhost_vdpa_set_address_space_id(void *v, unsigned vq_group, unsigned asid_num) "vhost_vdpa: %p vq_group: %u asid: %u" +vhost_vdpa_net_load_cmd(void *s, uint8_t class, uint8_t cmd, int data_num, int data_size) "vdpa state: %p class: %u cmd: %u sg_num: %d size: %d" +vhost_vdpa_net_load_cmd_retval(void *s, uint8_t class, uint8_t cmd, int r) "vdpa state: %p class: %u cmd: %u retval: %d" diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 9e8aded41d..2c95a98cc6 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -715,6 +715,7 @@ static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s, assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl)); cmd_size = sizeof(ctrl) + data_size; + trace_vhost_vdpa_net_load_cmd(s, class, cmd, data_num, data_size); if (vhost_svq_available_slots(svq) < 2 || iov_size(out_cursor, 1) < cmd_size) { /* @@ -746,6 +747,7 @@ static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s, r = vhost_vdpa_net_cvq_add(s, &out, 1, &in, 1); if (unlikely(r < 0)) { + trace_vhost_vdpa_net_load_cmd_retval(s, class, cmd, r); return r; } From 1c4eab477fb0aa5a039513c26dac63d3460e1b08 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Wed, 14 Feb 2024 03:27:59 -0800 Subject: [PATCH 08/68] vdpa: add trace event for vhost_vdpa_net_load_mq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For better debuggability and observability. Message-Id: <1707910082-10243-10-git-send-email-si-wei.liu@oracle.com> Reviewed-by: Eugenio Pérez Signed-off-by: Si-Wei Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- net/trace-events | 1 + net/vhost-vdpa.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/net/trace-events b/net/trace-events index 88f56f2428..cda960f42b 100644 --- a/net/trace-events +++ b/net/trace-events @@ -28,3 +28,4 @@ colo_filter_rewriter_conn_offset(uint32_t offset) ": offset=%u" vhost_vdpa_set_address_space_id(void *v, unsigned vq_group, unsigned asid_num) "vhost_vdpa: %p vq_group: %u asid: %u" vhost_vdpa_net_load_cmd(void *s, uint8_t class, uint8_t cmd, int data_num, int data_size) "vdpa state: %p class: %u cmd: %u sg_num: %d size: %d" vhost_vdpa_net_load_cmd_retval(void *s, uint8_t class, uint8_t cmd, int r) "vdpa state: %p class: %u cmd: %u retval: %d" +vhost_vdpa_net_load_mq(void *s, int ncurqps) "vdpa state: %p current_qpairs: %d" diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 2c95a98cc6..2254859dec 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -939,6 +939,8 @@ static int vhost_vdpa_net_load_mq(VhostVDPAState *s, return 0; } + trace_vhost_vdpa_net_load_mq(s, n->curr_queue_pairs); + mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs); const struct iovec data = { .iov_base = &mq, From bb000fff0a1d9774444c431e0831550e99e159ce Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Wed, 14 Feb 2024 03:28:00 -0800 Subject: [PATCH 09/68] vdpa: define SVQ transitioning state for mode switching Will be used in following patches. DISABLING(-1) means SVQ is being switched off to passthrough mode. ENABLING(1) means passthrough VQs are being switched to SVQ. DONE(0) means SVQ switching is completed. Message-Id: <1707910082-10243-11-git-send-email-si-wei.liu@oracle.com> Signed-off-by: Si-Wei Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/hw/virtio/vhost-vdpa.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h index 8f54e5edd4..0a9575b469 100644 --- a/include/hw/virtio/vhost-vdpa.h +++ b/include/hw/virtio/vhost-vdpa.h @@ -30,6 +30,12 @@ typedef struct VhostVDPAHostNotifier { void *addr; } VhostVDPAHostNotifier; +typedef enum SVQTransitionState { + SVQ_TSTATE_DISABLING = -1, + SVQ_TSTATE_DONE, + SVQ_TSTATE_ENABLING +} SVQTransitionState; + /* Info shared by all vhost_vdpa device models */ typedef struct vhost_vdpa_shared { int device_fd; @@ -47,6 +53,9 @@ typedef struct vhost_vdpa_shared { /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */ bool shadow_data; + + /* SVQ switching is in progress, or already completed? */ + SVQTransitionState svq_switching; } VhostVDPAShared; typedef struct vhost_vdpa { From db4cba36a7356f5e94e97735af8c578c55838386 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Wed, 14 Feb 2024 03:28:01 -0800 Subject: [PATCH 10/68] vdpa: indicate transitional state for SVQ switching svq_switching indicates the transitional state whether or not SVQ mode switching is in progress, and towards which direction. Add the neccessary state around where the switching would take place. Message-Id: <1707910082-10243-12-git-send-email-si-wei.liu@oracle.com> Signed-off-by: Si-Wei Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- net/vhost-vdpa.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 2254859dec..5f06c39f63 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -324,6 +324,8 @@ static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable) data_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1; cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ? n->max_ncs - n->max_queue_pairs : 0; + v->shared->svq_switching = enable ? + SVQ_TSTATE_ENABLING : SVQ_TSTATE_DISABLING; /* * TODO: vhost_net_stop does suspend, get_base and reset. We can be smarter * in the future and resume the device if read-only operations between @@ -336,6 +338,7 @@ static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable) if (unlikely(r < 0)) { error_report("unable to start vhost net: %s(%d)", g_strerror(-r), -r); } + v->shared->svq_switching = SVQ_TSTATE_DONE; } static int vdpa_net_migration_state_notifier(NotifierWithReturn *notifier, From 9ed62809b6e28ab0d887aff502ed24f77f1edafd Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Wed, 14 Feb 2024 03:28:02 -0800 Subject: [PATCH 11/68] vdpa: fix network breakage after cancelling migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix an issue where cancellation of ongoing migration ends up with no network connectivity. When canceling migration, SVQ will be switched back to the passthrough mode, but the right call fd is not programed to the device and the svq's own call fd is still used. At the point of this transitioning period, the shadow_vqs_enabled hadn't been set back to false yet, causing the installation of call fd inadvertently bypassed. Message-Id: <1707910082-10243-13-git-send-email-si-wei.liu@oracle.com> Fixes: a8ac88585da1 ("vhost: Add Shadow VirtQueue call forwarding capabilities") Cc: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Si-Wei Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-vdpa.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index fc84cf6ec6..c968278e70 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -1456,7 +1456,15 @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, /* Remember last call fd because we can switch to SVQ anytime. */ vhost_svq_set_svq_call_fd(svq, file->fd); - if (v->shadow_vqs_enabled) { + /* + * When SVQ is transitioning to off, shadow_vqs_enabled has + * not been set back to false yet, but the underlying call fd + * will have to switch back to the guest notifier to signal the + * passthrough virtqueues. In other situations, SVQ's own call + * fd shall be used to signal the device model. + */ + if (v->shadow_vqs_enabled && + v->shared->svq_switching != SVQ_TSTATE_DISABLING) { return 0; } From d884e2727849fc95d337262ec91be60165473c4a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:48 +0100 Subject: [PATCH 12/68] libvhost-user: Dynamically allocate memory for memory slots Let's prepare for increasing VHOST_USER_MAX_RAM_SLOTS by dynamically allocating dev->regions. We don't have any ABI guarantees (not dynamically linked), so we can simply change the layout of VuDev. Let's zero out the memory, just as we used to do. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-2-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 11 +++++++++++ subprojects/libvhost-user/libvhost-user.h | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a3b158c671..360c5366d6 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -2171,6 +2171,8 @@ vu_deinit(VuDev *dev) free(dev->vq); dev->vq = NULL; + free(dev->regions); + dev->regions = NULL; } bool @@ -2205,9 +2207,18 @@ vu_init(VuDev *dev, dev->backend_fd = -1; dev->max_queues = max_queues; + dev->regions = malloc(VHOST_USER_MAX_RAM_SLOTS * sizeof(dev->regions[0])); + if (!dev->regions) { + DPRINT("%s: failed to malloc mem regions\n", __func__); + return false; + } + memset(dev->regions, 0, VHOST_USER_MAX_RAM_SLOTS * sizeof(dev->regions[0])); + dev->vq = malloc(max_queues * sizeof(dev->vq[0])); if (!dev->vq) { DPRINT("%s: failed to malloc virtqueues\n", __func__); + free(dev->regions); + dev->regions = NULL; return false; } diff --git a/subprojects/libvhost-user/libvhost-user.h b/subprojects/libvhost-user/libvhost-user.h index c2352904f0..c882b4e3a2 100644 --- a/subprojects/libvhost-user/libvhost-user.h +++ b/subprojects/libvhost-user/libvhost-user.h @@ -398,7 +398,7 @@ typedef struct VuDevInflightInfo { struct VuDev { int sock; uint32_t nregions; - VuDevRegion regions[VHOST_USER_MAX_RAM_SLOTS]; + VuDevRegion *regions; VuVirtq *vq; VuDevInflightInfo inflight_info; int log_call_fd; From 0fa6344c90a093e2421ecac3c96a8823ad74ee89 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:49 +0100 Subject: [PATCH 13/68] libvhost-user: Bump up VHOST_USER_MAX_RAM_SLOTS to 509 Let's support up to 509 mem slots, just like vhost in the kernel usually does and the rust vhost-user implementation recently [1] started doing. This is required to properly support memory hotplug, either using multiple DIMMs (ACPI supports up to 256) or using virtio-mem. The 509 used to be the KVM limit, it supported 512, but 3 were used for internal purposes. Currently, KVM supports more than 512, but it usually doesn't make use of more than ~260 (i.e., 256 DIMMs + boot memory), except when other memory devices like PCI devices with BARs are used. So, 509 seems to work well for vhost in the kernel. Details can be found in the QEMU change that made virtio-mem consume up to 256 mem slots across all virtio-mem devices. [2] 509 mem slots implies 509 VMAs/mappings in the worst case (even though, in practice with virtio-mem we won't be seeing more than ~260 in most setups). With max_map_count under Linux defaulting to 64k, 509 mem slots still correspond to less than 1% of the maximum number of mappings. There are plenty left for the application to consume. [1] https://github.com/rust-vmm/vhost/pull/224 [2] https://lore.kernel.org/all/20230926185738.277351-1-david@redhat.com/ Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-3-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/subprojects/libvhost-user/libvhost-user.h b/subprojects/libvhost-user/libvhost-user.h index c882b4e3a2..deb40e77b3 100644 --- a/subprojects/libvhost-user/libvhost-user.h +++ b/subprojects/libvhost-user/libvhost-user.h @@ -31,10 +31,12 @@ #define VHOST_MEMORY_BASELINE_NREGIONS 8 /* - * Set a reasonable maximum number of ram slots, which will be supported by - * any architecture. + * vhost in the kernel usually supports 509 mem slots. 509 used to be the + * KVM limit, it supported 512, but 3 were used for internal purposes. This + * limit is sufficient to support many DIMMs and virtio-mem in + * "dynamic-memslots" mode. */ -#define VHOST_USER_MAX_RAM_SLOTS 32 +#define VHOST_USER_MAX_RAM_SLOTS 509 #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) From bec5820908919f70f87b57b92e4c92be128f5cfd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:50 +0100 Subject: [PATCH 14/68] libvhost-user: Factor out removing all mem regions Let's factor it out. Note that the check for MAP_FAILED was wrong as we never set mmap_addr if mmap() failed. We'll remove the NULL check separately. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-4-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 34 ++++++++++++----------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 360c5366d6..e4907dfc26 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -240,6 +240,22 @@ qva_to_va(VuDev *dev, uint64_t qemu_addr) return NULL; } +static void +vu_remove_all_mem_regs(VuDev *dev) +{ + unsigned int i; + + for (i = 0; i < dev->nregions; i++) { + VuDevRegion *r = &dev->regions[i]; + void *ma = (void *)(uintptr_t)r->mmap_addr; + + if (ma) { + munmap(ma, r->size + r->mmap_offset); + } + } + dev->nregions = 0; +} + static void vmsg_close_fds(VhostUserMsg *vmsg) { @@ -1003,14 +1019,7 @@ vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) unsigned int i; VhostUserMemory m = vmsg->payload.memory, *memory = &m; - for (i = 0; i < dev->nregions; i++) { - VuDevRegion *r = &dev->regions[i]; - void *ma = (void *) (uintptr_t) r->mmap_addr; - - if (ma) { - munmap(ma, r->size + r->mmap_offset); - } - } + vu_remove_all_mem_regs(dev); dev->nregions = memory->nregions; if (dev->postcopy_listening) { @@ -2112,14 +2121,7 @@ vu_deinit(VuDev *dev) { unsigned int i; - for (i = 0; i < dev->nregions; i++) { - VuDevRegion *r = &dev->regions[i]; - void *m = (void *) (uintptr_t) r->mmap_addr; - if (m != MAP_FAILED) { - munmap(m, r->size + r->mmap_offset); - } - } - dev->nregions = 0; + vu_remove_all_mem_regs(dev); for (i = 0; i < dev->max_queues; i++) { VuVirtq *vq = &dev->vq[i]; From 05a58ce47167a95b554f9afc47f78f267c394c6e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:51 +0100 Subject: [PATCH 15/68] libvhost-user: Merge vu_set_mem_table_exec_postcopy() into vu_set_mem_table_exec() Let's reduce some code duplication and prepare for further changes. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-5-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 119 +++++++--------------- 1 file changed, 39 insertions(+), 80 deletions(-) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index e4907dfc26..a7bd7de3cd 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -937,95 +937,23 @@ vu_get_shared_object(VuDev *dev, VhostUserMsg *vmsg) } static bool -vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) +vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) { - unsigned int i; VhostUserMemory m = vmsg->payload.memory, *memory = &m; - dev->nregions = memory->nregions; + int prot = PROT_READ | PROT_WRITE; + unsigned int i; - DPRINT("Nregions: %u\n", memory->nregions); - for (i = 0; i < dev->nregions; i++) { - void *mmap_addr; - VhostUserMemoryRegion *msg_region = &memory->regions[i]; - VuDevRegion *dev_region = &dev->regions[i]; - - DPRINT("Region %d\n", i); - DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", - msg_region->guest_phys_addr); - DPRINT(" memory_size: 0x%016"PRIx64"\n", - msg_region->memory_size); - DPRINT(" userspace_addr 0x%016"PRIx64"\n", - msg_region->userspace_addr); - DPRINT(" mmap_offset 0x%016"PRIx64"\n", - msg_region->mmap_offset); - - dev_region->gpa = msg_region->guest_phys_addr; - dev_region->size = msg_region->memory_size; - dev_region->qva = msg_region->userspace_addr; - dev_region->mmap_offset = msg_region->mmap_offset; - - /* We don't use offset argument of mmap() since the - * mapped address has to be page aligned, and we use huge - * pages. + if (dev->postcopy_listening) { + /* * In postcopy we're using PROT_NONE here to catch anyone * accessing it before we userfault */ - mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - PROT_NONE, MAP_SHARED | MAP_NORESERVE, - vmsg->fds[i], 0); - - if (mmap_addr == MAP_FAILED) { - vu_panic(dev, "region mmap error: %s", strerror(errno)); - } else { - dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; - DPRINT(" mmap_addr: 0x%016"PRIx64"\n", - dev_region->mmap_addr); - } - - /* Return the address to QEMU so that it can translate the ufd - * fault addresses back. - */ - msg_region->userspace_addr = (uintptr_t)(mmap_addr + - dev_region->mmap_offset); - close(vmsg->fds[i]); + prot = PROT_NONE; } - /* Send the message back to qemu with the addresses filled in */ - vmsg->fd_num = 0; - if (!vu_send_reply(dev, dev->sock, vmsg)) { - vu_panic(dev, "failed to respond to set-mem-table for postcopy"); - return false; - } - - /* Wait for QEMU to confirm that it's registered the handler for the - * faults. - */ - if (!dev->read_msg(dev, dev->sock, vmsg) || - vmsg->size != sizeof(vmsg->payload.u64) || - vmsg->payload.u64 != 0) { - vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table"); - return false; - } - - /* OK, now we can go and register the memory and generate faults */ - (void)generate_faults(dev); - - return false; -} - -static bool -vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) -{ - unsigned int i; - VhostUserMemory m = vmsg->payload.memory, *memory = &m; - vu_remove_all_mem_regs(dev); dev->nregions = memory->nregions; - if (dev->postcopy_listening) { - return vu_set_mem_table_exec_postcopy(dev, vmsg); - } - DPRINT("Nregions: %u\n", memory->nregions); for (i = 0; i < dev->nregions; i++) { void *mmap_addr; @@ -1051,8 +979,7 @@ vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) * mapped address has to be page aligned, and we use huge * pages. */ mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, - vmsg->fds[i], 0); + prot, MAP_SHARED | MAP_NORESERVE, vmsg->fds[i], 0); if (mmap_addr == MAP_FAILED) { vu_panic(dev, "region mmap error: %s", strerror(errno)); @@ -1062,9 +989,41 @@ vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) dev_region->mmap_addr); } + if (dev->postcopy_listening) { + /* + * Return the address to QEMU so that it can translate the ufd + * fault addresses back. + */ + msg_region->userspace_addr = (uintptr_t)(mmap_addr + + dev_region->mmap_offset); + } close(vmsg->fds[i]); } + if (dev->postcopy_listening) { + /* Send the message back to qemu with the addresses filled in */ + vmsg->fd_num = 0; + if (!vu_send_reply(dev, dev->sock, vmsg)) { + vu_panic(dev, "failed to respond to set-mem-table for postcopy"); + return false; + } + + /* + * Wait for QEMU to confirm that it's registered the handler for the + * faults. + */ + if (!dev->read_msg(dev, dev->sock, vmsg) || + vmsg->size != sizeof(vmsg->payload.u64) || + vmsg->payload.u64 != 0) { + vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table"); + return false; + } + + /* OK, now we can go and register the memory and generate faults */ + (void)generate_faults(dev); + return false; + } + for (i = 0; i < dev->max_queues; i++) { if (dev->vq[i].vring.desc) { if (map_ring(dev, &dev->vq[i])) { From 93fec23d8cecebf0e6917044a0c1635df20e350d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:52 +0100 Subject: [PATCH 16/68] libvhost-user: Factor out adding a memory region Let's factor it out, reducing quite some code duplication and perparing for further changes. If we fail to mmap a region and panic, we now simply don't add that (broken) region. Note that we now increment dev->nregions as we are successfully adding memory regions, and don't increment dev->nregions if anything went wrong. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-6-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 168 ++++++++-------------- 1 file changed, 60 insertions(+), 108 deletions(-) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a7bd7de3cd..f43b5096d0 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -256,6 +256,61 @@ vu_remove_all_mem_regs(VuDev *dev) dev->nregions = 0; } +static void +_vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) +{ + int prot = PROT_READ | PROT_WRITE; + VuDevRegion *r; + void *mmap_addr; + + DPRINT("Adding region %d\n", dev->nregions); + DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", + msg_region->guest_phys_addr); + DPRINT(" memory_size: 0x%016"PRIx64"\n", + msg_region->memory_size); + DPRINT(" userspace_addr: 0x%016"PRIx64"\n", + msg_region->userspace_addr); + DPRINT(" mmap_offset: 0x%016"PRIx64"\n", + msg_region->mmap_offset); + + if (dev->postcopy_listening) { + /* + * In postcopy we're using PROT_NONE here to catch anyone + * accessing it before we userfault + */ + prot = PROT_NONE; + } + + /* + * We don't use offset argument of mmap() since the mapped address has + * to be page aligned, and we use huge pages. + */ + mmap_addr = mmap(0, msg_region->memory_size + msg_region->mmap_offset, + prot, MAP_SHARED | MAP_NORESERVE, fd, 0); + if (mmap_addr == MAP_FAILED) { + vu_panic(dev, "region mmap error: %s", strerror(errno)); + return; + } + DPRINT(" mmap_addr: 0x%016"PRIx64"\n", + (uint64_t)(uintptr_t)mmap_addr); + + r = &dev->regions[dev->nregions]; + r->gpa = msg_region->guest_phys_addr; + r->size = msg_region->memory_size; + r->qva = msg_region->userspace_addr; + r->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; + r->mmap_offset = msg_region->mmap_offset; + dev->nregions++; + + if (dev->postcopy_listening) { + /* + * Return the address to QEMU so that it can translate the ufd + * fault addresses back. + */ + msg_region->userspace_addr = r->mmap_addr + r->mmap_offset; + } +} + static void vmsg_close_fds(VhostUserMsg *vmsg) { @@ -727,10 +782,7 @@ generate_faults(VuDev *dev) { static bool vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { int i; - bool track_ramblocks = dev->postcopy_listening; VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; - VuDevRegion *dev_region = &dev->regions[dev->nregions]; - void *mmap_addr; if (vmsg->fd_num != 1) { vmsg_close_fds(vmsg); @@ -760,69 +812,20 @@ vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { * we know all the postcopy client bases have been received, and we * should start generating faults. */ - if (track_ramblocks && + if (dev->postcopy_listening && vmsg->size == sizeof(vmsg->payload.u64) && vmsg->payload.u64 == 0) { (void)generate_faults(dev); return false; } - DPRINT("Adding region: %u\n", dev->nregions); - DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", - msg_region->guest_phys_addr); - DPRINT(" memory_size: 0x%016"PRIx64"\n", - msg_region->memory_size); - DPRINT(" userspace_addr 0x%016"PRIx64"\n", - msg_region->userspace_addr); - DPRINT(" mmap_offset 0x%016"PRIx64"\n", - msg_region->mmap_offset); - - dev_region->gpa = msg_region->guest_phys_addr; - dev_region->size = msg_region->memory_size; - dev_region->qva = msg_region->userspace_addr; - dev_region->mmap_offset = msg_region->mmap_offset; - - /* - * We don't use offset argument of mmap() since the - * mapped address has to be page aligned, and we use huge - * pages. - */ - if (track_ramblocks) { - /* - * In postcopy we're using PROT_NONE here to catch anyone - * accessing it before we userfault. - */ - mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - PROT_NONE, MAP_SHARED | MAP_NORESERVE, - vmsg->fds[0], 0); - } else { - mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, - vmsg->fds[0], 0); - } - - if (mmap_addr == MAP_FAILED) { - vu_panic(dev, "region mmap error: %s", strerror(errno)); - } else { - dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; - DPRINT(" mmap_addr: 0x%016"PRIx64"\n", - dev_region->mmap_addr); - } - + _vu_add_mem_reg(dev, msg_region, vmsg->fds[0]); close(vmsg->fds[0]); - if (track_ramblocks) { - /* - * Return the address to QEMU so that it can translate the ufd - * fault addresses back. - */ - msg_region->userspace_addr = (uintptr_t)(mmap_addr + - dev_region->mmap_offset); - + if (dev->postcopy_listening) { /* Send the message back to qemu with the addresses filled in. */ vmsg->fd_num = 0; DPRINT("Successfully added new region in postcopy\n"); - dev->nregions++; return true; } else { for (i = 0; i < dev->max_queues; i++) { @@ -835,7 +838,6 @@ vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { } DPRINT("Successfully added new region\n"); - dev->nregions++; return false; } } @@ -940,63 +942,13 @@ static bool vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) { VhostUserMemory m = vmsg->payload.memory, *memory = &m; - int prot = PROT_READ | PROT_WRITE; unsigned int i; - if (dev->postcopy_listening) { - /* - * In postcopy we're using PROT_NONE here to catch anyone - * accessing it before we userfault - */ - prot = PROT_NONE; - } - vu_remove_all_mem_regs(dev); - dev->nregions = memory->nregions; DPRINT("Nregions: %u\n", memory->nregions); - for (i = 0; i < dev->nregions; i++) { - void *mmap_addr; - VhostUserMemoryRegion *msg_region = &memory->regions[i]; - VuDevRegion *dev_region = &dev->regions[i]; - - DPRINT("Region %d\n", i); - DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", - msg_region->guest_phys_addr); - DPRINT(" memory_size: 0x%016"PRIx64"\n", - msg_region->memory_size); - DPRINT(" userspace_addr 0x%016"PRIx64"\n", - msg_region->userspace_addr); - DPRINT(" mmap_offset 0x%016"PRIx64"\n", - msg_region->mmap_offset); - - dev_region->gpa = msg_region->guest_phys_addr; - dev_region->size = msg_region->memory_size; - dev_region->qva = msg_region->userspace_addr; - dev_region->mmap_offset = msg_region->mmap_offset; - - /* We don't use offset argument of mmap() since the - * mapped address has to be page aligned, and we use huge - * pages. */ - mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - prot, MAP_SHARED | MAP_NORESERVE, vmsg->fds[i], 0); - - if (mmap_addr == MAP_FAILED) { - vu_panic(dev, "region mmap error: %s", strerror(errno)); - } else { - dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; - DPRINT(" mmap_addr: 0x%016"PRIx64"\n", - dev_region->mmap_addr); - } - - if (dev->postcopy_listening) { - /* - * Return the address to QEMU so that it can translate the ufd - * fault addresses back. - */ - msg_region->userspace_addr = (uintptr_t)(mmap_addr + - dev_region->mmap_offset); - } + for (i = 0; i < memory->nregions; i++) { + _vu_add_mem_reg(dev, &memory->regions[i], vmsg->fds[i]); close(vmsg->fds[i]); } From 4f865c3b15a71dbeae999a10256742751410394e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:53 +0100 Subject: [PATCH 17/68] libvhost-user: No need to check for NULL when unmapping We never add a memory region if mmap() failed. Therefore, no need to check for NULL. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-7-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index f43b5096d0..225283f764 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -247,11 +247,8 @@ vu_remove_all_mem_regs(VuDev *dev) for (i = 0; i < dev->nregions; i++) { VuDevRegion *r = &dev->regions[i]; - void *ma = (void *)(uintptr_t)r->mmap_addr; - if (ma) { - munmap(ma, r->size + r->mmap_offset); - } + munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset); } dev->nregions = 0; } @@ -888,11 +885,8 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { for (i = 0; i < dev->nregions; i++) { if (reg_equal(&dev->regions[i], msg_region)) { VuDevRegion *r = &dev->regions[i]; - void *ma = (void *) (uintptr_t) r->mmap_addr; - if (ma) { - munmap(ma, r->size + r->mmap_offset); - } + munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset); /* * Shift all affected entries by 1 to close the hole at index i and From c6f90b785291389168b15fe797788bcbb5a7803a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:54 +0100 Subject: [PATCH 18/68] libvhost-user: Don't zero out memory for memory regions dev->nregions always covers only valid entries. Stop zeroing out other array elements that are unused. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-8-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 225283f764..2e8b611385 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -888,13 +888,9 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset); - /* - * Shift all affected entries by 1 to close the hole at index i and - * zero out the last entry. - */ + /* Shift all affected entries by 1 to close the hole at index. */ memmove(dev->regions + i, dev->regions + i + 1, sizeof(VuDevRegion) * (dev->nregions - i - 1)); - memset(dev->regions + dev->nregions - 1, 0, sizeof(VuDevRegion)); DPRINT("Successfully removed a region\n"); dev->nregions--; i--; @@ -2119,7 +2115,6 @@ vu_init(VuDev *dev, DPRINT("%s: failed to malloc mem regions\n", __func__); return false; } - memset(dev->regions, 0, VHOST_USER_MAX_RAM_SLOTS * sizeof(dev->regions[0])); dev->vq = malloc(max_queues * sizeof(dev->vq[0])); if (!dev->vq) { From 9c254cb413a631f6601e6e34429547759d7d63a6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:55 +0100 Subject: [PATCH 19/68] libvhost-user: Don't search for duplicates when removing memory regions We cannot have duplicate memory regions, something would be deeply flawed elsewhere. Let's just stop the search once we found an entry. We'll add more sanity checks when adding memory regions later. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-9-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 2e8b611385..7f29e01c30 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -896,8 +896,7 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { i--; found = true; - - /* Continue the search for eventual duplicates. */ + break; } } From 60ccdca42d96f730f57478caaf376da90c3d97f3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:56 +0100 Subject: [PATCH 20/68] libvhost-user: Factor out search for memory region by GPA and simplify Memory regions cannot overlap, and if we ever hit that case something would be really flawed. For example, when vhost code in QEMU decides to increase the size of memory regions to cover full huge pages, it makes sure to never create overlaps, and if there would be overlaps, it would bail out. QEMU commits 48d7c9757749 ("vhost: Merge sections added to temporary list"), c1ece84e7c93 ("vhost: Huge page align and merge") and e7b94a84b6cb ("vhost: Allow adjoining regions") added and clarified that handling and how overlaps are impossible. Consequently, each GPA can belong to at most one memory region, and everything else doesn't make sense. Let's factor out our search to prepare for further changes. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-10-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 79 +++++++++++++---------- 1 file changed, 45 insertions(+), 34 deletions(-) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 7f29e01c30..d72f25396d 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -195,30 +195,47 @@ vu_panic(VuDev *dev, const char *msg, ...) */ } +/* Search for a memory region that covers this guest physical address. */ +static VuDevRegion * +vu_gpa_to_mem_region(VuDev *dev, uint64_t guest_addr) +{ + unsigned int i; + + /* + * Memory regions cannot overlap in guest physical address space. Each + * GPA belongs to exactly one memory region, so there can only be one + * match. + */ + for (i = 0; i < dev->nregions; i++) { + VuDevRegion *cur = &dev->regions[i]; + + if (guest_addr >= cur->gpa && guest_addr < cur->gpa + cur->size) { + return cur; + } + } + return NULL; +} + /* Translate guest physical address to our virtual address. */ void * vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr) { - unsigned int i; + VuDevRegion *r; if (*plen == 0) { return NULL; } - /* Find matching memory region. */ - for (i = 0; i < dev->nregions; i++) { - VuDevRegion *r = &dev->regions[i]; - - if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { - if ((guest_addr + *plen) > (r->gpa + r->size)) { - *plen = r->gpa + r->size - guest_addr; - } - return (void *)(uintptr_t) - guest_addr - r->gpa + r->mmap_addr + r->mmap_offset; - } + r = vu_gpa_to_mem_region(dev, guest_addr); + if (!r) { + return NULL; } - return NULL; + if ((guest_addr + *plen) > (r->gpa + r->size)) { + *plen = r->gpa + r->size - guest_addr; + } + return (void *)(uintptr_t)guest_addr - r->gpa + r->mmap_addr + + r->mmap_offset; } /* Translate qemu virtual address to our virtual address. */ @@ -854,8 +871,8 @@ static inline bool reg_equal(VuDevRegion *vudev_reg, static bool vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; - unsigned int i; - bool found = false; + unsigned int idx; + VuDevRegion *r; if (vmsg->fd_num > 1) { vmsg_close_fds(vmsg); @@ -882,28 +899,22 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { DPRINT(" mmap_offset 0x%016"PRIx64"\n", msg_region->mmap_offset); - for (i = 0; i < dev->nregions; i++) { - if (reg_equal(&dev->regions[i], msg_region)) { - VuDevRegion *r = &dev->regions[i]; - - munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset); - - /* Shift all affected entries by 1 to close the hole at index. */ - memmove(dev->regions + i, dev->regions + i + 1, - sizeof(VuDevRegion) * (dev->nregions - i - 1)); - DPRINT("Successfully removed a region\n"); - dev->nregions--; - i--; - - found = true; - break; - } - } - - if (!found) { + r = vu_gpa_to_mem_region(dev, msg_region->guest_phys_addr); + if (!r || !reg_equal(r, msg_region)) { + vmsg_close_fds(vmsg); vu_panic(dev, "Specified region not found\n"); + return false; } + munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset); + + idx = r - dev->regions; + assert(idx < dev->nregions); + /* Shift all affected entries by 1 to close the hole. */ + memmove(r, r + 1, sizeof(VuDevRegion) * (dev->nregions - idx - 1)); + DPRINT("Successfully removed a region\n"); + dev->nregions--; + vmsg_close_fds(vmsg); return false; From a3c0118c5a1e64d32a2208388b5ea90e2ec9d214 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:57 +0100 Subject: [PATCH 21/68] libvhost-user: Speedup gpa_to_mem_region() and vu_gpa_to_va() Let's speed up GPA to memory region / virtual address lookup. Store the memory regions ordered by guest physical addresses, and use binary search for address translation, as well as when adding/removing memory regions. Most importantly, this will speed up GPA->VA address translation when we have many memslots. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-11-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 49 +++++++++++++++++++++-- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index d72f25396d..ef6353d847 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -199,19 +199,30 @@ vu_panic(VuDev *dev, const char *msg, ...) static VuDevRegion * vu_gpa_to_mem_region(VuDev *dev, uint64_t guest_addr) { - unsigned int i; + int low = 0; + int high = dev->nregions - 1; /* * Memory regions cannot overlap in guest physical address space. Each * GPA belongs to exactly one memory region, so there can only be one * match. + * + * We store our memory regions ordered by GPA and can simply perform a + * binary search. */ - for (i = 0; i < dev->nregions; i++) { - VuDevRegion *cur = &dev->regions[i]; + while (low <= high) { + unsigned int mid = low + (high - low) / 2; + VuDevRegion *cur = &dev->regions[mid]; if (guest_addr >= cur->gpa && guest_addr < cur->gpa + cur->size) { return cur; } + if (guest_addr >= cur->gpa + cur->size) { + low = mid + 1; + } + if (guest_addr < cur->gpa) { + high = mid - 1; + } } return NULL; } @@ -273,9 +284,14 @@ vu_remove_all_mem_regs(VuDev *dev) static void _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) { + const uint64_t start_gpa = msg_region->guest_phys_addr; + const uint64_t end_gpa = start_gpa + msg_region->memory_size; int prot = PROT_READ | PROT_WRITE; VuDevRegion *r; void *mmap_addr; + int low = 0; + int high = dev->nregions - 1; + unsigned int idx; DPRINT("Adding region %d\n", dev->nregions); DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", @@ -295,6 +311,29 @@ _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) prot = PROT_NONE; } + /* + * We will add memory regions into the array sorted by GPA. Perform a + * binary search to locate the insertion point: it will be at the low + * index. + */ + while (low <= high) { + unsigned int mid = low + (high - low) / 2; + VuDevRegion *cur = &dev->regions[mid]; + + /* Overlap of GPA addresses. */ + if (start_gpa < cur->gpa + cur->size && cur->gpa < end_gpa) { + vu_panic(dev, "regions with overlapping guest physical addresses"); + return; + } + if (start_gpa >= cur->gpa + cur->size) { + low = mid + 1; + } + if (start_gpa < cur->gpa) { + high = mid - 1; + } + } + idx = low; + /* * We don't use offset argument of mmap() since the mapped address has * to be page aligned, and we use huge pages. @@ -308,7 +347,9 @@ _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) DPRINT(" mmap_addr: 0x%016"PRIx64"\n", (uint64_t)(uintptr_t)mmap_addr); - r = &dev->regions[dev->nregions]; + /* Shift all affected entries by 1 to open a hole at idx. */ + r = &dev->regions[idx]; + memmove(r + 1, r, sizeof(VuDevRegion) * (dev->nregions - idx)); r->gpa = msg_region->guest_phys_addr; r->size = msg_region->memory_size; r->qva = msg_region->userspace_addr; From b2b63008b311126ade0a44ddb488d3704dffa51f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:58 +0100 Subject: [PATCH 22/68] libvhost-user: Use most of mmap_offset as fd_offset In the past, QEMU would create memory regions that could partially cover hugetlb pages, making mmap() fail if we would use the mmap_offset as an fd_offset. For that reason, we never used the mmap_offset as an offset into the fd and instead always mapped the fd from the very start. However, that can easily result in us mmap'ing a lot of unnecessary parts of an fd, possibly repeatedly. QEMU nowadays does not create memory regions that partially cover huge pages -- it never really worked with postcopy. QEMU handles merging of regions that partially cover huge pages (due to holes in boot memory) since 2018 in c1ece84e7c93 ("vhost: Huge page align and merge"). Let's be a bit careful and not unconditionally convert the mmap_offset into an fd_offset. Instead, let's simply detect the hugetlb size and pass as much as we can as fd_offset, making sure that we call mmap() with a properly aligned offset. With QEMU and a virtio-mem device that is fully plugged (50GiB using 50 memslots) the qemu-storage daemon process consumes in the VA space 1281GiB before this change and 58GiB after this change. ================ Vhost user message ================ Request: VHOST_USER_ADD_MEM_REG (37) Flags: 0x9 Size: 40 Fds: 59 Adding region 4 guest_phys_addr: 0x0000000200000000 memory_size: 0x0000000040000000 userspace_addr: 0x00007fb73bffe000 old mmap_offset: 0x0000000080000000 fd_offset: 0x0000000080000000 new mmap_offset: 0x0000000000000000 mmap_addr: 0x00007f02f1bdc000 Successfully added new region ================ Vhost user message ================ Request: VHOST_USER_ADD_MEM_REG (37) Flags: 0x9 Size: 40 Fds: 59 Adding region 5 guest_phys_addr: 0x0000000240000000 memory_size: 0x0000000040000000 userspace_addr: 0x00007fb77bffe000 old mmap_offset: 0x00000000c0000000 fd_offset: 0x00000000c0000000 new mmap_offset: 0x0000000000000000 mmap_addr: 0x00007f0284000000 Successfully added new region Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-12-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 54 ++++++++++++++++++++--- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index ef6353d847..55aef5fcc6 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -43,6 +43,8 @@ #include #include #include +#include +#include #ifdef __NR_userfaultfd #include @@ -281,12 +283,32 @@ vu_remove_all_mem_regs(VuDev *dev) dev->nregions = 0; } +static size_t +get_fd_hugepagesize(int fd) +{ +#if defined(__linux__) + struct statfs fs; + int ret; + + do { + ret = fstatfs(fd, &fs); + } while (ret != 0 && errno == EINTR); + + if (!ret && (unsigned int)fs.f_type == HUGETLBFS_MAGIC) { + return fs.f_bsize; + } +#endif + return 0; +} + static void _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) { const uint64_t start_gpa = msg_region->guest_phys_addr; const uint64_t end_gpa = start_gpa + msg_region->memory_size; int prot = PROT_READ | PROT_WRITE; + uint64_t mmap_offset, fd_offset; + size_t hugepagesize; VuDevRegion *r; void *mmap_addr; int low = 0; @@ -300,7 +322,7 @@ _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) msg_region->memory_size); DPRINT(" userspace_addr: 0x%016"PRIx64"\n", msg_region->userspace_addr); - DPRINT(" mmap_offset: 0x%016"PRIx64"\n", + DPRINT(" old mmap_offset: 0x%016"PRIx64"\n", msg_region->mmap_offset); if (dev->postcopy_listening) { @@ -335,11 +357,31 @@ _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) idx = low; /* - * We don't use offset argument of mmap() since the mapped address has - * to be page aligned, and we use huge pages. + * Convert most of msg_region->mmap_offset to fd_offset. In almost all + * cases, this will leave us with mmap_offset == 0, mmap()'ing only + * what we really need. Only if a memory region would partially cover + * hugetlb pages, we'd get mmap_offset != 0, which usually doesn't happen + * anymore (i.e., modern QEMU). + * + * Note that mmap() with hugetlb would fail if the offset into the file + * is not aligned to the huge page size. */ - mmap_addr = mmap(0, msg_region->memory_size + msg_region->mmap_offset, - prot, MAP_SHARED | MAP_NORESERVE, fd, 0); + hugepagesize = get_fd_hugepagesize(fd); + if (hugepagesize) { + fd_offset = ALIGN_DOWN(msg_region->mmap_offset, hugepagesize); + mmap_offset = msg_region->mmap_offset - fd_offset; + } else { + fd_offset = msg_region->mmap_offset; + mmap_offset = 0; + } + + DPRINT(" fd_offset: 0x%016"PRIx64"\n", + fd_offset); + DPRINT(" new mmap_offset: 0x%016"PRIx64"\n", + mmap_offset); + + mmap_addr = mmap(0, msg_region->memory_size + mmap_offset, + prot, MAP_SHARED | MAP_NORESERVE, fd, fd_offset); if (mmap_addr == MAP_FAILED) { vu_panic(dev, "region mmap error: %s", strerror(errno)); return; @@ -354,7 +396,7 @@ _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) r->size = msg_region->memory_size; r->qva = msg_region->userspace_addr; r->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; - r->mmap_offset = msg_region->mmap_offset; + r->mmap_offset = mmap_offset; dev->nregions++; if (dev->postcopy_listening) { From 2a29022768f1777d71e26b784a264323d1914dd6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:16:59 +0100 Subject: [PATCH 23/68] libvhost-user: Factor out vq usability check Let's factor it out to prepare for further changes. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-13-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 55aef5fcc6..ed0a978d4f 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -283,6 +283,12 @@ vu_remove_all_mem_regs(VuDev *dev) dev->nregions = 0; } +static bool +vu_is_vq_usable(VuDev *dev, VuVirtq *vq) +{ + return likely(!dev->broken) && likely(vq->vring.avail); +} + static size_t get_fd_hugepagesize(int fd) { @@ -2380,8 +2386,7 @@ vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes, idx = vq->last_avail_idx; total_bufs = in_total = out_total = 0; - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { goto done; } @@ -2496,8 +2501,7 @@ vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes, bool vu_queue_empty(VuDev *dev, VuVirtq *vq) { - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { return true; } @@ -2536,8 +2540,7 @@ vring_notify(VuDev *dev, VuVirtq *vq) static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync) { - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { return; } @@ -2862,8 +2865,7 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz) unsigned int head; VuVirtqElement *elem; - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { return NULL; } @@ -3020,8 +3022,7 @@ vu_queue_fill(VuDev *dev, VuVirtq *vq, { struct vring_used_elem uelem; - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { return; } @@ -3050,8 +3051,7 @@ vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count) { uint16_t old, new; - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { return; } From 67f4f663cd6179d57f3e5a558f1526c7dc8c6742 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:17:00 +0100 Subject: [PATCH 24/68] libvhost-user: Dynamically remap rings after (temporarily?) removing memory regions Currently, we try to remap all rings whenever we add a single new memory region. That doesn't quite make sense, because we already map rings when setting the ring address, and panic if that goes wrong. Likely, that handling was simply copied from set_mem_table code, where we actually have to remap all rings. Remapping all rings might require us to walk quite a lot of memory regions to perform the address translations. Ideally, we'd simply remove that remapping. However, let's be a bit careful. There might be some weird corner cases where we might temporarily remove a single memory region (e.g., resize it), that would have worked for now. Further, a ring might be located on hotplugged memory, and as the VM reboots, we might unplug that memory, to hotplug memory before resetting the ring addresses. So let's unmap affected rings as we remove a memory region, and try dynamically mapping the ring again when required. Acked-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-14-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 107 ++++++++++++++++------ 1 file changed, 78 insertions(+), 29 deletions(-) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index ed0a978d4f..61fb3050b3 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -283,10 +283,75 @@ vu_remove_all_mem_regs(VuDev *dev) dev->nregions = 0; } +static bool +map_ring(VuDev *dev, VuVirtq *vq) +{ + vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr); + vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr); + vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr); + + DPRINT("Setting virtq addresses:\n"); + DPRINT(" vring_desc at %p\n", vq->vring.desc); + DPRINT(" vring_used at %p\n", vq->vring.used); + DPRINT(" vring_avail at %p\n", vq->vring.avail); + + return !(vq->vring.desc && vq->vring.used && vq->vring.avail); +} + static bool vu_is_vq_usable(VuDev *dev, VuVirtq *vq) { - return likely(!dev->broken) && likely(vq->vring.avail); + if (unlikely(dev->broken)) { + return false; + } + + if (likely(vq->vring.avail)) { + return true; + } + + /* + * In corner cases, we might temporarily remove a memory region that + * mapped a ring. When removing a memory region we make sure to + * unmap any rings that would be impacted. Let's try to remap if we + * already succeeded mapping this ring once. + */ + if (!vq->vra.desc_user_addr || !vq->vra.used_user_addr || + !vq->vra.avail_user_addr) { + return false; + } + if (map_ring(dev, vq)) { + vu_panic(dev, "remapping queue on access"); + return false; + } + return true; +} + +static void +unmap_rings(VuDev *dev, VuDevRegion *r) +{ + int i; + + for (i = 0; i < dev->max_queues; i++) { + VuVirtq *vq = &dev->vq[i]; + const uintptr_t desc = (uintptr_t)vq->vring.desc; + const uintptr_t used = (uintptr_t)vq->vring.used; + const uintptr_t avail = (uintptr_t)vq->vring.avail; + + if (desc < r->mmap_addr || desc >= r->mmap_addr + r->size) { + continue; + } + if (used < r->mmap_addr || used >= r->mmap_addr + r->size) { + continue; + } + if (avail < r->mmap_addr || avail >= r->mmap_addr + r->size) { + continue; + } + + DPRINT("Unmapping rings of queue %d\n", i); + vq->vring.desc = NULL; + vq->vring.used = NULL; + vq->vring.avail = NULL; + } } static size_t @@ -786,21 +851,6 @@ vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg) return false; } -static bool -map_ring(VuDev *dev, VuVirtq *vq) -{ - vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr); - vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr); - vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr); - - DPRINT("Setting virtq addresses:\n"); - DPRINT(" vring_desc at %p\n", vq->vring.desc); - DPRINT(" vring_used at %p\n", vq->vring.used); - DPRINT(" vring_avail at %p\n", vq->vring.avail); - - return !(vq->vring.desc && vq->vring.used && vq->vring.avail); -} - static bool generate_faults(VuDev *dev) { unsigned int i; @@ -884,7 +934,6 @@ generate_faults(VuDev *dev) { static bool vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { - int i; VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; if (vmsg->fd_num != 1) { @@ -930,19 +979,9 @@ vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { vmsg->fd_num = 0; DPRINT("Successfully added new region in postcopy\n"); return true; - } else { - for (i = 0; i < dev->max_queues; i++) { - if (dev->vq[i].vring.desc) { - if (map_ring(dev, &dev->vq[i])) { - vu_panic(dev, "remapping queue %d for new memory region", - i); - } - } - } - - DPRINT("Successfully added new region\n"); - return false; } + DPRINT("Successfully added new region\n"); + return false; } static inline bool reg_equal(VuDevRegion *vudev_reg, @@ -995,6 +1034,16 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { return false; } + /* + * There might be valid cases where we temporarily remove memory regions + * to readd them again, or remove memory regions and don't use the rings + * anymore before we set the ring addresses and restart the device. + * + * Unmap all affected rings, remapping them on demand later. This should + * be a corner case. + */ + unmap_rings(dev, r); + munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset); idx = r - dev->regions; From 52767e1063beaa17d59c739efd0b9c342923929d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 16:17:01 +0100 Subject: [PATCH 25/68] libvhost-user: Mark mmap'ed region memory as MADV_DONTDUMP We already use MADV_NORESERVE to deal with sparse memory regions. Let's also set madvise(MADV_DONTDUMP), otherwise a crash of the process can result in us allocating all memory in the mmap'ed region for dumping purposes. This change implies that the mmap'ed rings won't be included in a coredump. If ever required for debugging purposes, we could mark only the mapped rings MADV_DODUMP. Ignore errors during madvise() for now. Reviewed-by: Raphael Norwitz Acked-by: Stefano Garzarella Signed-off-by: David Hildenbrand Message-Id: <20240214151701.29906-15-david@redhat.com> Tested-by: Mario Casquero Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- subprojects/libvhost-user/libvhost-user.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 61fb3050b3..a879149fef 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -460,6 +460,12 @@ _vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) DPRINT(" mmap_addr: 0x%016"PRIx64"\n", (uint64_t)(uintptr_t)mmap_addr); +#if defined(__linux__) + /* Don't include all guest memory in a coredump. */ + madvise(mmap_addr, msg_region->memory_size + mmap_offset, + MADV_DONTDUMP); +#endif + /* Shift all affected entries by 1 to open a hole at idx. */ r = &dev->regions[idx]; memmove(r + 1, r, sizeof(VuDevRegion) * (dev->nregions - idx)); From c08da86dc412cd44039bc78df02227578bc06268 Mon Sep 17 00:00:00 2001 From: Lukas Stockner Date: Thu, 15 Feb 2024 02:23:26 +0100 Subject: [PATCH 26/68] pcie: Support PCIe Gen5/Gen6 link speeds This patch extends the PCIe link speed option so that slots can be configured as supporting 32GT/s (Gen5) or 64GT/s (Gen5) speeds. This is as simple as setting the appropriate bit in LnkCap2 and the appropriate value in LnkCap and LnkCtl2. Signed-off-by: Lukas Stockner Message-Id: <20240215012326.3272366-1-lstockner@genesiscloud.com> Reviewed-by: Manos Pitsidianakis Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/core/qdev-properties-system.c | 16 ++++++++++++++-- hw/pci/pcie.c | 8 ++++++++ include/hw/pci/pcie_regs.h | 2 ++ qapi/common.json | 6 +++++- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index b45e90edb2..28ce6162c7 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -955,7 +955,7 @@ const PropertyInfo qdev_prop_off_auto_pcibar = { .set_default_value = qdev_propinfo_set_default_value_enum, }; -/* --- PCIELinkSpeed 2_5/5/8/16 -- */ +/* --- PCIELinkSpeed 2_5/5/8/16/32/64 -- */ static void get_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) @@ -977,6 +977,12 @@ static void get_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name, case QEMU_PCI_EXP_LNK_16GT: speed = PCIE_LINK_SPEED_16; break; + case QEMU_PCI_EXP_LNK_32GT: + speed = PCIE_LINK_SPEED_32; + break; + case QEMU_PCI_EXP_LNK_64GT: + speed = PCIE_LINK_SPEED_64; + break; default: /* Unreachable */ abort(); @@ -1010,6 +1016,12 @@ static void set_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name, case PCIE_LINK_SPEED_16: *p = QEMU_PCI_EXP_LNK_16GT; break; + case PCIE_LINK_SPEED_32: + *p = QEMU_PCI_EXP_LNK_32GT; + break; + case PCIE_LINK_SPEED_64: + *p = QEMU_PCI_EXP_LNK_64GT; + break; default: /* Unreachable */ abort(); @@ -1018,7 +1030,7 @@ static void set_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name, const PropertyInfo qdev_prop_pcie_link_speed = { .name = "PCIELinkSpeed", - .description = "2_5/5/8/16", + .description = "2_5/5/8/16/32/64", .enum_table = &PCIELinkSpeed_lookup, .get = get_prop_pcielinkspeed, .set = set_prop_pcielinkspeed, diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index 6db0cf69cd..0b4817e144 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -153,6 +153,14 @@ static void pcie_cap_fill_slot_lnk(PCIDevice *dev) pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2, PCI_EXP_LNKCAP2_SLS_16_0GB); } + if (s->speed > QEMU_PCI_EXP_LNK_16GT) { + pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2, + PCI_EXP_LNKCAP2_SLS_32_0GB); + } + if (s->speed > QEMU_PCI_EXP_LNK_32GT) { + pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2, + PCI_EXP_LNKCAP2_SLS_64_0GB); + } } } diff --git a/include/hw/pci/pcie_regs.h b/include/hw/pci/pcie_regs.h index 4972106c42..9d3b6868dc 100644 --- a/include/hw/pci/pcie_regs.h +++ b/include/hw/pci/pcie_regs.h @@ -39,6 +39,8 @@ typedef enum PCIExpLinkSpeed { QEMU_PCI_EXP_LNK_5GT, QEMU_PCI_EXP_LNK_8GT, QEMU_PCI_EXP_LNK_16GT, + QEMU_PCI_EXP_LNK_32GT, + QEMU_PCI_EXP_LNK_64GT, } PCIExpLinkSpeed; #define QEMU_PCI_EXP_LNKCAP_MLS(speed) (speed) diff --git a/qapi/common.json b/qapi/common.json index f1bb841951..867a9ad9b0 100644 --- a/qapi/common.json +++ b/qapi/common.json @@ -107,10 +107,14 @@ # # @16: 16.0GT/s # +# @32: 32.0GT/s +# +# @64: 64.0GT/s +# # Since: 4.0 ## { 'enum': 'PCIELinkSpeed', - 'data': [ '2_5', '5', '8', '16' ] } + 'data': [ '2_5', '5', '8', '16', '32', '64' ] } ## # @PCIELinkWidth: From a55834579aa65f27519f948275d5efd9e2173bef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Thu, 15 Feb 2024 11:36:15 +0100 Subject: [PATCH 27/68] vdpa: stash memory region properties in vars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Next changes uses this variables, so avoid call repeatedly to memory region functions. No functional change intended. Signed-off-by: Eugenio Pérez Message-Id: <20240215103616.330518-2-eperezma@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-vdpa.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index c968278e70..5559d4f89c 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -47,12 +47,14 @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, int page_mask) { Int128 llend; + bool is_ram = memory_region_is_ram(section->mr); + bool is_iommu = memory_region_is_iommu(section->mr); + bool is_protected = memory_region_is_protected(section->mr); - if ((!memory_region_is_ram(section->mr) && - !memory_region_is_iommu(section->mr)) || - memory_region_is_protected(section->mr) || - /* vhost-vDPA doesn't allow MMIO to be mapped */ - memory_region_is_ram_device(section->mr)) { + /* vhost-vDPA doesn't allow MMIO to be mapped */ + bool is_ram_device = memory_region_is_ram_device(section->mr); + + if ((!is_ram && !is_iommu) || is_protected || is_ram_device) { return true; } @@ -69,7 +71,7 @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, * size that maps to the kernel */ - if (!memory_region_is_iommu(section->mr)) { + if (!is_iommu) { llend = vhost_vdpa_section_end(section, page_mask); if (int128_gt(llend, int128_make64(iova_max))) { error_report("RAM section out of device range (max=0x%" PRIx64 From a8516e5c9719cf45fd81d6790bc9ffdcf753376b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Thu, 15 Feb 2024 11:36:16 +0100 Subject: [PATCH 28/68] vdpa: trace skipped memory sections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sometimes, certain parts are not being skipped in vhost_vdpa_listener_region_del, but they are skipped in vhost_vdpa_listener_region_add, or vice versa. The vhost-vdpa code expects all parts to maintain their properties, so we're adding a trace to help with debugging when any part is skipped. Signed-off-by: Eugenio Pérez Message-Id: <20240215103616.330518-3-eperezma@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/trace-events | 1 + hw/virtio/vhost-vdpa.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 20577aa584..9df24864a2 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -30,6 +30,7 @@ vhost_user_write(uint32_t req, uint32_t flags) "req:%d flags:0x%"PRIx32"" vhost_user_create_notifier(int idx, void *n) "idx:%d n:%p" # vhost-vdpa.c +vhost_vdpa_skipped_memory_section(int is_ram, int is_iommu, int is_protected, int is_ram_device, uint64_t first, uint64_t last, int page_mask) "is_ram=%d, is_iommu=%d, is_protected=%d, is_ram_device=%d iova_min=0x%"PRIx64" iova_last=0x%"PRIx64" page_mask=0x%x" vhost_vdpa_dma_map(void *vdpa, int fd, uint32_t msg_type, uint32_t asid, uint64_t iova, uint64_t size, uint64_t uaddr, uint8_t perm, uint8_t type) "vdpa_shared:%p fd: %d msg_type: %"PRIu32" asid: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" uaddr: 0x%"PRIx64" perm: 0x%"PRIx8" type: %"PRIu8 vhost_vdpa_dma_unmap(void *vdpa, int fd, uint32_t msg_type, uint32_t asid, uint64_t iova, uint64_t size, uint8_t type) "vdpa_shared:%p fd: %d msg_type: %"PRIu32" asid: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" type: %"PRIu8 vhost_vdpa_listener_begin_batch(void *v, int fd, uint32_t msg_type, uint8_t type) "vdpa_shared:%p fd: %d msg_type: %"PRIu32" type: %"PRIu8 diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 5559d4f89c..3bcd05cc22 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -55,6 +55,9 @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, bool is_ram_device = memory_region_is_ram_device(section->mr); if ((!is_ram && !is_iommu) || is_protected || is_ram_device) { + trace_vhost_vdpa_skipped_memory_section(is_ram, is_iommu, is_protected, + is_ram_device, iova_min, + iova_max, page_mask); return true; } From 3a95f572112ab4c789d66af666644adcdb2b45a6 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Thu, 15 Feb 2024 15:52:06 +0000 Subject: [PATCH 29/68] hw/pci-bridge/pxb-cxl: Drop RAS capability from host bridge. This CXL component isn't allowed to have a RAS capability. Whilst this should be harmless as software is not expected to look here, good to clean it up. Signed-off-by: Jonathan Cameron Message-Id: <20240215155206.2736-1-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/cxl/cxl-component-utils.c | 21 +++++++++++++++++---- hw/pci-bridge/pci_expander_bridge.c | 2 +- include/hw/cxl/cxl_component.h | 1 + 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c index 84ab503325..cd116c0401 100644 --- a/hw/cxl/cxl-component-utils.c +++ b/hw/cxl/cxl-component-utils.c @@ -297,6 +297,7 @@ void cxl_component_register_init_common(uint32_t *reg_state, caps = 3; break; case CXL2_ROOT_PORT: + case CXL2_RC: /* + Extended Security, + Snoop */ caps = 5; break; @@ -326,8 +327,19 @@ void cxl_component_register_init_common(uint32_t *reg_state, CXL_##reg##_REGISTERS_OFFSET); \ } while (0) + switch (type) { + case CXL2_DEVICE: + case CXL2_TYPE3_DEVICE: + case CXL2_LOGICAL_DEVICE: + case CXL2_ROOT_PORT: + case CXL2_UPSTREAM_PORT: + case CXL2_DOWNSTREAM_PORT: init_cap_reg(RAS, 2, CXL_RAS_CAPABILITY_VERSION); - ras_init_common(reg_state, write_msk); + ras_init_common(reg_state, write_msk); + break; + default: + break; + } init_cap_reg(LINK, 4, CXL_LINK_CAPABILITY_VERSION); @@ -335,9 +347,10 @@ void cxl_component_register_init_common(uint32_t *reg_state, return; } - init_cap_reg(HDM, 5, CXL_HDM_CAPABILITY_VERSION); - hdm_init_common(reg_state, write_msk, type); - + if (type != CXL2_ROOT_PORT) { + init_cap_reg(HDM, 5, CXL_HDM_CAPABILITY_VERSION); + hdm_init_common(reg_state, write_msk, type); + } if (caps < 5) { return; } diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c index 535889f7c2..0411ad31ea 100644 --- a/hw/pci-bridge/pci_expander_bridge.c +++ b/hw/pci-bridge/pci_expander_bridge.c @@ -290,7 +290,7 @@ static void pxb_cxl_dev_reset(DeviceState *dev) uint32_t *write_msk = cxl_cstate->crb.cache_mem_regs_write_mask; int dsp_count = 0; - cxl_component_register_init_common(reg_state, write_msk, CXL2_ROOT_PORT); + cxl_component_register_init_common(reg_state, write_msk, CXL2_RC); /* * The CXL specification allows for host bridges with no HDM decoders * if they only have a single root port. diff --git a/include/hw/cxl/cxl_component.h b/include/hw/cxl/cxl_component.h index 0e5d35c263..5012fab6f7 100644 --- a/include/hw/cxl/cxl_component.h +++ b/include/hw/cxl/cxl_component.h @@ -25,6 +25,7 @@ enum reg_type { CXL2_TYPE3_DEVICE, CXL2_LOGICAL_DEVICE, CXL2_ROOT_PORT, + CXL2_RC, CXL2_UPSTREAM_PORT, CXL2_DOWNSTREAM_PORT, CXL3_SWITCH_MAILBOX_CCI, From 633487df8d303b37a88584d5a57a39dbcd91c7bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Volker=20R=C3=BCmelin?= Date: Sun, 18 Feb 2024 09:33:41 +0100 Subject: [PATCH 30/68] hw/audio/virtio-sound: return correct command response size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The payload size returned by command VIRTIO_SND_R_PCM_INFO is wrong. The code in process_cmd() assumes that all commands return only a virtio_snd_hdr payload, but some commands like VIRTIO_SND_R_PCM_INFO may return an additional payload. Add a zero initialized payload_size variable to struct virtio_snd_ctrl_command to allow for additional payloads. Reviewed-by: Marc-André Lureau Signed-off-by: Volker Rümelin Message-Id: <20240218083351.8524-1-vr_qemu@t-online.de> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/audio/virtio-snd.c | 7 +++++-- include/hw/audio/virtio-snd.h | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/hw/audio/virtio-snd.c b/hw/audio/virtio-snd.c index ea2aeaef14..e604d8f30c 100644 --- a/hw/audio/virtio-snd.c +++ b/hw/audio/virtio-snd.c @@ -243,12 +243,13 @@ static void virtio_snd_handle_pcm_info(VirtIOSound *s, memset(&pcm_info[i].padding, 0, 5); } + cmd->payload_size = sizeof(virtio_snd_pcm_info) * count; cmd->resp.code = cpu_to_le32(VIRTIO_SND_S_OK); iov_from_buf(cmd->elem->in_sg, cmd->elem->in_num, sizeof(virtio_snd_hdr), pcm_info, - sizeof(virtio_snd_pcm_info) * count); + cmd->payload_size); } /* @@ -749,7 +750,8 @@ process_cmd(VirtIOSound *s, virtio_snd_ctrl_command *cmd) 0, &cmd->resp, sizeof(virtio_snd_hdr)); - virtqueue_push(cmd->vq, cmd->elem, sizeof(virtio_snd_hdr)); + virtqueue_push(cmd->vq, cmd->elem, + sizeof(virtio_snd_hdr) + cmd->payload_size); virtio_notify(VIRTIO_DEVICE(s), cmd->vq); } @@ -808,6 +810,7 @@ static void virtio_snd_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) cmd->elem = elem; cmd->vq = vq; cmd->resp.code = cpu_to_le32(VIRTIO_SND_S_OK); + /* implicit cmd->payload_size = 0; */ QTAILQ_INSERT_TAIL(&s->cmdq, cmd, next); elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); } diff --git a/include/hw/audio/virtio-snd.h b/include/hw/audio/virtio-snd.h index c3767f442b..3d79181364 100644 --- a/include/hw/audio/virtio-snd.h +++ b/include/hw/audio/virtio-snd.h @@ -230,6 +230,7 @@ struct virtio_snd_ctrl_command { VirtQueue *vq; virtio_snd_hdr ctrl; virtio_snd_hdr resp; + size_t payload_size; QTAILQ_ENTRY(virtio_snd_ctrl_command) next; }; #endif From 043e127a126bb3ceb5fc753deee27d261fd0c5ce Mon Sep 17 00:00:00 2001 From: Albert Esteve Date: Mon, 19 Feb 2024 15:34:19 +0100 Subject: [PATCH 31/68] hw/virtio: check owner for removing objects Shared objects lack spoofing protection. For VHOST_USER_BACKEND_SHARED_OBJECT_REMOVE messages received by the vhost-user interface, any backend was allowed to remove entries from the shared table just by knowing the UUID. Only the owner of the entry shall be allowed to removed their resources from the table. To fix that, add a check for all *SHARED_OBJECT_REMOVE messages received. A vhost device can only remove TYPE_VHOST_DEV entries that are owned by them, otherwise skip the removal, and inform the device that the entry has not been removed in the answer. Signed-off-by: Albert Esteve Acked-by: Stefan Hajnoczi Message-Id: <20240219143423.272012-2-aesteve@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- docs/interop/vhost-user.rst | 4 +++- hw/virtio/vhost-user.c | 21 +++++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst index d1ed39dfa0..d8419fd2f1 100644 --- a/docs/interop/vhost-user.rst +++ b/docs/interop/vhost-user.rst @@ -1839,7 +1839,9 @@ is sent by the front-end. When the ``VHOST_USER_PROTOCOL_F_SHARED_OBJECT`` protocol feature has been successfully negotiated, this message can be submitted by the backend to remove themselves from to the virtio-dmabuf shared - table API. The shared table will remove the back-end device associated with + table API. Only the back-end owning the entry (i.e., the one that first added + it) will have permission to remove it. Otherwise, the message is ignored. + The shared table will remove the back-end device associated with the UUID. If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, and the back-end sets the ``VHOST_USER_NEED_REPLY`` flag, the front-end must respond with zero when operation is successfully completed, or non-zero otherwise. diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index a1eea8547e..9d654efd3d 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1611,11 +1611,27 @@ vhost_user_backend_handle_shared_object_add(struct vhost_dev *dev, } static int -vhost_user_backend_handle_shared_object_remove(VhostUserShared *object) +vhost_user_backend_handle_shared_object_remove(struct vhost_dev *dev, + VhostUserShared *object) { QemuUUID uuid; memcpy(uuid.data, object->uuid, sizeof(object->uuid)); + switch (virtio_object_type(&uuid)) { + case TYPE_VHOST_DEV: + { + struct vhost_dev *owner = virtio_lookup_vhost_device(&uuid); + if (dev != owner) { + /* Not allowed to remove non-owned entries */ + return 0; + } + break; + } + default: + /* Not allowed to remove non-owned entries */ + return 0; + } + return virtio_remove_resource(&uuid); } @@ -1794,7 +1810,8 @@ static gboolean backend_read(QIOChannel *ioc, GIOCondition condition, ret = vhost_user_backend_handle_shared_object_add(dev, &payload.object); break; case VHOST_USER_BACKEND_SHARED_OBJECT_REMOVE: - ret = vhost_user_backend_handle_shared_object_remove(&payload.object); + ret = vhost_user_backend_handle_shared_object_remove(dev, + &payload.object); break; case VHOST_USER_BACKEND_SHARED_OBJECT_LOOKUP: ret = vhost_user_backend_handle_shared_object_lookup(dev->opaque, ioc, From cd341fd1ffded978b2aa0b5309b00be7c42e347c Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 21 Feb 2024 15:38:02 +0800 Subject: [PATCH 32/68] hw/virtio: Add support for VDPA network simulation devices This patch adds support for VDPA network simulation devices. The device is developed based on virtio-net and tap backend, and supports hardware live migration function. For more details, please refer to "docs/system/devices/vdpa-net.rst" Signed-off-by: Hao Chen Message-Id: <20240221073802.2888022-1-chenh@yusur.tech> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 5 + docs/system/device-emulation.rst | 1 + docs/system/devices/vdpa-net.rst | 121 +++++++++++++ hw/net/virtio-net.c | 16 ++ hw/virtio/virtio-pci.c | 189 +++++++++++++++++++- hw/virtio/virtio.c | 39 ++++ include/hw/virtio/virtio-pci.h | 5 + include/hw/virtio/virtio.h | 19 ++ include/standard-headers/linux/virtio_pci.h | 7 + 9 files changed, 399 insertions(+), 3 deletions(-) create mode 100644 docs/system/devices/vdpa-net.rst diff --git a/MAINTAINERS b/MAINTAINERS index 4d96f855de..5abf6f54c4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2372,6 +2372,11 @@ F: hw/virtio/vhost-user-scmi* F: include/hw/virtio/vhost-user-scmi.h F: tests/qtest/libqos/virtio-scmi.* +vdpa-net +M: Hao Chen +S: Maintained +F: docs/system/devices/vdpa-net.rst + virtio-crypto M: Gonglei S: Supported diff --git a/docs/system/device-emulation.rst b/docs/system/device-emulation.rst index f19777411c..e4a27f53c8 100644 --- a/docs/system/device-emulation.rst +++ b/docs/system/device-emulation.rst @@ -99,3 +99,4 @@ Emulated Devices devices/canokey.rst devices/usb-u2f.rst devices/igb.rst + devices/vdpa-net.rst diff --git a/docs/system/devices/vdpa-net.rst b/docs/system/devices/vdpa-net.rst new file mode 100644 index 0000000000..323d8c926a --- /dev/null +++ b/docs/system/devices/vdpa-net.rst @@ -0,0 +1,121 @@ +vdpa net +============ + +This document explains the setup and usage of the vdpa network device. +The vdpa network device is a paravirtualized vdpa emulate device. + +Description +----------- + +VDPA net devices support dirty page bitmap mark and vring state saving and recovery. + +Users can use this VDPA device for live migration simulation testing in a nested virtualization environment. + +Registers layout +---------------- + +The vdpa device add live migrate registers layout as follow:: + + Offset Register Name Bitwidth Associated vq + 0x0 LM_LOGGING_CTRL 4bits + 0x10 LM_BASE_ADDR_LOW 32bits + 0x14 LM_BASE_ADDR_HIGH 32bits + 0x18 LM_END_ADDR_LOW 32bits + 0x1c LM_END_ADDR_HIGH 32bits + 0x20 LM_RING_STATE_OFFSET 32bits vq0 + 0x24 LM_RING_STATE_OFFSET 32bits vq1 + 0x28 LM_RING_STATE_OFFSET 32bits vq2 + ...... + 0x20+1023*4 LM_RING_STATE_OFFSET 32bits vq1023 + +These registers are extended at the end of the notify bar space. + +Architecture diagram +-------------------- +:: + + |------------------------------------------------------------------------| + | guest-L1-user-space | + | | + | |----------------------------------------| + | | [virtio-net driver] | + | | ^ guest-L2-src(iommu=on) | + | |--------------|-------------------------| + | | | qemu-L2-src(viommu) | + | [dpdk-vdpa]<->[vhost socket]<-+->[vhost-user backend(iommu=on)] | + -------------------------------------------------------------------------- + -------------------------------------------------------------------------- + | ^ guest-L1-kernel-space | + | | | + | [VFIO] | + | ^ | + | | guest-L1-src(iommu=on) | + --------|----------------------------------------------------------------- + --------|----------------------------------------------------------------- + | [vdpa net device(iommu=on)] [manager nic device] | + | | | | + | | | | + | [tap device] qemu-L1-src(viommu) | | + ------------------------------------------------+------------------------- + | + | + --------------------- | + | kernel net bridge |<----- + | virbr0 |<---------------------------------- + --------------------- | + | + | + -------------------------------------------------------------------------- | + | guest-L1-user-space | | + | | | + | |----------------------------------------| | + | | [virtio-net driver] | | + | | ^ guest-L2-dst(iommu=on) | | + | |--------------|-------------------------| | + | | | qemu-L2-dst(viommu) | | + | [dpdk-vdpa]<->[vhost socket]<-+->[vhost-user backend(iommu=on)] | | + -------------------------------------------------------------------------- | + -------------------------------------------------------------------------- | + | ^ guest-L1-kernel-space | | + | | | | + | [VFIO] | | + | ^ | | + | | guest-L1-dst(iommu=on) | | + --------|----------------------------------------------------------------- | + --------|----------------------------------------------------------------- | + | [vdpa net device(iommu=on)] [manager nic device]----------------+---- + | | | + | | | + | [tap device] qemu-L1-dst(viommu) | + -------------------------------------------------------------------------- + + +Device properties +----------------- + +The Virtio vdpa device can be configured with the following properties: + + * ``vdpa=on`` open vdpa device emulated. + +Usages +-------- +This patch add virtio sriov support and vdpa live migrate support. +You can open vdpa by set xml file as follow:: + + + + + + + + + + +Limitations +----------- +1. Dependent on tap device with param ``vhost=off``. +2. Nested virtualization environment only supports ``q35`` machines. +3. Current only support split vring live migrate. + + + diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index a3c711b56d..27055a4b8e 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1999,6 +1999,22 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, goto err; } + /* Mark dirty page's bitmap of guest memory */ + if (vdev->lm_logging_ctrl == LM_ENABLE) { + uint64_t chunk = elem->in_addr[i] / VHOST_LOG_CHUNK; + /* Get chunk index */ + BitmapMemoryRegionCaches *caches = qatomic_rcu_read(&vdev->caches); + uint64_t index = chunk / 8; + uint64_t shift = chunk % 8; + uint8_t val = 0; + address_space_read_cached(&caches->bitmap, index, &val, + sizeof(val)); + val |= 1 << shift; + address_space_write_cached(&caches->bitmap, index, &val, + sizeof(val)); + address_space_cache_invalidate(&caches->bitmap, index, sizeof(val)); + } + elems[i] = elem; lens[i] = total; i++; diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 1a7039fb0c..e42ac6e7f9 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1442,6 +1442,155 @@ int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy, return virtio_pci_add_mem_cap(proxy, &cap.cap); } +/* Called within call_rcu(). */ +static void bitmap_free_region_cache(BitmapMemoryRegionCaches *caches) +{ + assert(caches != NULL); + address_space_cache_destroy(&caches->bitmap); + g_free(caches); +} + +static void lm_disable(VirtIODevice *vdev) +{ + BitmapMemoryRegionCaches *caches; + caches = qatomic_read(&vdev->caches); + qatomic_rcu_set(&vdev->caches, NULL); + if (caches) { + call_rcu(caches, bitmap_free_region_cache, rcu); + } +} + +static void lm_enable(VirtIODevice *vdev) +{ + BitmapMemoryRegionCaches *old = vdev->caches; + BitmapMemoryRegionCaches *new = NULL; + hwaddr addr, end, size; + int64_t len; + + addr = vdev->lm_base_addr_low | ((hwaddr)(vdev->lm_base_addr_high) << 32); + end = vdev->lm_end_addr_low | ((hwaddr)(vdev->lm_end_addr_high) << 32); + size = end - addr; + if (size <= 0) { + error_report("Invalid lm size."); + return; + } + + new = g_new0(BitmapMemoryRegionCaches, 1); + len = address_space_cache_init(&new->bitmap, vdev->dma_as, addr, size, + true); + if (len < size) { + virtio_error(vdev, "Cannot map bitmap"); + goto err_bitmap; + } + qatomic_rcu_set(&vdev->caches, new); + + if (old) { + call_rcu(old, bitmap_free_region_cache, rcu); + } + + return; + +err_bitmap: + address_space_cache_destroy(&new->bitmap); + g_free(new); +} + +static uint64_t virtio_pci_lm_read(void *opaque, hwaddr addr, + unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + hwaddr offset_end = LM_VRING_STATE_OFFSET + + virtio_pci_queue_mem_mult(proxy) * VIRTIO_QUEUE_MAX; + uint32_t val; + int qid; + + if (vdev == NULL) { + return UINT64_MAX; + } + switch (addr) { + case LM_LOGGING_CTRL: + val = vdev->lm_logging_ctrl; + break; + case LM_BASE_ADDR_LOW: + val = vdev->lm_base_addr_low; + break; + case LM_BASE_ADDR_HIGH: + val = vdev->lm_base_addr_high; + break; + case LM_END_ADDR_LOW: + val = vdev->lm_end_addr_low; + break; + case LM_END_ADDR_HIGH: + val = vdev->lm_end_addr_high; + break; + default: + if (addr >= LM_VRING_STATE_OFFSET && addr <= offset_end) { + qid = (addr - LM_VRING_STATE_OFFSET) / + virtio_pci_queue_mem_mult(proxy); + val = virtio_queue_get_vring_states(vdev, qid); + } else + val = 0; + + break; + } + + return val; +} + +static void virtio_pci_lm_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + hwaddr offset_end = LM_VRING_STATE_OFFSET + + virtio_pci_queue_mem_mult(proxy) * VIRTIO_QUEUE_MAX; + int qid; + + if (vdev == NULL) { + return; + } + + switch (addr) { + case LM_LOGGING_CTRL: + vdev->lm_logging_ctrl = val; + switch (val) { + case LM_DISABLE: + lm_disable(vdev); + break; + case LM_ENABLE: + lm_enable(vdev); + break; + default: + virtio_error(vdev, "Unsupport LM_LOGGING_CTRL value: %"PRIx64, + val); + break; + }; + + break; + case LM_BASE_ADDR_LOW: + vdev->lm_base_addr_low = val; + break; + case LM_BASE_ADDR_HIGH: + vdev->lm_base_addr_high = val; + break; + case LM_END_ADDR_LOW: + vdev->lm_end_addr_low = val; + break; + case LM_END_ADDR_HIGH: + vdev->lm_end_addr_high = val; + break; + default: + if (addr >= LM_VRING_STATE_OFFSET && addr <= offset_end) { + qid = (addr - LM_VRING_STATE_OFFSET) / + virtio_pci_queue_mem_mult(proxy); + virtio_queue_set_vring_states(vdev, qid, val); + } else + virtio_error(vdev, "Unsupport addr: %"PRIx64, addr); + break; + } +} + static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr, unsigned size) { @@ -1823,6 +1972,15 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy, }, .endianness = DEVICE_LITTLE_ENDIAN, }; + static const MemoryRegionOps lm_ops = { + .read = virtio_pci_lm_read, + .write = virtio_pci_lm_write, + .impl = { + .min_access_size = 1, + .max_access_size = 4, + }, + .endianness = DEVICE_LITTLE_ENDIAN, + }; g_autoptr(GString) name = g_string_new(NULL); g_string_printf(name, "virtio-pci-common-%s", vdev_name); @@ -1859,6 +2017,14 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy, proxy, name->str, proxy->notify_pio.size); + if (proxy->flags & VIRTIO_PCI_FLAG_VDPA) { + g_string_printf(name, "virtio-pci-lm-%s", vdev_name); + memory_region_init_io(&proxy->lm.mr, OBJECT(proxy), + &lm_ops, + proxy, + name->str, + proxy->lm.size); + } } static void virtio_pci_modern_region_map(VirtIOPCIProxy *proxy, @@ -2021,6 +2187,10 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) virtio_pci_modern_mem_region_map(proxy, &proxy->isr, &cap); virtio_pci_modern_mem_region_map(proxy, &proxy->device, &cap); virtio_pci_modern_mem_region_map(proxy, &proxy->notify, ¬ify.cap); + if (proxy->flags & VIRTIO_PCI_FLAG_VDPA) { + memory_region_add_subregion(&proxy->modern_bar, + proxy->lm.offset, &proxy->lm.mr); + } if (modern_pio) { memory_region_init(&proxy->io_bar, OBJECT(proxy), @@ -2090,6 +2260,9 @@ static void virtio_pci_device_unplugged(DeviceState *d) virtio_pci_modern_mem_region_unmap(proxy, &proxy->isr); virtio_pci_modern_mem_region_unmap(proxy, &proxy->device); virtio_pci_modern_mem_region_unmap(proxy, &proxy->notify); + if (proxy->flags & VIRTIO_PCI_FLAG_VDPA) { + memory_region_del_subregion(&proxy->modern_bar, &proxy->lm.mr); + } if (modern_pio) { virtio_pci_modern_io_region_unmap(proxy, &proxy->notify_pio); } @@ -2144,9 +2317,17 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) proxy->notify_pio.type = VIRTIO_PCI_CAP_NOTIFY_CFG; /* subclasses can enforce modern, so do this unconditionally */ - memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci", - /* PCI BAR regions must be powers of 2 */ - pow2ceil(proxy->notify.offset + proxy->notify.size)); + if (!(proxy->flags & VIRTIO_PCI_FLAG_VDPA)) { + memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci", + /* PCI BAR regions must be powers of 2 */ + pow2ceil(proxy->notify.offset + proxy->notify.size)); + } else { + proxy->lm.offset = proxy->notify.offset + proxy->notify.size; + proxy->lm.size = 0x20 + VIRTIO_QUEUE_MAX * 4; + memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci", + /* PCI BAR regions must be powers of 2 */ + pow2ceil(proxy->lm.offset + proxy->lm.size)); + } if (proxy->disable_legacy == ON_OFF_AUTO_AUTO) { proxy->disable_legacy = pcie_port ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; @@ -2301,6 +2482,8 @@ static Property virtio_pci_properties[] = { VIRTIO_PCI_FLAG_INIT_FLR_BIT, true), DEFINE_PROP_BIT("aer", VirtIOPCIProxy, flags, VIRTIO_PCI_FLAG_AER_BIT, false), + DEFINE_PROP_BIT("vdpa", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_VDPA_BIT, false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index d229755eae..fb6b4ccd83 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -3368,6 +3368,18 @@ static uint16_t virtio_queue_split_get_last_avail_idx(VirtIODevice *vdev, return vdev->vq[n].last_avail_idx; } +static uint32_t virtio_queue_split_get_vring_states(VirtIODevice *vdev, + int n) +{ + struct VirtQueue *vq = &vdev->vq[n]; + uint16_t avail, used; + + avail = vq->last_avail_idx; + used = vq->used_idx; + + return avail | (uint32_t)used << 16; +} + unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n) { if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { @@ -3377,6 +3389,33 @@ unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n) } } +unsigned int virtio_queue_get_vring_states(VirtIODevice *vdev, int n) +{ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + return -1; + } else { + return virtio_queue_split_get_vring_states(vdev, n); + } +} + +static void virtio_queue_split_set_vring_states(VirtIODevice *vdev, + int n, uint32_t idx) +{ + struct VirtQueue *vq = &vdev->vq[n]; + vq->last_avail_idx = (uint16_t)(idx & 0xffff); + vq->shadow_avail_idx = (uint16_t)(idx & 0xffff); + vq->used_idx = (uint16_t)(idx >> 16); +} + +void virtio_queue_set_vring_states(VirtIODevice *vdev, int n, uint32_t idx) +{ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + return; + } else { + virtio_queue_split_set_vring_states(vdev, n, idx); + } +} + static void virtio_queue_packed_set_last_avail_idx(VirtIODevice *vdev, int n, unsigned int idx) { diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h index 59d88018c1..4d57a9c751 100644 --- a/include/hw/virtio/virtio-pci.h +++ b/include/hw/virtio/virtio-pci.h @@ -43,6 +43,7 @@ enum { VIRTIO_PCI_FLAG_INIT_FLR_BIT, VIRTIO_PCI_FLAG_AER_BIT, VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT, + VIRTIO_PCI_FLAG_VDPA_BIT, }; /* Need to activate work-arounds for buggy guests at vmstate load. */ @@ -89,6 +90,9 @@ enum { #define VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED \ (1 << VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT) +/* VDPA supported flags */ +#define VIRTIO_PCI_FLAG_VDPA (1 << VIRTIO_PCI_FLAG_VDPA_BIT) + typedef struct { MSIMessage msg; int virq; @@ -140,6 +144,7 @@ struct VirtIOPCIProxy { }; VirtIOPCIRegion regs[5]; }; + VirtIOPCIRegion lm; MemoryRegion modern_bar; MemoryRegion io_bar; uint32_t legacy_io_bar_idx; diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index c8f72850bc..b3c74a1bca 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -35,6 +35,9 @@ (0x1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \ (0x1ULL << VIRTIO_F_ANY_LAYOUT)) +#define LM_DISABLE 0x00 +#define LM_ENABLE 0x01 + struct VirtQueue; static inline hwaddr vring_align(hwaddr addr, @@ -95,6 +98,11 @@ enum virtio_device_endian { VIRTIO_DEVICE_ENDIAN_BIG, }; +typedef struct BitmapMemoryRegionCaches { + struct rcu_head rcu; + MemoryRegionCache bitmap; +} BitmapMemoryRegionCaches; + /** * struct VirtIODevice - common VirtIO structure * @name: name of the device @@ -128,6 +136,14 @@ struct VirtIODevice uint32_t generation; int nvectors; VirtQueue *vq; + uint8_t lm_logging_ctrl; + uint32_t lm_base_addr_low; + uint32_t lm_base_addr_high; + uint32_t lm_end_addr_low; + uint32_t lm_end_addr_high; + + BitmapMemoryRegionCaches *caches; + MemoryListener listener; uint16_t device_id; /* @vm_running: current VM running state via virtio_vmstate_change() */ @@ -379,8 +395,11 @@ hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n); hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n); hwaddr virtio_queue_get_used_size(VirtIODevice *vdev, int n); unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n); +unsigned int virtio_queue_get_vring_states(VirtIODevice *vdev, int n); void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, unsigned int idx); +void virtio_queue_set_vring_states(VirtIODevice *vdev, int n, + unsigned int idx); void virtio_queue_restore_last_avail_idx(VirtIODevice *vdev, int n); void virtio_queue_invalidate_signalled_used(VirtIODevice *vdev, int n); void virtio_queue_update_used_idx(VirtIODevice *vdev, int n); diff --git a/include/standard-headers/linux/virtio_pci.h b/include/standard-headers/linux/virtio_pci.h index 3e2bc2c97e..86733278ba 100644 --- a/include/standard-headers/linux/virtio_pci.h +++ b/include/standard-headers/linux/virtio_pci.h @@ -221,6 +221,13 @@ struct virtio_pci_cfg_cap { #define VIRTIO_PCI_COMMON_ADM_Q_IDX 60 #define VIRTIO_PCI_COMMON_ADM_Q_NUM 62 +#define LM_LOGGING_CTRL 0 +#define LM_BASE_ADDR_LOW 4 +#define LM_BASE_ADDR_HIGH 8 +#define LM_END_ADDR_LOW 12 +#define LM_END_ADDR_HIGH 16 +#define LM_VRING_STATE_OFFSET 0x20 + #endif /* VIRTIO_PCI_NO_MODERN */ /* Admin command status. */ From 2a0e0a35002db7ac64f4e82ea2a4ad2fb6d934b0 Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Fri, 23 Feb 2024 16:56:47 +0800 Subject: [PATCH 33/68] hw/cxl/cxl-host: Fix missing ERRP_GUARD() in cxl_fixed_memory_window_config() As the comment in qapi/error, dereferencing @errp requires ERRP_GUARD(): * = Why, when and how to use ERRP_GUARD() = * * Without ERRP_GUARD(), use of the @errp parameter is restricted: * - It must not be dereferenced, because it may be null. ... * ERRP_GUARD() lifts these restrictions. * * To use ERRP_GUARD(), add it right at the beginning of the function. * @errp can then be used without worrying about the argument being * NULL or &error_fatal. * * Using it when it's not needed is safe, but please avoid cluttering * the source with useless code. But in cxl_fixed_memory_window_config(), @errp is dereferenced in 2 places without ERRP_GUARD(): fw->enc_int_ways = cxl_interleave_ways_enc(fw->num_targets, errp); if (*errp) { return; } and fw->enc_int_gran = cxl_interleave_granularity_enc(object->interleave_granularity, errp); if (*errp) { return; } For the above 2 places, we check "*errp", because neither function returns a suitable error code. And since machine_set_cfmw() - the caller of cxl_fixed_memory_window_config() - doesn't get the NULL @errp parameter as the "set" method of object property, cxl_fixed_memory_window_config() hasn't triggered the bug that dereferencing the NULL @errp. To follow the requirement of @errp, add missing ERRP_GUARD() in cxl_fixed_memory_window_config(). Suggested-by: Markus Armbruster Signed-off-by: Zhao Liu Reviewed-by: Markus Armbruster Message-Id: <20240223085653.1255438-2-zhao1.liu@linux.intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Acked-by: Jonathan Cameron --- hw/cxl/cxl-host.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c index 2aa776c79c..c5f5fcfd64 100644 --- a/hw/cxl/cxl-host.c +++ b/hw/cxl/cxl-host.c @@ -26,6 +26,7 @@ static void cxl_fixed_memory_window_config(CXLState *cxl_state, CXLFixedMemoryWindowOptions *object, Error **errp) { + ERRP_GUARD(); g_autofree CXLFixedWindow *fw = g_malloc0(sizeof(*fw)); strList *target; int i; From 5aa4a6417b0f7acbfd7f4c21dca26293bc3d9348 Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Fri, 23 Feb 2024 16:56:48 +0800 Subject: [PATCH 34/68] hw/display/macfb: Fix missing ERRP_GUARD() in macfb_nubus_realize() As the comment in qapi/error, dereferencing @errp requires ERRP_GUARD(): * = Why, when and how to use ERRP_GUARD() = * * Without ERRP_GUARD(), use of the @errp parameter is restricted: * - It must not be dereferenced, because it may be null. ... * ERRP_GUARD() lifts these restrictions. * * To use ERRP_GUARD(), add it right at the beginning of the function. * @errp can then be used without worrying about the argument being * NULL or &error_fatal. * * Using it when it's not needed is safe, but please avoid cluttering * the source with useless code. But in macfb_nubus_realize(), @errp is dereferenced without ERRP_GUARD(): ndc->parent_realize(dev, errp); if (*errp) { return; } Here we check *errp, because the ndc->parent_realize(), as a DeviceClass.realize() callback, returns void. And since macfb_nubus_realize(), also as a DeviceClass.realize(), doesn't get the NULL @errp parameter, it hasn't triggered the bug that dereferencing the NULL @errp. To follow the requirement of @errp, add missing ERRP_GUARD() in macfb_nubus_realize(). Suggested-by: Markus Armbruster Signed-off-by: Zhao Liu Reviewed-by: Markus Armbruster Message-Id: <20240223085653.1255438-3-zhao1.liu@linux.intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/display/macfb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/display/macfb.c b/hw/display/macfb.c index 418e99c8e1..1ace341a0f 100644 --- a/hw/display/macfb.c +++ b/hw/display/macfb.c @@ -714,6 +714,7 @@ static void macfb_nubus_set_irq(void *opaque, int n, int level) static void macfb_nubus_realize(DeviceState *dev, Error **errp) { + ERRP_GUARD(); NubusDevice *nd = NUBUS_DEVICE(dev); MacfbNubusState *s = NUBUS_MACFB(dev); MacfbNubusDeviceClass *ndc = NUBUS_MACFB_GET_CLASS(dev); From d477d07a5d2c392c7aa99cb76921ed3e40d141ef Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Fri, 23 Feb 2024 16:56:49 +0800 Subject: [PATCH 35/68] hw/mem/cxl_type3: Fix missing ERRP_GUARD() in ct3_realize() As the comment in qapi/error, dereferencing @errp requires ERRP_GUARD(): * = Why, when and how to use ERRP_GUARD() = * * Without ERRP_GUARD(), use of the @errp parameter is restricted: * - It must not be dereferenced, because it may be null. ... * ERRP_GUARD() lifts these restrictions. * * To use ERRP_GUARD(), add it right at the beginning of the function. * @errp can then be used without worrying about the argument being * NULL or &error_fatal. * * Using it when it's not needed is safe, but please avoid cluttering * the source with useless code. But in ct3_realize(), @errp is dereferenced without ERRP_GUARD(): cxl_doe_cdat_init(cxl_cstate, errp); if (*errp) { goto err_free_special_ops; } Here we check *errp, because cxl_doe_cdat_init() returns void. And ct3_realize() - as a PCIDeviceClass.realize() method - doesn't get the NULL @errp parameter, it hasn't triggered the bug that dereferencing the NULL @errp. To follow the requirement of @errp, add missing ERRP_GUARD() in ct3_realize(). Suggested-by: Markus Armbruster Signed-off-by: Zhao Liu Reviewed-by: Markus Armbruster Message-Id: <20240223085653.1255438-4-zhao1.liu@linux.intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Acked-by: Jonathan Cameron --- hw/mem/cxl_type3.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index b679dfae1c..b0a7e9f11b 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -645,6 +645,7 @@ static DOEProtocol doe_cdat_prot[] = { static void ct3_realize(PCIDevice *pci_dev, Error **errp) { + ERRP_GUARD(); CXLType3Dev *ct3d = CXL_TYPE3(pci_dev); CXLComponentState *cxl_cstate = &ct3d->cxl_cstate; ComponentRegisters *regs = &cxl_cstate->crb; From 305446015848f0b5d71b817b53e7e02b08c36ede Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Fri, 23 Feb 2024 16:56:50 +0800 Subject: [PATCH 36/68] hw/misc/xlnx-versal-trng: Check returned bool in trng_prop_fault_event_set() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As the comment in qapi/error, dereferencing @errp requires ERRP_GUARD(): * = Why, when and how to use ERRP_GUARD() = * * Without ERRP_GUARD(), use of the @errp parameter is restricted: * - It must not be dereferenced, because it may be null. ... * ERRP_GUARD() lifts these restrictions. * * To use ERRP_GUARD(), add it right at the beginning of the function. * @errp can then be used without worrying about the argument being * NULL or &error_fatal. * * Using it when it's not needed is safe, but please avoid cluttering * the source with useless code. But in trng_prop_fault_event_set, @errp is dereferenced without ERRP_GUARD(): visit_type_uint32(v, name, events, errp); if (*errp) { return; } Currently, since trng_prop_fault_event_set() doesn't get the NULL @errp parameter as a "set" method of object property, it hasn't triggered the bug that dereferencing the NULL @errp. And since visit_type_uint32() returns bool, check the returned bool directly instead of dereferencing @errp, then we needn't the add missing ERRP_GUARD(). Suggested-by: Markus Armbruster Signed-off-by: Zhao Liu Message-Id: <20240223085653.1255438-5-zhao1.liu@linux.intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Philippe Mathieu-Daudé --- hw/misc/xlnx-versal-trng.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/misc/xlnx-versal-trng.c b/hw/misc/xlnx-versal-trng.c index b8111b8b66..6495188dc7 100644 --- a/hw/misc/xlnx-versal-trng.c +++ b/hw/misc/xlnx-versal-trng.c @@ -644,8 +644,7 @@ static void trng_prop_fault_event_set(Object *obj, Visitor *v, Property *prop = opaque; uint32_t *events = object_field_prop_ptr(obj, prop); - visit_type_uint32(v, name, events, errp); - if (*errp) { + if (!visit_type_uint32(v, name, events, errp)) { return; } From 4f5a3f49b934ff24227ebc95b3f9177f0147ff52 Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Fri, 23 Feb 2024 16:56:51 +0800 Subject: [PATCH 37/68] hw/pci-bridge/cxl_upstream: Fix missing ERRP_GUARD() in cxl_usp_realize() As the comment in qapi/error, dereferencing @errp requires ERRP_GUARD(): * = Why, when and how to use ERRP_GUARD() = * * Without ERRP_GUARD(), use of the @errp parameter is restricted: * - It must not be dereferenced, because it may be null. ... * ERRP_GUARD() lifts these restrictions. * * To use ERRP_GUARD(), add it right at the beginning of the function. * @errp can then be used without worrying about the argument being * NULL or &error_fatal. * * Using it when it's not needed is safe, but please avoid cluttering * the source with useless code. But in cxl_usp_realize(), @errp is dereferenced without ERRP_GUARD(): cxl_doe_cdat_init(cxl_cstate, errp); if (*errp) { goto err_cap; } Here we check *errp, because cxl_doe_cdat_init() returns void. And since cxl_usp_realize() - as a PCIDeviceClass.realize() method - doesn't get the NULL @errp parameter, it hasn't triggered the bug that dereferencing the NULL @errp. To follow the requirement of @errp, add missing ERRP_GUARD() in cxl_usp_realize(). Suggested-by: Markus Armbruster Signed-off-by: Zhao Liu Reviewed-by: Markus Armbruster Message-Id: <20240223085653.1255438-6-zhao1.liu@linux.intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Acked-by: Jonathan Cameron Reviewed-by: Thomas Huth --- hw/pci-bridge/cxl_upstream.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/pci-bridge/cxl_upstream.c b/hw/pci-bridge/cxl_upstream.c index 537f9affb8..783fa6adac 100644 --- a/hw/pci-bridge/cxl_upstream.c +++ b/hw/pci-bridge/cxl_upstream.c @@ -289,6 +289,7 @@ static void free_default_cdat_table(CDATSubHeader **cdat_table, int num, static void cxl_usp_realize(PCIDevice *d, Error **errp) { + ERRP_GUARD(); PCIEPort *p = PCIE_PORT(d); CXLUpstreamPort *usp = CXL_USP(d); CXLComponentState *cxl_cstate = &usp->cxl_cstate; From ccd1fd0c5d4f4cba7fa8642ab466a82db3e9f093 Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Fri, 23 Feb 2024 16:56:52 +0800 Subject: [PATCH 38/68] hw/vfio/iommufd: Fix missing ERRP_GUARD() in iommufd_cdev_getfd() As the comment in qapi/error, dereferencing @errp requires ERRP_GUARD(): * = Why, when and how to use ERRP_GUARD() = * * Without ERRP_GUARD(), use of the @errp parameter is restricted: * - It must not be dereferenced, because it may be null. ... * ERRP_GUARD() lifts these restrictions. * * To use ERRP_GUARD(), add it right at the beginning of the function. * @errp can then be used without worrying about the argument being * NULL or &error_fatal. * * Using it when it's not needed is safe, but please avoid cluttering * the source with useless code. But in iommufd_cdev_getfd(), @errp is dereferenced without ERRP_GUARD(): if (*errp) { error_prepend(errp, VFIO_MSG_PREFIX, path); } Currently, since vfio_attach_device() - the caller of iommufd_cdev_getfd() - is always called in DeviceClass.realize() context and doesn't get the NULL @errp parameter, iommufd_cdev_getfd() hasn't triggered the bug that dereferencing the NULL @errp. To follow the requirement of @errp, add missing ERRP_GUARD() in iommufd_cdev_getfd(). Suggested-by: Markus Armbruster Signed-off-by: Zhao Liu Reviewed-by: Markus Armbruster Message-Id: <20240223085653.1255438-7-zhao1.liu@linux.intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/vfio/iommufd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 9bfddc1360..7baf49e6ee 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -116,6 +116,7 @@ static void iommufd_cdev_unbind_and_disconnect(VFIODevice *vbasedev) static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) { + ERRP_GUARD(); long int ret = -ENOTTY; char *path, *vfio_dev_path = NULL, *vfio_path = NULL; DIR *dir = NULL; From 0f9c30350bdf9d062609a15124f30f6c2b0a4b60 Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Fri, 23 Feb 2024 16:56:53 +0800 Subject: [PATCH 39/68] hw/intc: Check @errp to handle the error of IOAPICCommonClass.realize() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IOAPICCommonClass implements its own private realize(), and this private realize() allows error. Since IOAPICCommonClass.realize() returns void, to check the error, dereference @errp with ERRP_GUARD(). Signed-off-by: Zhao Liu Message-Id: <20240223085653.1255438-8-zhao1.liu@linux.intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Philippe Mathieu-Daudé --- hw/intc/ioapic_common.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hw/intc/ioapic_common.c b/hw/intc/ioapic_common.c index cb9bf62146..efbe6958c8 100644 --- a/hw/intc/ioapic_common.c +++ b/hw/intc/ioapic_common.c @@ -152,6 +152,7 @@ static int ioapic_dispatch_post_load(void *opaque, int version_id) static void ioapic_common_realize(DeviceState *dev, Error **errp) { + ERRP_GUARD(); IOAPICCommonState *s = IOAPIC_COMMON(dev); IOAPICCommonClass *info; @@ -162,6 +163,9 @@ static void ioapic_common_realize(DeviceState *dev, Error **errp) info = IOAPIC_COMMON_GET_CLASS(s); info->realize(dev, errp); + if (*errp) { + return; + } sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->io_memory); ioapic_no++; From 735eee07d1f963635d3c3bf9f5e4bf1bc000870e Mon Sep 17 00:00:00 2001 From: Felix Wu Date: Wed, 21 Feb 2024 17:00:26 +0000 Subject: [PATCH 40/68] Implement base of SMBIOS type 9 descriptor. Version 2.1+. Signed-off-by: Felix Wu Signed-off-by: Nabih Estefan Message-Id: <20240221170027.1027325-2-nabihestefan@google.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/smbios/smbios.c | 99 ++++++++++++++++++++++++++++++++++++ include/hw/firmware/smbios.h | 13 +++++ qemu-options.hx | 3 ++ 3 files changed, 115 insertions(+) diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c index a3c4e52ce9..38b3ea172c 100644 --- a/hw/smbios/smbios.c +++ b/hw/smbios/smbios.c @@ -121,6 +121,16 @@ struct type8_instance { }; static QTAILQ_HEAD(, type8_instance) type8 = QTAILQ_HEAD_INITIALIZER(type8); +/* type 9 instance for parsing */ +struct type9_instance { + const char *slot_designation; + uint8_t slot_type, slot_data_bus_width, current_usage, slot_length, + slot_characteristics1, slot_characteristics2; + uint16_t slot_id; + QTAILQ_ENTRY(type9_instance) next; +}; +static QTAILQ_HEAD(, type9_instance) type9 = QTAILQ_HEAD_INITIALIZER(type9); + static struct { size_t nvalues; char **values; @@ -380,6 +390,54 @@ static const QemuOptDesc qemu_smbios_type8_opts[] = { { /* end of list */ } }; +static const QemuOptDesc qemu_smbios_type9_opts[] = { + { + .name = "type", + .type = QEMU_OPT_NUMBER, + .help = "SMBIOS element type", + }, + { + .name = "slot_designation", + .type = QEMU_OPT_STRING, + .help = "string number for reference designation", + }, + { + .name = "slot_type", + .type = QEMU_OPT_NUMBER, + .help = "connector type", + }, + { + .name = "slot_data_bus_width", + .type = QEMU_OPT_NUMBER, + .help = "port type", + }, + { + .name = "current_usage", + .type = QEMU_OPT_NUMBER, + .help = "current usage", + }, + { + .name = "slot_length", + .type = QEMU_OPT_NUMBER, + .help = "system slot length", + }, + { + .name = "slot_id", + .type = QEMU_OPT_NUMBER, + .help = "system slot id", + }, + { + .name = "slot_characteristics1", + .type = QEMU_OPT_NUMBER, + .help = "slot characteristics1, see the spec", + }, + { + .name = "slot_characteristics2", + .type = QEMU_OPT_NUMBER, + .help = "slot characteristics2, see the spec", + }, +}; + static const QemuOptDesc qemu_smbios_type11_opts[] = { { .name = "type", @@ -609,6 +667,7 @@ bool smbios_skip_table(uint8_t type, bool required_table) #define T2_BASE 0x200 #define T3_BASE 0x300 #define T4_BASE 0x400 +#define T9_BASE 0x900 #define T11_BASE 0xe00 #define T16_BASE 0x1000 @@ -807,6 +866,28 @@ static void smbios_build_type_8_table(void) } } +static void smbios_build_type_9_table(void) +{ + unsigned instance = 0; + struct type9_instance *t9; + + QTAILQ_FOREACH(t9, &type9, next) { + SMBIOS_BUILD_TABLE_PRE(9, T9_BASE + instance, true); + + SMBIOS_TABLE_SET_STR(9, slot_designation, t9->slot_designation); + t->slot_type = t9->slot_type; + t->slot_data_bus_width = t9->slot_data_bus_width; + t->current_usage = t9->current_usage; + t->slot_length = t9->slot_length; + t->slot_id = t9->slot_id; + t->slot_characteristics1 = t9->slot_characteristics1; + t->slot_characteristics2 = t9->slot_characteristics2; + + SMBIOS_BUILD_TABLE_POST; + instance++; + } +} + static void smbios_build_type_11_table(void) { char count_str[128]; @@ -1126,6 +1207,7 @@ void smbios_get_tables(MachineState *ms, } smbios_build_type_8_table(); + smbios_build_type_9_table(); smbios_build_type_11_table(); #define MAX_DIMM_SZ (16 * GiB) @@ -1460,6 +1542,23 @@ void smbios_entry_add(QemuOpts *opts, Error **errp) t8_i->port_type = qemu_opt_get_number(opts, "port_type", 0); QTAILQ_INSERT_TAIL(&type8, t8_i, next); return; + case 9: { + if (!qemu_opts_validate(opts, qemu_smbios_type9_opts, errp)) { + return; + } + struct type9_instance *t; + t = g_new0(struct type9_instance, 1); + save_opt(&t->slot_designation, opts, "slot_designation"); + t->slot_type = qemu_opt_get_number(opts, "slot_type", 0); + t->slot_data_bus_width = qemu_opt_get_number(opts, "slot_data_bus_width", 0); + t->current_usage = qemu_opt_get_number(opts, "current_usage", 0); + t->slot_length = qemu_opt_get_number(opts, "slot_length", 0); + t->slot_id = qemu_opt_get_number(opts, "slot_id", 0); + t->slot_characteristics1 = qemu_opt_get_number(opts, "slot_characteristics1", 0); + t->slot_characteristics2 = qemu_opt_get_number(opts, "slot_characteristics2", 0); + QTAILQ_INSERT_TAIL(&type9, t, next); + return; + } case 11: if (!qemu_opts_validate(opts, qemu_smbios_type11_opts, errp)) { return; diff --git a/include/hw/firmware/smbios.h b/include/hw/firmware/smbios.h index 6e514982d4..9ab114aea2 100644 --- a/include/hw/firmware/smbios.h +++ b/include/hw/firmware/smbios.h @@ -211,6 +211,19 @@ struct smbios_type_8 { uint8_t port_type; } QEMU_PACKED; +/* SMBIOS type 9 - System Slots (v2.1+) */ +struct smbios_type_9 { + struct smbios_structure_header header; + uint8_t slot_designation; + uint8_t slot_type; + uint8_t slot_data_bus_width; + uint8_t current_usage; + uint8_t slot_length; + uint16_t slot_id; + uint8_t slot_characteristics1; + uint8_t slot_characteristics2; +} QEMU_PACKED; + /* SMBIOS type 11 - OEM strings */ struct smbios_type_11 { struct smbios_structure_header header; diff --git a/qemu-options.hx b/qemu-options.hx index ac4a30fa83..7da9235b08 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -2718,6 +2718,9 @@ SRST ``-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str][,asset=str][,part=str][,processor-family=%d][,processor-id=%d]`` Specify SMBIOS type 4 fields +``-smbios type=9[,slot_designation=str][,slot_type=%d][,slot_data_bus_width=%d][,current_usage=%d][,slot_length=%d][,slot_id=%d][,slot_characteristics1=%d][,slot_characteristics12=%d]`` + Specify SMBIOS type 9 fields + ``-smbios type=11[,value=str][,path=filename]`` Specify SMBIOS type 11 fields From 04f143d828845d0fd52dd4a52664d81a4f5431f7 Mon Sep 17 00:00:00 2001 From: Felix Wu Date: Wed, 21 Feb 2024 17:00:27 +0000 Subject: [PATCH 41/68] Implement SMBIOS type 9 v2.6 Signed-off-by: Felix Wu Signed-off-by: Nabih Estefan Message-Id: <20240221170027.1027325-3-nabihestefan@google.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/smbios/smbios.c | 49 +++++++++++++++++++++++++++++++++--- include/hw/firmware/smbios.h | 4 +++ qemu-options.hx | 2 +- 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c index 38b3ea172c..e3d5d8f2e2 100644 --- a/hw/smbios/smbios.c +++ b/hw/smbios/smbios.c @@ -123,7 +123,7 @@ static QTAILQ_HEAD(, type8_instance) type8 = QTAILQ_HEAD_INITIALIZER(type8); /* type 9 instance for parsing */ struct type9_instance { - const char *slot_designation; + const char *slot_designation, *pcidev; uint8_t slot_type, slot_data_bus_width, current_usage, slot_length, slot_characteristics1, slot_characteristics2; uint16_t slot_id; @@ -436,6 +436,11 @@ static const QemuOptDesc qemu_smbios_type9_opts[] = { .type = QEMU_OPT_NUMBER, .help = "slot characteristics2, see the spec", }, + { + .name = "pci_device", + .type = QEMU_OPT_STRING, + .help = "PCI device, if provided." + } }; static const QemuOptDesc qemu_smbios_type11_opts[] = { @@ -866,7 +871,7 @@ static void smbios_build_type_8_table(void) } } -static void smbios_build_type_9_table(void) +static void smbios_build_type_9_table(Error **errp) { unsigned instance = 0; struct type9_instance *t9; @@ -883,6 +888,43 @@ static void smbios_build_type_9_table(void) t->slot_characteristics1 = t9->slot_characteristics1; t->slot_characteristics2 = t9->slot_characteristics2; + if (t9->pcidev) { + PCIDevice *pdev = NULL; + int rc = pci_qdev_find_device(t9->pcidev, &pdev); + if (rc != 0) { + error_setg(errp, + "No PCI device %s for SMBIOS type 9 entry %s", + t9->pcidev, t9->slot_designation); + return; + } + /* + * We only handle the case were the device is attached to + * the PCI root bus. The general case is more complex as + * bridges are enumerated later and the table would need + * to be updated at this moment. + */ + if (!pci_bus_is_root(pci_get_bus(pdev))) { + error_setg(errp, + "Cannot create type 9 entry for PCI device %s: " + "not attached to the root bus", + t9->pcidev); + return; + } + t->segment_group_number = cpu_to_le16(0); + t->bus_number = pci_dev_bus_num(pdev); + t->device_number = pdev->devfn; + } else { + /* + * Per SMBIOS spec, For slots that are not of the PCI, AGP, PCI-X, + * or PCI-Express type that do not have bus/device/function + * information, 0FFh should be populated in the fields of Segment + * Group Number, Bus Number, Device/Function Number. + */ + t->segment_group_number = 0xff; + t->bus_number = 0xff; + t->device_number = 0xff; + } + SMBIOS_BUILD_TABLE_POST; instance++; } @@ -1207,7 +1249,7 @@ void smbios_get_tables(MachineState *ms, } smbios_build_type_8_table(); - smbios_build_type_9_table(); + smbios_build_type_9_table(errp); smbios_build_type_11_table(); #define MAX_DIMM_SZ (16 * GiB) @@ -1556,6 +1598,7 @@ void smbios_entry_add(QemuOpts *opts, Error **errp) t->slot_id = qemu_opt_get_number(opts, "slot_id", 0); t->slot_characteristics1 = qemu_opt_get_number(opts, "slot_characteristics1", 0); t->slot_characteristics2 = qemu_opt_get_number(opts, "slot_characteristics2", 0); + save_opt(&t->pcidev, opts, "pcidev"); QTAILQ_INSERT_TAIL(&type9, t, next); return; } diff --git a/include/hw/firmware/smbios.h b/include/hw/firmware/smbios.h index 9ab114aea2..c21b8d3285 100644 --- a/include/hw/firmware/smbios.h +++ b/include/hw/firmware/smbios.h @@ -222,6 +222,10 @@ struct smbios_type_9 { uint16_t slot_id; uint8_t slot_characteristics1; uint8_t slot_characteristics2; + /* SMBIOS spec v2.6+ */ + uint16_t segment_group_number; + uint8_t bus_number; + uint8_t device_number; } QEMU_PACKED; /* SMBIOS type 11 - OEM strings */ diff --git a/qemu-options.hx b/qemu-options.hx index 7da9235b08..937fd7ed84 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -2718,7 +2718,7 @@ SRST ``-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str][,asset=str][,part=str][,processor-family=%d][,processor-id=%d]`` Specify SMBIOS type 4 fields -``-smbios type=9[,slot_designation=str][,slot_type=%d][,slot_data_bus_width=%d][,current_usage=%d][,slot_length=%d][,slot_id=%d][,slot_characteristics1=%d][,slot_characteristics12=%d]`` +``-smbios type=9[,slot_designation=str][,slot_type=%d][,slot_data_bus_width=%d][,current_usage=%d][,slot_length=%d][,slot_id=%d][,slot_characteristics1=%d][,slot_characteristics12=%d][,pci_device=str]`` Specify SMBIOS type 9 fields ``-smbios type=11[,value=str][,path=filename]`` From 91bb64a8d2014fda33a81fcf0fce37340f0d3b0c Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Wed, 28 Feb 2024 20:33:12 +0900 Subject: [PATCH 42/68] hw/nvme: Use pcie_sriov_num_vfs() nvme_sriov_pre_write_ctrl() used to directly inspect SR-IOV configurations to know the number of VFs being disabled due to SR-IOV configuration writes, but the logic was flawed and resulted in out-of-bound memory access. It assumed PCI_SRIOV_NUM_VF always has the number of currently enabled VFs, but it actually doesn't in the following cases: - PCI_SRIOV_NUM_VF has been set but PCI_SRIOV_CTRL_VFE has never been. - PCI_SRIOV_NUM_VF was written after PCI_SRIOV_CTRL_VFE was set. - VFs were only partially enabled because of realization failure. It is a responsibility of pcie_sriov to interpret SR-IOV configurations and pcie_sriov does it correctly, so use pcie_sriov_num_vfs(), which it provides, to get the number of enabled VFs before and after SR-IOV configuration writes. Cc: qemu-stable@nongnu.org Fixes: CVE-2024-26328 Fixes: 11871f53ef8e ("hw/nvme: Add support for the Virtualization Management command") Suggested-by: Michael S. Tsirkin Signed-off-by: Akihiko Odaki Message-Id: <20240228-reuse-v8-1-282660281e60@daynix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/nvme/ctrl.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 76fe039704..2860a9bed1 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -8466,36 +8466,26 @@ static void nvme_pci_reset(DeviceState *qdev) nvme_ctrl_reset(n, NVME_RESET_FUNCTION); } -static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, uint32_t address, - uint32_t val, int len) +static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs) { NvmeCtrl *n = NVME(dev); NvmeSecCtrlEntry *sctrl; - uint16_t sriov_cap = dev->exp.sriov_cap; - uint32_t off = address - sriov_cap; - int i, num_vfs; + int i; - if (!sriov_cap) { - return; - } - - if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) { - if (!(val & PCI_SRIOV_CTRL_VFE)) { - num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); - for (i = 0; i < num_vfs; i++) { - sctrl = &n->sec_ctrl_list.sec[i]; - nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); - } - } + for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) { + sctrl = &n->sec_ctrl_list.sec[i]; + nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); } } static void nvme_pci_write_config(PCIDevice *dev, uint32_t address, uint32_t val, int len) { - nvme_sriov_pre_write_ctrl(dev, address, val, len); + uint16_t old_num_vfs = pcie_sriov_num_vfs(dev); + pci_default_write_config(dev, address, val, len); pcie_cap_flr_write_config(dev, address, val, len); + nvme_sriov_post_write_config(dev, old_num_vfs); } static const VMStateDescription nvme_vmstate = { From 6081b4243cd64dff1b2cf5b0c215c71e9d7e753b Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Wed, 28 Feb 2024 20:33:13 +0900 Subject: [PATCH 43/68] pcie_sriov: Validate NumVFs The guest may write NumVFs greater than TotalVFs and that can lead to buffer overflow in VF implementations. Cc: qemu-stable@nongnu.org Fixes: CVE-2024-26327 Fixes: 7c0fa8dff811 ("pcie: Add support for Single Root I/O Virtualization (SR/IOV)") Signed-off-by: Akihiko Odaki Message-Id: <20240228-reuse-v8-2-282660281e60@daynix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Sriram Yagnaraman --- hw/pci/pcie_sriov.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index a1fe65f5d8..da209b7f47 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -176,6 +176,9 @@ static void register_vfs(PCIDevice *dev) assert(sriov_cap > 0); num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); + if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) { + return; + } dev->exp.sriov_pf.vf = g_new(PCIDevice *, num_vfs); From c8bc4db403e17663b69d811e69f88c9dfc6f7be2 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Wed, 28 Feb 2024 20:33:14 +0900 Subject: [PATCH 44/68] pcie_sriov: Reset SR-IOV extended capability pcie_sriov_pf_disable_vfs() is called when resetting the PF, but it only disables VFs and does not reset SR-IOV extended capability, leaking the state and making the VF Enable register inconsistent with the actual state. Replace pcie_sriov_pf_disable_vfs() with pcie_sriov_pf_reset(), which does not only disable VFs but also resets the capability. Signed-off-by: Akihiko Odaki Message-Id: <20240228-reuse-v8-3-282660281e60@daynix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Sriram Yagnaraman --- hw/net/igb.c | 2 +- hw/nvme/ctrl.c | 2 +- hw/pci/pcie_sriov.c | 26 ++++++++++++++++++-------- include/hw/pci/pcie_sriov.h | 4 ++-- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/hw/net/igb.c b/hw/net/igb.c index 0b5c31a58b..9345506f81 100644 --- a/hw/net/igb.c +++ b/hw/net/igb.c @@ -493,7 +493,7 @@ static void igb_qdev_reset_hold(Object *obj) trace_e1000e_cb_qdev_reset_hold(); - pcie_sriov_pf_disable_vfs(d); + pcie_sriov_pf_reset(d); igb_core_reset(&s->core); } diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 2860a9bed1..447c4de6fd 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -7116,7 +7116,7 @@ static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst) } if (rst != NVME_RESET_CONTROLLER) { - pcie_sriov_pf_disable_vfs(pci_dev); + pcie_sriov_pf_reset(pci_dev); } } diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index da209b7f47..51b66d1bb3 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -249,16 +249,26 @@ void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, } -/* Reset SR/IOV VF Enable bit to trigger an unregister of all VFs */ -void pcie_sriov_pf_disable_vfs(PCIDevice *dev) +/* Reset SR/IOV */ +void pcie_sriov_pf_reset(PCIDevice *dev) { uint16_t sriov_cap = dev->exp.sriov_cap; - if (sriov_cap) { - uint32_t val = pci_get_byte(dev->config + sriov_cap + PCI_SRIOV_CTRL); - if (val & PCI_SRIOV_CTRL_VFE) { - val &= ~PCI_SRIOV_CTRL_VFE; - pcie_sriov_config_write(dev, sriov_cap + PCI_SRIOV_CTRL, val, 1); - } + if (!sriov_cap) { + return; + } + + pci_set_word(dev->config + sriov_cap + PCI_SRIOV_CTRL, 0); + unregister_vfs(dev); + + /* + * Default is to use 4K pages, software can modify it + * to any of the supported bits + */ + pci_set_word(dev->config + sriov_cap + PCI_SRIOV_SYS_PGSIZE, 0x1); + + for (uint16_t i = 0; i < PCI_NUM_REGIONS; i++) { + pci_set_quad(dev->config + sriov_cap + PCI_SRIOV_BAR + i * 4, + dev->exp.sriov_pf.vf_bar_type[i]); } } diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h index 095fb0c9ed..b77eb7bf58 100644 --- a/include/hw/pci/pcie_sriov.h +++ b/include/hw/pci/pcie_sriov.h @@ -58,8 +58,8 @@ void pcie_sriov_pf_add_sup_pgsize(PCIDevice *dev, uint16_t opt_sup_pgsize); void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, uint32_t val, int len); -/* Reset SR/IOV VF Enable bit to unregister all VFs */ -void pcie_sriov_pf_disable_vfs(PCIDevice *dev); +/* Reset SR/IOV */ +void pcie_sriov_pf_reset(PCIDevice *dev); /* Get logical VF number of a VF - only valid for VFs */ uint16_t pcie_sriov_vf_number(PCIDevice *dev); From 63eb76dda237843582f3616f4403ae795e471e17 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Wed, 28 Feb 2024 20:33:15 +0900 Subject: [PATCH 45/68] pcie_sriov: Do not reset NumVFs after disabling VFs The spec does not NumVFs is reset after disabling VFs except when resetting the PF. Clearing it is guest visible and out of spec, even though Linux doesn't rely on this value being preserved, so we never noticed. Fixes: 7c0fa8dff811 ("pcie: Add support for Single Root I/O Virtualization (SR/IOV)") Signed-off-by: Akihiko Odaki Message-Id: <20240228-reuse-v8-4-282660281e60@daynix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pcie_sriov.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index 51b66d1bb3..e9b23221d7 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -215,7 +215,6 @@ static void unregister_vfs(PCIDevice *dev) g_free(dev->exp.sriov_pf.vf); dev->exp.sriov_pf.vf = NULL; dev->exp.sriov_pf.num_vfs = 0; - pci_set_word(dev->config + dev->exp.sriov_cap + PCI_SRIOV_NUM_VF, 0); } void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, @@ -260,6 +259,8 @@ void pcie_sriov_pf_reset(PCIDevice *dev) pci_set_word(dev->config + sriov_cap + PCI_SRIOV_CTRL, 0); unregister_vfs(dev); + pci_set_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF, 0); + /* * Default is to use 4K pages, software can modify it * to any of the supported bits From 1a909e3dd85d5c57a0e6a7e3285a29e57574f80d Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Wed, 28 Feb 2024 20:33:16 +0900 Subject: [PATCH 46/68] hw/pci: Always call pcie_sriov_pf_reset() Call pcie_sriov_pf_reset() from pci_do_device_reset() just as we do for msi_reset() and msix_reset() to prevent duplicating code for each SR-IOV PF. Signed-off-by: Akihiko Odaki Message-Id: <20240228-reuse-v8-5-282660281e60@daynix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Sriram Yagnaraman --- hw/net/igb.c | 2 -- hw/nvme/ctrl.c | 4 ---- hw/pci/pci.c | 1 + 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/hw/net/igb.c b/hw/net/igb.c index 9345506f81..9b37523d6d 100644 --- a/hw/net/igb.c +++ b/hw/net/igb.c @@ -488,12 +488,10 @@ static void igb_pci_uninit(PCIDevice *pci_dev) static void igb_qdev_reset_hold(Object *obj) { - PCIDevice *d = PCI_DEVICE(obj); IGBState *s = IGB(obj); trace_e1000e_cb_qdev_reset_hold(); - pcie_sriov_pf_reset(d); igb_core_reset(&s->core); } diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 447c4de6fd..40159f39b8 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -7114,10 +7114,6 @@ static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst) sctrl = &n->sec_ctrl_list.sec[i]; nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); } - - if (rst != NVME_RESET_CONTROLLER) { - pcie_sriov_pf_reset(pci_dev); - } } if (rst != NVME_RESET_CONTROLLER) { diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 6496d027ca..e7a39cb203 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -409,6 +409,7 @@ static void pci_do_device_reset(PCIDevice *dev) msi_reset(dev); msix_reset(dev); + pcie_sriov_pf_reset(dev); } /* From e4e98c7eebfab0f844784cdf5c108bb3d60ac9a2 Mon Sep 17 00:00:00 2001 From: Ani Sinha Date: Wed, 28 Feb 2024 20:03:51 +0530 Subject: [PATCH 47/68] pc: q35: Bump max_cpus to 4096 vcpus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit f10a570b093e6 ("KVM: x86: Add CONFIG_KVM_MAX_NR_VCPUS to allow up to 4096 vCPUs") Linux kernel can support upto a maximum number of 4096 vcpus when MAXSMP is enabled in the kernel. At present, QEMU has been tested to correctly boot a linux guest with 4096 vcpus using the current edk2 upstream master branch that has the fixes corresponding to the following two PRs: https://github.com/tianocore/edk2/pull/5410 https://github.com/tianocore/edk2/pull/5418 The changes merged into edk2 with the above PRs will be in the upcoming 2024-05 release. With current seabios firmware, it boots fine with 4096 vcpus already. So bump up the value max_cpus to 4096 for q35 machines versions 9 and newer. Q35 machines versions 8.2 and older continue to support 1024 maximum vcpus as before for compatibility reasons. If KVM is not able to support the specified number of vcpus, QEMU would return the following error messages: $ ./qemu-system-x86_64 -cpu host -accel kvm -machine q35 -smp 1728 qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (1728) exceeds the recommended cpus supported by KVM (12) qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (1728) exceeds the recommended cpus supported by KVM (12) Number of SMP cpus requested (1728) exceeds the maximum cpus supported by KVM (1024) Cc: Daniel P. Berrangé Cc: Igor Mammedov Cc: Michael S. Tsirkin Cc: Julia Suvorova Cc: kraxel@redhat.com Reviewed-by: Daniel P. Berrangé Reviewed-by: Igor Mammedov Reviewed-by: Gerd Hoffmann Signed-off-by: Ani Sinha Message-Id: <20240228143351.3967-1-anisinha@redhat.com> Reviewed-by: Zhao Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc_q35.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 45a4102e75..df63a92b78 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -350,7 +350,7 @@ static void pc_q35_machine_options(MachineClass *m) m->default_nic = "e1000e"; m->default_kernel_irqchip_split = false; m->no_floppy = 1; - m->max_cpus = 1024; + m->max_cpus = 4096; m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL); machine_class_allow_dynamic_sysbus_dev(m, TYPE_AMD_IOMMU_DEVICE); machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE); @@ -371,6 +371,7 @@ static void pc_q35_8_2_machine_options(MachineClass *m) { pc_q35_9_0_machine_options(m); m->alias = NULL; + m->max_cpus = 1024; compat_props_add(m->compat_props, hw_compat_8_2, hw_compat_8_2_len); compat_props_add(m->compat_props, pc_compat_8_2, pc_compat_8_2_len); } From 0fbe8d7c4c7e44fc86986217dc3ab65ec4e6a302 Mon Sep 17 00:00:00 2001 From: Bernhard Beschow Date: Mon, 26 Feb 2024 22:59:08 +0100 Subject: [PATCH 48/68] Revert "hw/i386/pc_sysfw: Inline pc_system_flash_create() and remove it" Commit 6f6ad2b24582 "hw/i386/pc: Confine system flash handling to pc_sysfw" causes a regression when specifying the property `-M pflash0` in the PCI PC machines: qemu-system-x86_64: Property 'pc-q35-9.0-machine.pflash0' not found In order to revert the commit, the commit below must be reverted first. This reverts commit cb05cc16029bb0a61ac5279ab7b3b90dcf2aa69f. Signed-off-by: Bernhard Beschow Message-Id: <20240226215909.30884-2-shentey@gmail.com> Tested-by: Alex Williamson Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc_sysfw.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c index b02e285579..b9c1eb352d 100644 --- a/hw/i386/pc_sysfw.c +++ b/hw/i386/pc_sysfw.c @@ -91,6 +91,18 @@ static PFlashCFI01 *pc_pflash_create(PCMachineState *pcms, return PFLASH_CFI01(dev); } +static void pc_system_flash_create(PCMachineState *pcms) +{ + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + + if (pcmc->pci_enabled) { + pcms->flash[0] = pc_pflash_create(pcms, "system.flash0", + "pflash0"); + pcms->flash[1] = pc_pflash_create(pcms, "system.flash1", + "pflash1"); + } +} + static void pc_system_flash_cleanup_unused(PCMachineState *pcms) { char *prop_name; @@ -198,8 +210,7 @@ void pc_system_firmware_init(PCMachineState *pcms, return; } - pcms->flash[0] = pc_pflash_create(pcms, "system.flash0", "pflash0"); - pcms->flash[1] = pc_pflash_create(pcms, "system.flash1", "pflash1"); + pc_system_flash_create(pcms); /* Map legacy -drive if=pflash to machine properties */ for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) { From f2cb9f34ad8591a7530ecec856f2302bb3efd71e Mon Sep 17 00:00:00 2001 From: Bernhard Beschow Date: Mon, 26 Feb 2024 22:59:09 +0100 Subject: [PATCH 49/68] Revert "hw/i386/pc: Confine system flash handling to pc_sysfw" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Specifying the property `-M pflash0` results in a regression: qemu-system-x86_64: Property 'pc-q35-9.0-machine.pflash0' not found Revert the change for now until a solution is found. This reverts commit 6f6ad2b24582593d8feb00434ce2396840666227. Reported-by: Volker Rümelin Signed-off-by: Bernhard Beschow Message-Id: <20240226215909.30884-3-shentey@gmail.com> Tested-by: Alex Williamson Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 1 + hw/i386/pc_piix.c | 1 + hw/i386/pc_sysfw.c | 6 ++---- include/hw/i386/pc.h | 2 ++ 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index f5ff970acf..24e8879894 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1750,6 +1750,7 @@ static void pc_machine_initfn(Object *obj) pcms->fd_bootchk = true; pcms->default_bus_bypass_iommu = false; + pc_system_flash_create(pcms); pcms->pcspk = isa_new(TYPE_PC_SPEAKER); object_property_add_alias(OBJECT(pcms), "pcspk-audiodev", OBJECT(pcms->pcspk), "audiodev"); diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 319bc4b180..9dd9f5562e 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -228,6 +228,7 @@ static void pc_init1(MachineState *machine, const char *pci_type) assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size); + pc_system_flash_cleanup_unused(pcms); if (machine->kernel_filename != NULL) { /* For xen HVM direct kernel boot, load linux here */ xen_load_linux(pcms); diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c index b9c1eb352d..3efabbbab2 100644 --- a/hw/i386/pc_sysfw.c +++ b/hw/i386/pc_sysfw.c @@ -91,7 +91,7 @@ static PFlashCFI01 *pc_pflash_create(PCMachineState *pcms, return PFLASH_CFI01(dev); } -static void pc_system_flash_create(PCMachineState *pcms) +void pc_system_flash_create(PCMachineState *pcms) { PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); @@ -103,7 +103,7 @@ static void pc_system_flash_create(PCMachineState *pcms) } } -static void pc_system_flash_cleanup_unused(PCMachineState *pcms) +void pc_system_flash_cleanup_unused(PCMachineState *pcms) { char *prop_name; int i; @@ -210,8 +210,6 @@ void pc_system_firmware_init(PCMachineState *pcms, return; } - pc_system_flash_create(pcms); - /* Map legacy -drive if=pflash to machine properties */ for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) { pflash_cfi01_legacy_drive(pcms->flash[i], diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index b958023187..8458487a5e 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -190,6 +190,8 @@ void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs); #define TYPE_PORT92 "port92" /* pc_sysfw.c */ +void pc_system_flash_create(PCMachineState *pcms); +void pc_system_flash_cleanup_unused(PCMachineState *pcms); void pc_system_firmware_init(PCMachineState *pcms, MemoryRegion *rom_memory); bool pc_system_ovmf_table_find(const char *entry, uint8_t **data, int *data_len); From 6cd2b093e771ee606b4e065f615d9b2fafd76d22 Mon Sep 17 00:00:00 2001 From: Bernhard Beschow Date: Sun, 3 Mar 2024 19:53:29 +0100 Subject: [PATCH 50/68] hw/i386/pc: Remove "rtc_state" link again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 99e1c1137b6f "hw/i386/pc: Populate RTC attribute directly" made linking the "rtc_state" property unnecessary and removed it. Commit 84e945aad2d0 "vl, pc: turn -no-fd-bootchk into a machine property" accidently reintroduced the link. Remove it again since it is not needed. Fixes: 84e945aad2d0 "vl, pc: turn -no-fd-bootchk into a machine property" Cc: Paolo Bonzini Signed-off-by: Bernhard Beschow Message-Id: <20240303185332.1408-2-shentey@gmail.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Philippe Mathieu-Daudé --- hw/i386/pc.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 24e8879894..98bcf4d420 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -613,14 +613,6 @@ void pc_cmos_init(PCMachineState *pcms, mc146818rtc_set_cmos_data(s, 0x5c, val >> 8); mc146818rtc_set_cmos_data(s, 0x5d, val >> 16); - object_property_add_link(OBJECT(pcms), "rtc_state", - TYPE_ISA_DEVICE, - (Object **)&x86ms->rtc, - object_property_allow_set_link, - OBJ_PROP_LINK_STRONG); - object_property_set_link(OBJECT(pcms), "rtc_state", OBJECT(s), - &error_abort); - set_boot_dev(pcms, s, MACHINE(pcms)->boot_config.order, &error_fatal); val = 0; From 6605d09791df7387c3e82c9554731a752789987e Mon Sep 17 00:00:00 2001 From: Bernhard Beschow Date: Sun, 3 Mar 2024 19:53:30 +0100 Subject: [PATCH 51/68] hw/i386/pc: Avoid one use of the current_machine global MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RTC can be accessed through the X86 machine instance, so rather than passing the RTC it's possible to pass the machine state instead. This avoids pc_boot_set() from having to access the current_machine global. Signed-off-by: Bernhard Beschow Message-Id: <20240303185332.1408-3-shentey@gmail.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Philippe Mathieu-Daudé --- hw/i386/pc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 98bcf4d420..4077d8162c 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -425,9 +425,10 @@ static void set_boot_dev(PCMachineState *pcms, MC146818RtcState *s, static void pc_boot_set(void *opaque, const char *boot_device, Error **errp) { - PCMachineState *pcms = PC_MACHINE(current_machine); + PCMachineState *pcms = opaque; + X86MachineState *x86ms = X86_MACHINE(pcms); - set_boot_dev(pcms, opaque, boot_device, errp); + set_boot_dev(pcms, MC146818_RTC(x86ms->rtc), boot_device, errp); } static void pc_cmos_init_floppy(MC146818RtcState *rtc_state, ISADevice *floppy) @@ -1252,7 +1253,7 @@ void pc_basic_device_init(struct PCMachineState *pcms, } #endif - qemu_register_boot_set(pc_boot_set, rtc_state); + qemu_register_boot_set(pc_boot_set, pcms); if (!xen_enabled() && (x86ms->pit == ON_OFF_AUTO_AUTO || x86ms->pit == ON_OFF_AUTO_ON)) { From c5e2d74448433c3d800dca6160b5c8fbd76c67d8 Mon Sep 17 00:00:00 2001 From: Bernhard Beschow Date: Sun, 3 Mar 2024 19:53:31 +0100 Subject: [PATCH 52/68] hw/i386/pc: Set "normal" boot device order in pc_basic_device_init() The boot device order may change during the lifetime of a VM. Usually, the "normal" order is set once during machine init(). However, if a user specifies `-boot once=...`, the "normal" order is overwritten by the "once" order just before machine_done, and a reset handler is registered which restores the "normal" order during the next reset. In the next patch, pc_cmos_init() will be inlined into pc_cmos_init_late() which runs during machine_done. This means that the "once" boot order would be overwritten again with the "normal" boot order -- which renders the user's choice ineffective. Fix this by setting the "normal" boot order in pc_basic_device_init() which already registers the boot_set() handler. Signed-off-by: Bernhard Beschow Message-Id: <20240303185332.1408-4-shentey@gmail.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 4077d8162c..c614697ca6 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -614,8 +614,6 @@ void pc_cmos_init(PCMachineState *pcms, mc146818rtc_set_cmos_data(s, 0x5c, val >> 8); mc146818rtc_set_cmos_data(s, 0x5d, val >> 16); - set_boot_dev(pcms, s, MACHINE(pcms)->boot_config.order, &error_fatal); - val = 0; val |= 0x02; /* FPU is there */ val |= 0x04; /* PS/2 mouse installed */ @@ -1254,6 +1252,8 @@ void pc_basic_device_init(struct PCMachineState *pcms, #endif qemu_register_boot_set(pc_boot_set, pcms); + set_boot_dev(pcms, MC146818_RTC(rtc_state), + MACHINE(pcms)->boot_config.order, &error_fatal); if (!xen_enabled() && (x86ms->pit == ON_OFF_AUTO_AUTO || x86ms->pit == ON_OFF_AUTO_ON)) { From 7d12305ec8c3fa8c60b39aae0d952245a62696d4 Mon Sep 17 00:00:00 2001 From: Bernhard Beschow Date: Sun, 3 Mar 2024 19:53:32 +0100 Subject: [PATCH 53/68] hw/i386/pc: Inline pc_cmos_init() into pc_cmos_init_late() and remove it Now that pc_cmos_init() doesn't populate the X86MachineState::rtc attribute any longer, its duties can be merged into pc_cmos_init_late() which is called within machine_done notifier. This frees pc_piix and pc_q35 from explicit CMOS initialization. Signed-off-by: Bernhard Beschow Message-Id: <20240303185332.1408-5-shentey@gmail.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 10 ---------- hw/i386/pc_piix.c | 2 -- hw/i386/pc_q35.c | 2 -- include/hw/i386/pc.h | 2 -- 4 files changed, 16 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index c614697ca6..1590352bb5 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -570,14 +570,6 @@ static void pc_cmos_init_late(PCMachineState *pcms) mc146818rtc_set_cmos_data(s, 0x39, val); pc_cmos_init_floppy(s, pc_find_fdc0()); -} - -void pc_cmos_init(PCMachineState *pcms, - ISADevice *rtc) -{ - int val; - X86MachineState *x86ms = X86_MACHINE(pcms); - MC146818RtcState *s = MC146818_RTC(rtc); /* various important CMOS locations needed by PC/Bochs bios */ @@ -618,8 +610,6 @@ void pc_cmos_init(PCMachineState *pcms, val |= 0x02; /* FPU is there */ val |= 0x04; /* PS/2 mouse installed */ mc146818rtc_set_cmos_data(s, REG_EQUIPMENT_BYTE, val); - - /* hard drives and FDC are handled by pc_cmos_init_late() */ } static void handle_a20_line_change(void *opaque, int irq, int level) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 9dd9f5562e..c9a6c0aa68 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -344,8 +344,6 @@ static void pc_init1(MachineState *machine, const char *pci_type) } #endif - pc_cmos_init(pcms, x86ms->rtc); - if (piix4_pm) { smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0); diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index df63a92b78..1356c5d107 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -311,8 +311,6 @@ static void pc_q35_init(MachineState *machine) smbus_eeprom_init(pcms->smbus, 8, NULL, 0); } - pc_cmos_init(pcms, x86ms->rtc); - /* the rest devices to which pci devfn is automatically assigned */ pc_vga_init(isa_bus, pcms->pcibus); pc_nic_init(pcmc, isa_bus, pcms->pcibus); diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 8458487a5e..27a68071d7 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -178,8 +178,6 @@ void pc_basic_device_init(struct PCMachineState *pcms, ISADevice *rtc_state, bool create_fdctrl, uint32_t hpet_irqs); -void pc_cmos_init(PCMachineState *pcms, - ISADevice *s); void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus); void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs); From b64b7ed8bb5d475eab3b4188ef0076e332937c62 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Fri, 8 Mar 2024 14:55:23 +0000 Subject: [PATCH 54/68] qom: new object to associate device to NUMA node NVIDIA GPU's support MIG (Mult-Instance GPUs) feature [1], which allows partitioning of the GPU device resources (including device memory) into several (upto 8) isolated instances. Each of the partitioned memory needs a dedicated NUMA node to operate. The partitions are not fixed and they can be created/deleted at runtime. Unfortunately Linux OS does not provide a means to dynamically create/destroy NUMA nodes and such feature implementation is not expected to be trivial. The nodes that OS discovers at the boot time while parsing SRAT remains fixed. So we utilize the Generic Initiator (GI) Affinity structures that allows association between nodes and devices. Multiple GI structures per BDF is possible, allowing creation of multiple nodes by exposing unique PXM in each of these structures. Implement the mechanism to build the GI affinity structures as Qemu currently does not. Introduce a new acpi-generic-initiator object to allow host admin link a device with an associated NUMA node. Qemu maintains this association and use this object to build the requisite GI Affinity Structure. When multiple NUMA nodes are associated with a device, it is required to create those many number of acpi-generic-initiator objects, each representing a unique device:node association. Following is one of a decoded GI affinity structure in VM ACPI SRAT. [0C8h 0200 1] Subtable Type : 05 [Generic Initiator Affinity] [0C9h 0201 1] Length : 20 [0CAh 0202 1] Reserved1 : 00 [0CBh 0203 1] Device Handle Type : 01 [0CCh 0204 4] Proximity Domain : 00000007 [0D0h 0208 16] Device Handle : 00 00 20 00 00 00 00 00 00 00 00 00 00 00 00 00 [0E0h 0224 4] Flags (decoded below) : 00000001 Enabled : 1 [0E4h 0228 4] Reserved2 : 00000000 [0E8h 0232 1] Subtable Type : 05 [Generic Initiator Affinity] [0E9h 0233 1] Length : 20 An admin can provide a range of acpi-generic-initiator objects, each associating a device (by providing the id through pci-dev argument) to the desired NUMA node (using the node argument). Currently, only PCI device is supported. For the grace hopper system, create a range of 8 nodes and associate that with the device using the acpi-generic-initiator object. While a configuration of less than 8 nodes per device is allowed, such configuration will prevent utilization of the feature to the fullest. The following sample creates 8 nodes per PCI device for a VM with 2 PCI devices and link them to the respecitve PCI device using acpi-generic-initiator objects: -numa node,nodeid=2 -numa node,nodeid=3 -numa node,nodeid=4 \ -numa node,nodeid=5 -numa node,nodeid=6 -numa node,nodeid=7 \ -numa node,nodeid=8 -numa node,nodeid=9 \ -device vfio-pci-nohotplug,host=0009:01:00.0,bus=pcie.0,addr=04.0,rombar=0,id=dev0 \ -object acpi-generic-initiator,id=gi0,pci-dev=dev0,node=2 \ -object acpi-generic-initiator,id=gi1,pci-dev=dev0,node=3 \ -object acpi-generic-initiator,id=gi2,pci-dev=dev0,node=4 \ -object acpi-generic-initiator,id=gi3,pci-dev=dev0,node=5 \ -object acpi-generic-initiator,id=gi4,pci-dev=dev0,node=6 \ -object acpi-generic-initiator,id=gi5,pci-dev=dev0,node=7 \ -object acpi-generic-initiator,id=gi6,pci-dev=dev0,node=8 \ -object acpi-generic-initiator,id=gi7,pci-dev=dev0,node=9 \ -numa node,nodeid=10 -numa node,nodeid=11 -numa node,nodeid=12 \ -numa node,nodeid=13 -numa node,nodeid=14 -numa node,nodeid=15 \ -numa node,nodeid=16 -numa node,nodeid=17 \ -device vfio-pci-nohotplug,host=0009:01:01.0,bus=pcie.0,addr=05.0,rombar=0,id=dev1 \ -object acpi-generic-initiator,id=gi8,pci-dev=dev1,node=10 \ -object acpi-generic-initiator,id=gi9,pci-dev=dev1,node=11 \ -object acpi-generic-initiator,id=gi10,pci-dev=dev1,node=12 \ -object acpi-generic-initiator,id=gi11,pci-dev=dev1,node=13 \ -object acpi-generic-initiator,id=gi12,pci-dev=dev1,node=14 \ -object acpi-generic-initiator,id=gi13,pci-dev=dev1,node=15 \ -object acpi-generic-initiator,id=gi14,pci-dev=dev1,node=16 \ -object acpi-generic-initiator,id=gi15,pci-dev=dev1,node=17 \ Link: https://www.nvidia.com/en-in/technologies/multi-instance-gpu [1] Cc: Jonathan Cameron Cc: Alex Williamson Cc: Markus Armbruster Reviewed-by: Jonathan Cameron Acked-by: Markus Armbruster Signed-off-by: Ankit Agrawal Message-Id: <20240308145525.10886-2-ankita@nvidia.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/acpi_generic_initiator.c | 71 ++++++++++++++++++++++++ hw/acpi/meson.build | 1 + include/hw/acpi/acpi_generic_initiator.h | 22 ++++++++ qapi/qom.json | 17 ++++++ 4 files changed, 111 insertions(+) create mode 100644 hw/acpi/acpi_generic_initiator.c create mode 100644 include/hw/acpi/acpi_generic_initiator.h diff --git a/hw/acpi/acpi_generic_initiator.c b/hw/acpi/acpi_generic_initiator.c new file mode 100644 index 0000000000..130d6ae8c1 --- /dev/null +++ b/hw/acpi/acpi_generic_initiator.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include "qemu/osdep.h" +#include "hw/acpi/acpi_generic_initiator.h" +#include "hw/boards.h" +#include "qemu/error-report.h" + +typedef struct AcpiGenericInitiatorClass { + ObjectClass parent_class; +} AcpiGenericInitiatorClass; + +OBJECT_DEFINE_TYPE_WITH_INTERFACES(AcpiGenericInitiator, acpi_generic_initiator, + ACPI_GENERIC_INITIATOR, OBJECT, + { TYPE_USER_CREATABLE }, + { NULL }) + +OBJECT_DECLARE_SIMPLE_TYPE(AcpiGenericInitiator, ACPI_GENERIC_INITIATOR) + +static void acpi_generic_initiator_init(Object *obj) +{ + AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj); + + gi->node = MAX_NODES; + gi->pci_dev = NULL; +} + +static void acpi_generic_initiator_finalize(Object *obj) +{ + AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj); + + g_free(gi->pci_dev); +} + +static void acpi_generic_initiator_set_pci_device(Object *obj, const char *val, + Error **errp) +{ + AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj); + + gi->pci_dev = g_strdup(val); +} + +static void acpi_generic_initiator_set_node(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj); + uint32_t value; + + if (!visit_type_uint32(v, name, &value, errp)) { + return; + } + + if (value >= MAX_NODES) { + error_printf("%s: Invalid NUMA node specified\n", + TYPE_ACPI_GENERIC_INITIATOR); + exit(1); + } + + gi->node = value; +} + +static void acpi_generic_initiator_class_init(ObjectClass *oc, void *data) +{ + object_class_property_add_str(oc, "pci-dev", NULL, + acpi_generic_initiator_set_pci_device); + object_class_property_add(oc, "node", "int", NULL, + acpi_generic_initiator_set_node, NULL, NULL); +} diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build index 5441c9b1e4..fa5c07db90 100644 --- a/hw/acpi/meson.build +++ b/hw/acpi/meson.build @@ -1,5 +1,6 @@ acpi_ss = ss.source_set() acpi_ss.add(files( + 'acpi_generic_initiator.c', 'acpi_interface.c', 'aml-build.c', 'bios-linker-loader.c', diff --git a/include/hw/acpi/acpi_generic_initiator.h b/include/hw/acpi/acpi_generic_initiator.h new file mode 100644 index 0000000000..16de1d3d80 --- /dev/null +++ b/include/hw/acpi/acpi_generic_initiator.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef ACPI_GENERIC_INITIATOR_H +#define ACPI_GENERIC_INITIATOR_H + +#include "qom/object_interfaces.h" + +#define TYPE_ACPI_GENERIC_INITIATOR "acpi-generic-initiator" + +typedef struct AcpiGenericInitiator { + /* private */ + Object parent; + + /* public */ + char *pci_dev; + uint16_t node; +} AcpiGenericInitiator; + +#endif diff --git a/qapi/qom.json b/qapi/qom.json index 032c6fa037..baae3a183f 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -811,6 +811,21 @@ { 'struct': 'IOMMUFDProperties', 'data': { '*fd': 'str' } } +## +# @AcpiGenericInitiatorProperties: +# +# Properties for acpi-generic-initiator objects. +# +# @pci-dev: PCI device ID to be associated with the node +# +# @node: NUMA node associated with the PCI device +# +# Since: 9.0 +## +{ 'struct': 'AcpiGenericInitiatorProperties', + 'data': { 'pci-dev': 'str', + 'node': 'uint32' } } + ## # @RngProperties: # @@ -928,6 +943,7 @@ ## { 'enum': 'ObjectType', 'data': [ + 'acpi-generic-initiator', 'authz-list', 'authz-listfile', 'authz-pam', @@ -999,6 +1015,7 @@ 'id': 'str' }, 'discriminator': 'qom-type', 'data': { + 'acpi-generic-initiator': 'AcpiGenericInitiatorProperties', 'authz-list': 'AuthZListProperties', 'authz-listfile': 'AuthZListFileProperties', 'authz-pam': 'AuthZPAMProperties', From 0a5b5acdf2d8c7302ca48d42e6ef3423e1b956d5 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Fri, 8 Mar 2024 14:55:24 +0000 Subject: [PATCH 55/68] hw/acpi: Implement the SRAT GI affinity structure ACPI spec provides a scheme to associate "Generic Initiators" [1] (e.g. heterogeneous processors and accelerators, GPUs, and I/O devices with integrated compute or DMA engines GPUs) with Proximity Domains. This is achieved using Generic Initiator Affinity Structure in SRAT. During bootup, Linux kernel parse the ACPI SRAT to determine the PXM ids and create a NUMA node for each unique PXM ID encountered. Qemu currently do not implement these structures while building SRAT. Add GI structures while building VM ACPI SRAT. The association between device and node are stored using acpi-generic-initiator object. Lookup presence of all such objects and use them to build these structures. The structure needs a PCI device handle [2] that consists of the device BDF. The vfio-pci device corresponding to the acpi-generic-initiator object is located to determine the BDF. [1] ACPI Spec 6.3, Section 5.2.16.6 [2] ACPI Spec 6.3, Table 5.80 Cc: Jonathan Cameron Cc: Alex Williamson Cc: Cedric Le Goater Reviewed-by: Jonathan Cameron Signed-off-by: Ankit Agrawal Message-Id: <20240308145525.10886-3-ankita@nvidia.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/acpi_generic_initiator.c | 77 ++++++++++++++++++++++++ hw/acpi/hmat.c | 2 +- hw/arm/virt-acpi-build.c | 3 + hw/core/numa.c | 3 +- include/hw/acpi/acpi_generic_initiator.h | 25 ++++++++ include/sysemu/numa.h | 1 + 6 files changed, 109 insertions(+), 2 deletions(-) diff --git a/hw/acpi/acpi_generic_initiator.c b/hw/acpi/acpi_generic_initiator.c index 130d6ae8c1..17b9a052f5 100644 --- a/hw/acpi/acpi_generic_initiator.c +++ b/hw/acpi/acpi_generic_initiator.c @@ -5,7 +5,9 @@ #include "qemu/osdep.h" #include "hw/acpi/acpi_generic_initiator.h" +#include "hw/acpi/aml-build.h" #include "hw/boards.h" +#include "hw/pci/pci_device.h" #include "qemu/error-report.h" typedef struct AcpiGenericInitiatorClass { @@ -47,6 +49,7 @@ static void acpi_generic_initiator_set_node(Object *obj, Visitor *v, Error **errp) { AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj); + MachineState *ms = MACHINE(qdev_get_machine()); uint32_t value; if (!visit_type_uint32(v, name, &value, errp)) { @@ -60,6 +63,7 @@ static void acpi_generic_initiator_set_node(Object *obj, Visitor *v, } gi->node = value; + ms->numa_state->nodes[gi->node].has_gi = true; } static void acpi_generic_initiator_class_init(ObjectClass *oc, void *data) @@ -69,3 +73,76 @@ static void acpi_generic_initiator_class_init(ObjectClass *oc, void *data) object_class_property_add(oc, "node", "int", NULL, acpi_generic_initiator_set_node, NULL, NULL); } + +/* + * ACPI 6.3: + * Table 5-78 Generic Initiator Affinity Structure + */ +static void +build_srat_generic_pci_initiator_affinity(GArray *table_data, int node, + PCIDeviceHandle *handle) +{ + uint8_t index; + + build_append_int_noprefix(table_data, 5, 1); /* Type */ + build_append_int_noprefix(table_data, 32, 1); /* Length */ + build_append_int_noprefix(table_data, 0, 1); /* Reserved */ + build_append_int_noprefix(table_data, 1, 1); /* Device Handle Type: PCI */ + build_append_int_noprefix(table_data, node, 4); /* Proximity Domain */ + + /* Device Handle - PCI */ + build_append_int_noprefix(table_data, handle->segment, 2); + build_append_int_noprefix(table_data, handle->bdf, 2); + for (index = 0; index < 12; index++) { + build_append_int_noprefix(table_data, 0, 1); + } + + build_append_int_noprefix(table_data, GEN_AFFINITY_ENABLED, 4); /* Flags */ + build_append_int_noprefix(table_data, 0, 4); /* Reserved */ +} + +static int build_all_acpi_generic_initiators(Object *obj, void *opaque) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + AcpiGenericInitiator *gi; + GArray *table_data = opaque; + PCIDeviceHandle dev_handle; + PCIDevice *pci_dev; + Object *o; + + if (!object_dynamic_cast(obj, TYPE_ACPI_GENERIC_INITIATOR)) { + return 0; + } + + gi = ACPI_GENERIC_INITIATOR(obj); + if (gi->node >= ms->numa_state->num_nodes) { + error_printf("%s: Specified node %d is invalid.\n", + TYPE_ACPI_GENERIC_INITIATOR, gi->node); + exit(1); + } + + o = object_resolve_path_type(gi->pci_dev, TYPE_PCI_DEVICE, NULL); + if (!o) { + error_printf("%s: Specified device must be a PCI device.\n", + TYPE_ACPI_GENERIC_INITIATOR); + exit(1); + } + + pci_dev = PCI_DEVICE(o); + + dev_handle.segment = 0; + dev_handle.bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), + pci_dev->devfn); + + build_srat_generic_pci_initiator_affinity(table_data, + gi->node, &dev_handle); + + return 0; +} + +void build_srat_generic_pci_initiator(GArray *table_data) +{ + object_child_foreach_recursive(object_get_root(), + build_all_acpi_generic_initiators, + table_data); +} diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c index 3042d223c8..2242981e18 100644 --- a/hw/acpi/hmat.c +++ b/hw/acpi/hmat.c @@ -214,7 +214,7 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) } for (i = 0; i < numa_state->num_nodes; i++) { - if (numa_state->nodes[i].has_cpu) { + if (numa_state->nodes[i].has_cpu || numa_state->nodes[i].has_gi) { initiator_list[num_initiator++] = i; } } diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 6a1bde61ce..c3ccfef026 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -57,6 +57,7 @@ #include "migration/vmstate.h" #include "hw/acpi/ghes.h" #include "hw/acpi/viot.h" +#include "hw/acpi/acpi_generic_initiator.h" #include "hw/virtio/virtio-acpi.h" #include "target/arm/multiprocessing.h" @@ -504,6 +505,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) } } + build_srat_generic_pci_initiator(table_data); + if (ms->nvdimms_state->is_enabled) { nvdimm_build_srat(table_data); } diff --git a/hw/core/numa.c b/hw/core/numa.c index f08956ddb0..58a32f1564 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -229,7 +229,8 @@ void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, node->target, numa_state->num_nodes); return; } - if (!numa_info[node->initiator].has_cpu) { + if (!numa_info[node->initiator].has_cpu && + !numa_info[node->initiator].has_gi) { error_setg(errp, "Invalid initiator=%d, it isn't an " "initiator proximity domain", node->initiator); return; diff --git a/include/hw/acpi/acpi_generic_initiator.h b/include/hw/acpi/acpi_generic_initiator.h index 16de1d3d80..a304bad73e 100644 --- a/include/hw/acpi/acpi_generic_initiator.h +++ b/include/hw/acpi/acpi_generic_initiator.h @@ -19,4 +19,29 @@ typedef struct AcpiGenericInitiator { uint16_t node; } AcpiGenericInitiator; +/* + * ACPI 6.3: + * Table 5-81 Flags – Generic Initiator Affinity Structure + */ +typedef enum { + /* + * If clear, the OSPM ignores the contents of the Generic + * Initiator/Port Affinity Structure. This allows system firmware + * to populate the SRAT with a static number of structures, but only + * enable them as necessary. + */ + GEN_AFFINITY_ENABLED = (1 << 0), +} GenericAffinityFlags; + +/* + * ACPI 6.3: + * Table 5-80 Device Handle - PCI + */ +typedef struct PCIDeviceHandle { + uint16_t segment; + uint16_t bdf; +} PCIDeviceHandle; + +void build_srat_generic_pci_initiator(GArray *table_data); + #endif diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h index 4173ef2afa..825cfe86bc 100644 --- a/include/sysemu/numa.h +++ b/include/sysemu/numa.h @@ -41,6 +41,7 @@ struct NodeInfo { struct HostMemoryBackend *node_memdev; bool present; bool has_cpu; + bool has_gi; uint8_t lb_info_provided; uint16_t initiator; uint8_t distance[MAX_NODES]; From 5deced6a13de9409fd9114432b25072189a68942 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Fri, 8 Mar 2024 14:55:25 +0000 Subject: [PATCH 56/68] hw/i386/acpi-build: Add support for SRAT Generic Initiator structures The acpi-generic-initiator object is added to allow a host device to be linked with a NUMA node. Qemu use it to build the SRAT Generic Initiator Affinity structure [1]. Add support for i386. [1] ACPI Spec 6.3, Section 5.2.16.6 Suggested-by: Jonathan Cameron Signed-off-by: Ankit Agrawal Message-Id: <20240308145525.10886-4-ankita@nvidia.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Jonathan Cameron --- hw/i386/acpi-build.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 15242b9096..53f804ac16 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -68,6 +68,7 @@ #include "hw/acpi/utils.h" #include "hw/acpi/pci.h" #include "hw/acpi/cxl.h" +#include "hw/acpi/acpi_generic_initiator.h" #include "qom/qom-qobject.h" #include "hw/i386/amd_iommu.h" @@ -2046,6 +2047,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) build_srat_memory(table_data, 0, 0, 0, MEM_AFFINITY_NOFLAGS); } + build_srat_generic_pci_initiator(table_data); + /* * Entry is required for Windows to enable memory hotplug in OS * and for Linux to enable SWIOTLB when booted with less than From 294ac5fef3aa78111de07357734285744103f47c Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 7 Mar 2024 14:43:03 +0100 Subject: [PATCH 57/68] virtio-iommu: Add a granule property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows to choose which granule will be used by default by the virtio-iommu. Current page size mask default is qemu_target_page_mask so this translates into a 4k granule on ARM and x86_64 where virtio-iommu is supported. Signed-off-by: Eric Auger Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Zhenzhong Duan Message-Id: <20240307134445.92296-3-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-iommu.c | 28 +++++++++++++++++++++++++--- include/hw/virtio/virtio-iommu.h | 2 ++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 86623d55a5..84d6819d3b 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -29,6 +29,7 @@ #include "sysemu/reset.h" #include "sysemu/sysemu.h" #include "qemu/reserved-region.h" +#include "qemu/units.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "trace.h" @@ -1115,8 +1116,8 @@ static int virtio_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr, } /* - * The default mask (TARGET_PAGE_MASK) is the smallest supported guest granule, - * for example 0xfffffffffffff000. When an assigned device has page size + * The default mask depends on the "granule" property. For example, with + * 4k granule, it is -(4 * KiB). When an assigned device has page size * restrictions due to the hardware IOMMU configuration, apply this restriction * to the mask. */ @@ -1313,8 +1314,27 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) * in vfio realize */ s->config.bypass = s->boot_bypass; - s->config.page_size_mask = qemu_target_page_mask(); s->config.input_range.end = UINT64_MAX; + + switch (s->granule_mode) { + case GRANULE_MODE_4K: + s->config.page_size_mask = -(4 * KiB); + break; + case GRANULE_MODE_8K: + s->config.page_size_mask = -(8 * KiB); + break; + case GRANULE_MODE_16K: + s->config.page_size_mask = -(16 * KiB); + break; + case GRANULE_MODE_64K: + s->config.page_size_mask = -(64 * KiB); + break; + case GRANULE_MODE_HOST: + s->config.page_size_mask = qemu_real_host_page_mask(); + break; + default: + error_setg(errp, "Unsupported granule mode"); + } s->config.domain_range.end = UINT32_MAX; s->config.probe_size = VIOMMU_PROBE_SIZE; @@ -1522,6 +1542,8 @@ static Property virtio_iommu_properties[] = { DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus, TYPE_PCI_BUS, PCIBus *), DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true), + DEFINE_PROP_GRANULE_MODE("granule", VirtIOIOMMU, granule_mode, + GRANULE_MODE_4K), DEFINE_PROP_END_OF_LIST(), }; diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h index 781ebaea8f..67ea5022af 100644 --- a/include/hw/virtio/virtio-iommu.h +++ b/include/hw/virtio/virtio-iommu.h @@ -24,6 +24,7 @@ #include "hw/virtio/virtio.h" #include "hw/pci/pci.h" #include "qom/object.h" +#include "qapi/qapi-types-virtio.h" #define TYPE_VIRTIO_IOMMU "virtio-iommu-device" #define TYPE_VIRTIO_IOMMU_PCI "virtio-iommu-pci" @@ -66,6 +67,7 @@ struct VirtIOIOMMU { bool boot_bypass; Notifier machine_done; bool granule_frozen; + GranuleMode granule_mode; }; #endif From 9dd5e808fc17ac92180965178a6e867c1e96ac57 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 7 Mar 2024 14:43:04 +0100 Subject: [PATCH 58/68] virtio-iommu: Change the default granule to the host page size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We used to set the default granule to 4KB but with VFIO assignment it makes more sense to use the actual host page size. Indeed when hotplugging a VFIO device protected by a virtio-iommu on a 64kB/64kB host/guest config, we current get a qemu crash: "vfio: DMA mapping failed, unable to continue" This is due to the hot-attached VFIO device calling memory_region_iommu_set_page_size_mask() with 64kB granule whereas the virtio-iommu granule was already frozen to 4KB on machine init done. Set the granule property to "host" and introduce a new compat. The page size mask used before 9.0 was qemu_target_page_mask(). Since the virtio-iommu currently only supports x86_64 and aarch64, this matched a 4KB granule. Note that the new default will prevent 4kB guest on 64kB host because the granule will be set to 64kB which would be larger than the guest page size. In that situation, the virtio-iommu driver fails on viommu_domain_finalise() with "granule 0x10000 larger than system page size 0x1000". In that case the workaround is to request 4K granule. The current limitation of global granule in the virtio-iommu should be removed and turned into per domain granule. But until we get this upgraded, this new default is probably better because I don't think anyone is currently interested in running a 4KB page size guest with virtio-iommu on a 64KB host. However supporting 64kB guest on 64kB host with virtio-iommu and VFIO looks a more important feature. Signed-off-by: Eric Auger Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Zhenzhong Duan Message-Id: <20240307134445.92296-4-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/core/machine.c | 5 ++++- hw/virtio/virtio-iommu.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 9ac5d5389a..6bd09d4592 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -30,9 +30,12 @@ #include "exec/confidential-guest-support.h" #include "hw/virtio/virtio-pci.h" #include "hw/virtio/virtio-net.h" +#include "hw/virtio/virtio-iommu.h" #include "audio/audio.h" -GlobalProperty hw_compat_8_2[] = {}; +GlobalProperty hw_compat_8_2[] = { + { TYPE_VIRTIO_IOMMU_PCI, "granule", "4k" }, +}; const size_t hw_compat_8_2_len = G_N_ELEMENTS(hw_compat_8_2); GlobalProperty hw_compat_8_1[] = { diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 84d6819d3b..aab97e1527 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -1543,7 +1543,7 @@ static Property virtio_iommu_properties[] = { TYPE_PCI_BUS, PCIBus *), DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true), DEFINE_PROP_GRANULE_MODE("granule", VirtIOIOMMU, granule_mode, - GRANULE_MODE_4K), + GRANULE_MODE_HOST), DEFINE_PROP_END_OF_LIST(), }; From 695012903f99d42c03a5bc87fe24f591b6bf7153 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 7 Mar 2024 14:43:05 +0100 Subject: [PATCH 59/68] qemu-options.hx: Document the virtio-iommu-pci granule option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are missing an entry for the virtio-iommu-pci device. Add the information on which machine it is currently supported and document the new granule option. Signed-off-by: Eric Auger Message-Id: <20240307134445.92296-5-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Philippe Mathieu-Daudé --- qemu-options.hx | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/qemu-options.hx b/qemu-options.hx index 937fd7ed84..c6f3d2e76d 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -1172,6 +1172,14 @@ SRST Please also refer to the wiki page for general scenarios of VT-d emulation in QEMU: https://wiki.qemu.org/Features/VT-d. +``-device virtio-iommu-pci[,option=...]`` + This is only supported by ``-machine q35`` (x86_64) and ``-machine virt`` (ARM). + It supports below options: + + ``granule=val`` (possible values are 4k, 8k, 16k, 64k and host; default: host) + This decides the default granule to be be exposed by the + virtio-iommu. If host, the granule matches the host page size. + ERST DEF("name", HAS_ARG, QEMU_OPTION_name, From fdda908f945bcd985be4ebd2a06d00719bc15e93 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 7 Mar 2024 14:43:06 +0100 Subject: [PATCH 60/68] virtio-iommu: Trace domain range limits as unsigned int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use %u format to trace domain_range limits. Signed-off-by: Eric Auger Reviewed-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Message-Id: <20240307134445.92296-6-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/trace-events | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 9df24864a2..13b6991179 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -112,7 +112,7 @@ virtio_iommu_device_reset(void) "reset!" virtio_iommu_system_reset(void) "system reset!" virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64 virtio_iommu_device_status(uint8_t status) "driver status = %d" -virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_start, uint32_t domain_end, uint32_t probe_size, uint8_t bypass) "page_size_mask=0x%"PRIx64" input range start=0x%"PRIx64" input range end=0x%"PRIx64" domain range start=%d domain range end=%d probe_size=0x%x bypass=0x%x" +virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_start, uint32_t domain_end, uint32_t probe_size, uint8_t bypass) "page_size_mask=0x%"PRIx64" input range start=0x%"PRIx64" input range end=0x%"PRIx64" domain range start=%u domain range end=%u probe_size=0x%x bypass=0x%x" virtio_iommu_set_config(uint8_t bypass) "bypass=0x%x" virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" From 01e7e4921ccebb81cebc69eb648040a57be4f5ff Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 7 Mar 2024 14:43:07 +0100 Subject: [PATCH 61/68] virtio-iommu: Add an option to define the input range width MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit aw-bits is a new option that allows to set the bit width of the input address range. This value will be used as a default for the device config input_range.end. By default it is set to 64 bits which is the current value. Signed-off-by: Eric Auger Reviewed-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Message-Id: <20240307134445.92296-7-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-iommu.c | 8 +++++++- include/hw/virtio/virtio-iommu.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index aab97e1527..1326c6ec41 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -1314,7 +1314,12 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) * in vfio realize */ s->config.bypass = s->boot_bypass; - s->config.input_range.end = UINT64_MAX; + if (s->aw_bits < 32 || s->aw_bits > 64) { + error_setg(errp, "aw-bits must be within [32,64]"); + return; + } + s->config.input_range.end = + s->aw_bits == 64 ? UINT64_MAX : BIT_ULL(s->aw_bits) - 1; switch (s->granule_mode) { case GRANULE_MODE_4K: @@ -1544,6 +1549,7 @@ static Property virtio_iommu_properties[] = { DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true), DEFINE_PROP_GRANULE_MODE("granule", VirtIOIOMMU, granule_mode, GRANULE_MODE_HOST), + DEFINE_PROP_UINT8("aw-bits", VirtIOIOMMU, aw_bits, 64), DEFINE_PROP_END_OF_LIST(), }; diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h index 67ea5022af..83a52cc446 100644 --- a/include/hw/virtio/virtio-iommu.h +++ b/include/hw/virtio/virtio-iommu.h @@ -68,6 +68,7 @@ struct VirtIOIOMMU { Notifier machine_done; bool granule_frozen; GranuleMode granule_mode; + uint8_t aw_bits; }; #endif From 9b588be373ad01e7ce09e25f69f66b811af0b799 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 7 Mar 2024 14:43:08 +0100 Subject: [PATCH 62/68] hw/i386/q35: Set virtio-iommu aw-bits default value to 39 Currently the default input range can extend to 64 bits. On x86, when the virtio-iommu protects vfio devices, the physical iommu may support only 39 bits. Let's set the default to 39, as done for the intel-iommu. We use hw_compat_8_2 to handle the compatibility for machines before 9.0 which used to have a virtio-iommu default input range of 64 bits. Of course if aw-bits is set from the command line, the default is overriden. Signed-off-by: Eric Auger Message-Id: <20240307134445.92296-8-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Zhenzhong Duan --- hw/core/machine.c | 1 + hw/i386/pc_q35.c | 9 +++++++++ tests/qtest/virtio-iommu-test.c | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 6bd09d4592..4b89172d1c 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -35,6 +35,7 @@ GlobalProperty hw_compat_8_2[] = { { TYPE_VIRTIO_IOMMU_PCI, "granule", "4k" }, + { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "64" }, }; const size_t hw_compat_8_2_len = G_N_ELEMENTS(hw_compat_8_2); diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 1356c5d107..8a427c4647 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -45,6 +45,7 @@ #include "hw/i386/pc.h" #include "hw/i386/amd_iommu.h" #include "hw/i386/intel_iommu.h" +#include "hw/virtio/virtio-iommu.h" #include "hw/display/ramfb.h" #include "hw/ide/pci.h" #include "hw/ide/ahci-pci.h" @@ -63,6 +64,12 @@ /* ICH9 AHCI has 6 ports */ #define MAX_SATA_PORTS 6 +static GlobalProperty pc_q35_compat_defaults[] = { + { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "39" }, +}; +static const size_t pc_q35_compat_defaults_len = + G_N_ELEMENTS(pc_q35_compat_defaults); + struct ehci_companions { const char *name; int func; @@ -354,6 +361,8 @@ static void pc_q35_machine_options(MachineClass *m) machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE); machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE); machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE); + compat_props_add(m->compat_props, + pc_q35_compat_defaults, pc_q35_compat_defaults_len); } static void pc_q35_9_0_machine_options(MachineClass *m) diff --git a/tests/qtest/virtio-iommu-test.c b/tests/qtest/virtio-iommu-test.c index 068e7a9e6c..afb225971d 100644 --- a/tests/qtest/virtio-iommu-test.c +++ b/tests/qtest/virtio-iommu-test.c @@ -34,7 +34,7 @@ static void pci_config(void *obj, void *data, QGuestAllocator *t_alloc) uint8_t bypass = qvirtio_config_readb(dev, 36); g_assert_cmpint(input_range_start, ==, 0); - g_assert_cmphex(input_range_end, ==, UINT64_MAX); + g_assert_cmphex(input_range_end, >=, UINT32_MAX); g_assert_cmpint(domain_range_start, ==, 0); g_assert_cmpint(domain_range_end, ==, UINT32_MAX); g_assert_cmpint(bypass, ==, 1); From 62d776002c990d5b6fd4a2b6809ab2956c6e1ff0 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 7 Mar 2024 14:43:09 +0100 Subject: [PATCH 63/68] hw/arm/virt: Set virtio-iommu aw-bits default value to 48 On ARM we set 48b as a default (matching SMMUv3 SMMU_IDR5.VAX == 0). hw_compat_8_2 is used to handle the compatibility for machine types before 9.0 (default was 64 bits). Signed-off-by: Eric Auger Reviewed-by: Zhenzhong Duan Message-Id: <20240307134445.92296-9-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/arm/virt.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 0af1943697..e5cd935232 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -85,11 +85,28 @@ #include "hw/char/pl011.h" #include "qemu/guest-random.h" +static GlobalProperty arm_virt_compat[] = { + { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "48" }, +}; +static const size_t arm_virt_compat_len = G_N_ELEMENTS(arm_virt_compat); + +/* + * This cannot be called from the virt_machine_class_init() because + * TYPE_VIRT_MACHINE is abstract and mc->compat_props g_ptr_array_new() + * only is called on virt non abstract class init. + */ +static void arm_virt_compat_set(MachineClass *mc) +{ + compat_props_add(mc->compat_props, arm_virt_compat, + arm_virt_compat_len); +} + #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ void *data) \ { \ MachineClass *mc = MACHINE_CLASS(oc); \ + arm_virt_compat_set(mc); \ virt_machine_##major##_##minor##_options(mc); \ mc->desc = "QEMU " # major "." # minor " ARM Virtual Machine"; \ if (latest) { \ From f7ada75b3f7dd1369b10bac7f8297831c3a80967 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 7 Mar 2024 14:43:10 +0100 Subject: [PATCH 64/68] qemu-options.hx: Document the virtio-iommu-pci aw-bits option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the new aw-bits option. Signed-off-by: Eric Auger Message-Id: <20240307134445.92296-10-eric.auger@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Philippe Mathieu-Daudé --- qemu-options.hx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/qemu-options.hx b/qemu-options.hx index c6f3d2e76d..7fd1713fa8 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -1180,6 +1180,9 @@ SRST This decides the default granule to be be exposed by the virtio-iommu. If host, the granule matches the host page size. + ``aw-bits=val`` (val between 32 and 64, default depends on machine) + This decides the address width of the IOVA address space. + ERST DEF("name", HAS_ARG, QEMU_OPTION_name, From 2eb6672cfdaea7dacd8e9bb0523887f13b9f85ce Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Thu, 7 Mar 2024 16:03:25 +0000 Subject: [PATCH 65/68] hmat acpi: Do not add Memory Proximity Domain Attributes Structure targetting non existent memory. If qemu is started with a proximity node containing CPUs alone, it will provide one of these structures to say memory in this node is directly connected to itself. This description is arguably pointless even if there is memory in the node. If there is no memory present, and hence no SRAT entry it breaks Linux HMAT passing and the table is rejected. https://elixir.bootlin.com/linux/v6.7/source/drivers/acpi/numa/hmat.c#L444 Signed-off-by: Jonathan Cameron Message-Id: <20240307160326.31570-2-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/hmat.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c index 2242981e18..8ea240878a 100644 --- a/hw/acpi/hmat.c +++ b/hw/acpi/hmat.c @@ -204,6 +204,13 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) build_append_int_noprefix(table_data, 0, 4); /* Reserved */ for (i = 0; i < numa_state->num_nodes; i++) { + /* + * Linux rejects whole HMAT table if a node with no memory + * has one of these structures listing it as a target. + */ + if (!numa_state->nodes[i].node_mem) { + continue; + } flags = 0; if (numa_state->nodes[i].initiator < MAX_NODES) { From 74e2845c5f95b0c139c79233ddb65bb17f2dd679 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Thu, 7 Mar 2024 16:03:26 +0000 Subject: [PATCH 66/68] hmat acpi: Fix out of bounds access due to missing use of indirection With a numa set up such as -numa nodeid=0,cpus=0 \ -numa nodeid=1,memdev=mem \ -numa nodeid=2,cpus=1 and appropriate hmat_lb entries the initiator list is correctly computed and writen to HMAT as 0,2 but then the LB data is accessed using the node id (here 2), landing outside the entry_list array. Stash the reverse lookup when writing the initiator list and use it to get the correct array index index. Fixes: 4586a2cb83 ("hmat acpi: Build System Locality Latency and Bandwidth Information Structure(s)") Signed-off-by: Jonathan Cameron Message-Id: <20240307160326.31570-3-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/hmat.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c index 8ea240878a..9b1662b6b8 100644 --- a/hw/acpi/hmat.c +++ b/hw/acpi/hmat.c @@ -78,6 +78,7 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, uint32_t *initiator_list) { int i, index; + uint32_t initiator_to_index[MAX_NODES] = {}; HMAT_LB_Data *lb_data; uint16_t *entry_list; uint32_t base; @@ -121,6 +122,8 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, /* Initiator Proximity Domain List */ for (i = 0; i < num_initiator; i++) { build_append_int_noprefix(table_data, initiator_list[i], 4); + /* Reverse mapping for array possitions */ + initiator_to_index[initiator_list[i]] = i; } /* Target Proximity Domain List */ @@ -132,7 +135,8 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, entry_list = g_new0(uint16_t, num_initiator * num_target); for (i = 0; i < hmat_lb->list->len; i++) { lb_data = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); - index = lb_data->initiator * num_target + lb_data->target; + index = initiator_to_index[lb_data->initiator] * num_target + + lb_data->target; entry_list[index] = (uint16_t)(lb_data->data / hmat_lb->base); } From bfc2f7a6caa6843e50019ee2511ad11cd5582711 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 8 Mar 2024 14:38:31 +0000 Subject: [PATCH 67/68] hw/cxl: Fix missing reserved data in CXL Device DVSEC The r3.1 specification introduced a new 2 byte field, but to maintain DWORD alignment, a additional 2 reserved bytes were added. Forgot those in updating the structure definition but did include them in the size define leading to a buffer overrun. Also use the define so that we don't duplicate the value. Fixes: Coverity ID 1534095 buffer overrun Fixes: 8700ee15de ("hw/cxl: Standardize all references on CXL r3.1 and minor updates") Reported-by: Peter Maydell Signed-off-by: Jonathan Cameron Message-Id: <20240308143831.6256-1-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/hw/cxl/cxl_pci.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/hw/cxl/cxl_pci.h b/include/hw/cxl/cxl_pci.h index 265db6c407..d0855ed78b 100644 --- a/include/hw/cxl/cxl_pci.h +++ b/include/hw/cxl/cxl_pci.h @@ -92,8 +92,9 @@ typedef struct CXLDVSECDevice { uint32_t range2_base_hi; uint32_t range2_base_lo; uint16_t cap3; + uint16_t resv; } QEMU_PACKED CXLDVSECDevice; -QEMU_BUILD_BUG_ON(sizeof(CXLDVSECDevice) != 0x3A); +QEMU_BUILD_BUG_ON(sizeof(CXLDVSECDevice) != PCIE_CXL_DEVICE_DVSEC_LENGTH); /* * CXL r3.1 Section 8.1.5: CXL Extensions DVSEC for Ports From 73279cecca03f7c2b4489c5fea994e7349eaafaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 10 Mar 2024 16:04:51 +0100 Subject: [PATCH 68/68] docs/specs/pvpanic: document shutdown event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shutdown requests are normally hardware dependent. By extending pvpanic to also handle shutdown requests, guests can submit such requests with an easily implementable and cross-platform mechanism. Signed-off-by: Thomas Weißschuh Message-Id: <20240310-pvpanic-shutdown-spec-v1-1-b258e182ce55@t-8ch.de> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- docs/specs/pvpanic.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/specs/pvpanic.rst b/docs/specs/pvpanic.rst index f894bc1955..61a80480ed 100644 --- a/docs/specs/pvpanic.rst +++ b/docs/specs/pvpanic.rst @@ -29,6 +29,8 @@ bit 1 a guest panic has happened and will be handled by the guest; the host should record it or report it, but should not affect the execution of the guest. +bit 2 + a regular guest shutdown has happened and should be processed by the host PCI Interface -------------