From d9e4070603b9727a8c33d9aa7d2aacf5eed0c0f7 Mon Sep 17 00:00:00 2001
From: Thomas Huth
Date: Mon, 25 Mar 2024 16:47:37 +0100
Subject: [PATCH 1/6] tests/qemu-iotests: Test 157 and 227 require virtio-blk

Tests 157 and 227 use the virtio-blk device, so we have to mark these
tests accordingly to be skipped if this device is not available (e.g.
when running the tests with qemu-system-avr only).

Signed-off-by: Thomas Huth
Message-ID: <20240325154737.1305063-1-thuth@redhat.com>
Reviewed-by: Kevin Wolf
Signed-off-by: Kevin Wolf
---
 tests/qemu-iotests/157 | 2 ++
 tests/qemu-iotests/227 | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tests/qemu-iotests/157 b/tests/qemu-iotests/157
index 0dc9ba68d2..aa2ebbfb4b 100755
--- a/tests/qemu-iotests/157
+++ b/tests/qemu-iotests/157
@@ -40,6 +40,8 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 _supported_fmt generic
 _supported_proto file
 
+_require_devices virtio-blk
+
 do_run_qemu()
 {
     (
diff --git a/tests/qemu-iotests/227 b/tests/qemu-iotests/227
index 7e45a47ac6..eddaad679e 100755
--- a/tests/qemu-iotests/227
+++ b/tests/qemu-iotests/227
@@ -40,6 +40,8 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 _supported_fmt generic
 _supported_proto file
 
+_require_devices virtio-blk
+
 do_run_qemu()
 {
     echo Testing: "$@"
From 2c66de61f88dc9620a32239f7dd61524a57f66b0 Mon Sep 17 00:00:00 2001
From: Kevin Wolf
Date: Fri, 15 Mar 2024 16:59:49 +0100
Subject: [PATCH 2/6] vdpa-dev: Fix initialisation order to restore VDUSE
 compatibility
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

VDUSE requires that virtqueues are first enabled before the DRIVER_OK
status flag is set; with the current API of the kernel module, it is
impossible to enable the opposite order in our block export code because
userspace is not notified when a virtqueue is enabled.

This requirement also matches the normal initialisation order as done by
the generic vhost code in QEMU. However, commit 6c482547 accidentally
changed the order for vdpa-dev and broke access to VDUSE devices with
this.

This changes vdpa-dev to use the normal order again and use the standard
vhost callback .vhost_set_vring_enable for this. VDUSE devices can be
used with vdpa-dev again after this fix.

vhost_net intentionally avoided enabling the vrings for vdpa and does
this manually later while it does enable them for other vhost backends.
Reflect this in the vhost_net code and return early for vdpa, so that
the behaviour doesn't change for this device.

Cc: qemu-stable@nongnu.org
Fixes: 6c4825476a43 ('vdpa: move vhost_vdpa_set_vring_ready to the caller')
Signed-off-by: Kevin Wolf
Message-ID: <20240315155949.86066-1-kwolf@redhat.com>
Reviewed-by: Eugenio Pérez
Reviewed-by: Stefano Garzarella
Signed-off-by: Kevin Wolf
---
 hw/net/vhost_net.c     | 10 ++++++++++
 hw/virtio/trace-events |  2 +-
 hw/virtio/vdpa-dev.c   |  5 +----
 hw/virtio/vhost-vdpa.c | 29 ++++++++++++++++++++++++++---
 hw/virtio/vhost.c      |  8 +++++++-
 5 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index e8e1661646..fd1a93701a 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -541,6 +541,16 @@ int vhost_set_vring_enable(NetClientState *nc, int enable)
     VHostNetState *net = get_vhost_net(nc);
     const VhostOps *vhost_ops = net->dev.vhost_ops;
 
+    /*
+     * vhost-vdpa network devices need to enable dataplane virtqueues after
+     * DRIVER_OK, so they can recover device state before starting dataplane.
+     * Because of that, we don't enable virtqueues here and leave it to
+     * net/vhost-vdpa.c.
+     */
+    if (nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
+        return 0;
+    }
+
     nc->vring_enable = enable;
 
     if (vhost_ops && vhost_ops->vhost_set_vring_enable) {
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 13b6991179..96632fd026 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -49,7 +49,7 @@ vhost_vdpa_set_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRI
 vhost_vdpa_get_device_id(void *dev, uint32_t device_id) "dev: %p device_id %"PRIu32
 vhost_vdpa_reset_device(void *dev) "dev: %p"
 vhost_vdpa_get_vq_index(void *dev, int idx, int vq_idx) "dev: %p idx: %d vq idx: %d"
-vhost_vdpa_set_vring_ready(void *dev, unsigned i, int r) "dev: %p, idx: %u, r: %d"
+vhost_vdpa_set_vring_enable_one(void *dev, unsigned i, int enable, int r) "dev: %p, idx: %u, enable: %u, r: %d"
 vhost_vdpa_dump_config(void *dev, const char *line) "dev: %p %s"
 vhost_vdpa_set_config(void *dev, uint32_t offset, uint32_t size, uint32_t flags) "dev: %p offset: %"PRIu32" size: %"PRIu32" flags: 0x%"PRIx32
 vhost_vdpa_get_config(void *dev, void *config, uint32_t config_len) "dev: %p config: %p config_len: %"PRIu32
diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
index eb9ecea83b..13e87f06f6 100644
--- a/hw/virtio/vdpa-dev.c
+++ b/hw/virtio/vdpa-dev.c
@@ -253,14 +253,11 @@ static int vhost_vdpa_device_start(VirtIODevice *vdev, Error **errp)
 
     s->dev.acked_features = vdev->guest_features;
 
-    ret = vhost_dev_start(&s->dev, vdev, false);
+    ret = vhost_dev_start(&s->dev, vdev, true);
     if (ret < 0) {
         error_setg_errno(errp, -ret, "Error starting vhost");
         goto err_guest_notifiers;
     }
-    for (i = 0; i < s->dev.nvqs; ++i) {
-        vhost_vdpa_set_vring_ready(&s->vdpa, i);
-    }
     s->started = true;
 
     /*
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 3bcd05cc22..e827b9175f 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -896,19 +896,41 @@ static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
     return idx;
 }
 
-int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
+static int vhost_vdpa_set_vring_enable_one(struct vhost_vdpa *v, unsigned idx,
+                                           int enable)
 {
     struct vhost_dev *dev = v->dev;
     struct vhost_vring_state state = {
         .index = idx,
-        .num = 1,
+        .num = enable,
     };
     int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
 
-    trace_vhost_vdpa_set_vring_ready(dev, idx, r);
+    trace_vhost_vdpa_set_vring_enable_one(dev, idx, enable, r);
     return r;
 }
 
+static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable)
+{
+    struct vhost_vdpa *v = dev->opaque;
+    unsigned int i;
+    int ret;
+
+    for (i = 0; i < dev->nvqs; ++i) {
+        ret = vhost_vdpa_set_vring_enable_one(v, i, enable);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
+{
+    return vhost_vdpa_set_vring_enable_one(v, idx, 1);
+}
+
 static int vhost_vdpa_set_config_call(struct vhost_dev *dev,
                                        int fd)
 {
@@ -1536,6 +1558,7 @@ const VhostOps vdpa_ops = {
         .vhost_set_features = vhost_vdpa_set_features,
         .vhost_reset_device = vhost_vdpa_reset_device,
         .vhost_get_vq_index = vhost_vdpa_get_vq_index,
+        .vhost_set_vring_enable = vhost_vdpa_set_vring_enable,
         .vhost_get_config  = vhost_vdpa_get_config,
         .vhost_set_config = vhost_vdpa_set_config,
         .vhost_requires_shm_log = NULL,
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 2e4e040db8..f50180e60e 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1984,7 +1984,13 @@ static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
     return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable);
 }
 
-/* Host notifiers must be enabled at this point. */
+/*
+ * Host notifiers must be enabled at this point.
+ *
+ * If @vrings is true, this function will enable all vrings before starting the
+ * device. If it is false, the vring initialization is left to be done by the
+ * caller.
+ */
 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
 {
     int i, r;
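
Note: the ordering requirement this fix restores is compact enough to show
in code. Below is a minimal sketch with simplified stand-in types, not
QEMU's real vhost structures; the set_driver_ok callback is hypothetical
(in QEMU the status write happens inside vhost_dev_start()):

    /* Simplified stand-in types; illustrative only, not QEMU code. */
    struct dev;

    struct ops {
        int (*set_vring_enable)(struct dev *d, int enable);
        int (*set_driver_ok)(struct dev *d); /* hypothetical status write */
    };

    /* VDUSE order: enable the virtqueues first, then announce DRIVER_OK. */
    static int start_in_vduse_order(struct dev *d, const struct ops *ops)
    {
        int r = ops->set_vring_enable(d, 1);
        if (r < 0) {
            return r;
        }
        return ops->set_driver_ok(d);
    }

Commit 6c482547 had effectively inverted this for vdpa-dev by setting
DRIVER_OK in vhost_dev_start() before the manual per-vring enable loop ran.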
From 3f934817c82c2f1bf1c238f8d1065a3be10a3c9e Mon Sep 17 00:00:00 2001
From: Stefan Reiter
Date: Fri, 22 Mar 2024 10:50:06 +0100
Subject: [PATCH 3/6] block/io: accept NULL qiov in bdrv_pad_request

Some operations, e.g. block-stream, perform reads while discarding the
results (only copy-on-read matters). In this case, they will pass NULL
as the target QEMUIOVector, which will however trip bdrv_pad_request,
since it wants to extend its passed vector. In particular, this is the
case for the blk_co_preadv() call in stream_populate().

If there is no qiov, no operation can be done with it, but the bytes
and offset still need to be updated, so the subsequent aligned read
will actually be aligned and not run into an assertion failure.

In particular, this can happen when the request alignment of the top
node is larger than the allocated part of the bottom node, in which
case padding becomes necessary. For example:

> ./qemu-img create /tmp/backing.qcow2 -f qcow2 64M -o cluster_size=32768
> ./qemu-io -c "write -P42 0x0 0x1" /tmp/backing.qcow2
> ./qemu-img create /tmp/top.qcow2 -f qcow2 64M -b /tmp/backing.qcow2 -F qcow2
> ./qemu-system-x86_64 --qmp stdio \
> --blockdev qcow2,node-name=node0,file.driver=file,file.filename=/tmp/top.qcow2 \
> <<EOF
> {"execute": "qmp_capabilities"}
> {"execute": "blockdev-add", "arguments": { "driver": "compress", "file": "node0", "node-name": "node1" } }
> {"execute": "block-stream", "arguments": { "job-id": "stream0", "device": "node1" } }
> EOF

Originally-by: Stefan Reiter
Signed-off-by: Thomas Lamprecht
[FE: do update bytes and offset in any case
     add reproducer to commit message]
Signed-off-by: Fiona Ebner
Message-ID: <20240322095009.346989-2-f.ebner@proxmox.com>
Reviewed-by: Kevin Wolf
Reviewed-by: Stefan Hajnoczi
Signed-off-by: Kevin Wolf
---
 block/io.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/block/io.c b/block/io.c
index 33150c0359..395bea3bac 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1726,22 +1726,29 @@ static int bdrv_pad_request(BlockDriverState *bs,
         return 0;
     }
 
-    sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
-                                  &sliced_head, &sliced_tail,
-                                  &sliced_niov);
+    /*
+     * For prefetching in stream_populate(), no qiov is passed along, because
+     * only copy-on-read matters.
+     */
+    if (qiov && *qiov) {
+        sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
+                                      &sliced_head, &sliced_tail,
+                                      &sliced_niov);
 
-    /* Guaranteed by bdrv_check_request32() */
-    assert(*bytes <= SIZE_MAX);
-    ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
-                                  sliced_head, *bytes);
-    if (ret < 0) {
-        bdrv_padding_finalize(pad);
-        return ret;
+        /* Guaranteed by bdrv_check_request32() */
+        assert(*bytes <= SIZE_MAX);
+        ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
+                                      sliced_head, *bytes);
+        if (ret < 0) {
+            bdrv_padding_finalize(pad);
+            return ret;
+        }
+        *qiov = &pad->local_qiov;
+        *qiov_offset = 0;
     }
+
     *bytes += pad->head + pad->tail;
     *offset -= pad->head;
-    *qiov = &pad->local_qiov;
-    *qiov_offset = 0;
 
     if (padded) {
         *padded = true;
     }
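
Note: the bytes/offset update that must happen even without a qiov is plain
alignment arithmetic. As a self-contained sketch (a hypothetical helper,
not the actual bdrv_pad_request()), the computation looks like this:

    #include <assert.h>
    #include <stdint.h>

    /*
     * Grow the range [*offset, *offset + *bytes) so that both ends fall on
     * an `align` boundary -- the head/tail padding bdrv_pad_request()
     * computes. Hypothetical stand-alone helper for illustration.
     */
    static void pad_to_alignment(uint64_t *offset, uint64_t *bytes,
                                 uint64_t align)
    {
        uint64_t head, tail;

        assert(align > 0);
        head = *offset % align;                /* unaligned start remainder */
        tail = (*offset + *bytes) % align;     /* unaligned end remainder */

        *offset -= head;
        *bytes += head + (tail ? align - tail : 0);
    }

For example, a 1-byte read at offset 0x0 against a 64 KiB request alignment
grows to the aligned range [0, 65536), with the data read into padding
buffers only.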
From f6d38c9f6dae6fce99dcaf6ca16a1fe5b5e19c4c Mon Sep 17 00:00:00 2001
From: Fiona Ebner
Date: Fri, 22 Mar 2024 10:50:07 +0100
Subject: [PATCH 4/6] block-backend: fix edge case in bdrv_next() where BDS
 associated to BB changes

The old_bs variable in bdrv_next() is currently determined by looking
at the old block backend. However, if the block graph changes before
the next bdrv_next() call, it might be that the associated BDS is not
the same that was referenced previously. In that case, the wrong BDS
is unreferenced, leading to an assertion failure later:

> bdrv_unref: Assertion `bs->refcnt > 0' failed.

In particular, this can happen in the context of bdrv_flush_all(),
when polling for bdrv_co_flush() in the generated co-wrapper leads to
a graph change (for example with a stream block job [0]).

A racy reproducer:

> #!/bin/bash
> rm -f /tmp/backing.qcow2
> rm -f /tmp/top.qcow2
> ./qemu-img create /tmp/backing.qcow2 -f qcow2 64M
> ./qemu-io -c "write -P42 0x0 0x1" /tmp/backing.qcow2
> ./qemu-img create /tmp/top.qcow2 -f qcow2 64M -b /tmp/backing.qcow2 -F qcow2
> ./qemu-system-x86_64 --qmp stdio \
> --blockdev qcow2,node-name=node0,file.driver=file,file.filename=/tmp/top.qcow2 \
> <<EOF
> {"execute": "qmp_capabilities"}
> {"execute": "block-stream", "arguments": { "job-id": "stream0", "device": "node0" } }
> {"execute": "quit"}
> EOF

[0]:

> #0  bdrv_replace_child_tran (child=..., new_bs=..., tran=...)
> #1  bdrv_replace_node_noperm (from=..., to=..., auto_skip=..., tran=..., errp=...)
> #2  bdrv_replace_node_common (from=..., to=..., auto_skip=..., detach_subchain=..., errp=...)
> #3  bdrv_drop_filter (bs=..., errp=...)
> #4  bdrv_cor_filter_drop (cor_filter_bs=...)
> #5  stream_prepare (job=...)
> #6  job_prepare_locked (job=...)
> #7  job_txn_apply_locked (fn=..., job=...)
> #8  job_do_finalize_locked (job=...)
> #9  job_exit (opaque=...)
> #10 aio_bh_poll (ctx=...)
> #11 aio_poll (ctx=..., blocking=...)
> #12 bdrv_poll_co (s=...)
> #13 bdrv_flush (bs=...)
> #14 bdrv_flush_all ()
> #15 do_vm_stop (state=..., send_stop=...)
> #16 vm_shutdown ()

Signed-off-by: Fiona Ebner
Message-ID: <20240322095009.346989-3-f.ebner@proxmox.com>
Reviewed-by: Kevin Wolf
Reviewed-by: Stefan Hajnoczi
Signed-off-by: Kevin Wolf
---
 block/block-backend.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 9c4de79e6b..28af1eb17a 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -599,14 +599,14 @@ BlockDriverState *bdrv_next(BdrvNextIterator *it)
     /* Must be called from the main loop */
     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 
+    old_bs = it->bs;
+
     /* First, return all root nodes of BlockBackends. In order to avoid
      * returning a BDS twice when multiple BBs refer to it, we only return it
      * if the BB is the first one in the parent list of the BDS. */
     if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
         BlockBackend *old_blk = it->blk;
 
-        old_bs = old_blk ? blk_bs(old_blk) : NULL;
-
         do {
             it->blk = blk_all_next(it->blk);
             bs = it->blk ? blk_bs(it->blk) : NULL;
@@ -620,11 +620,10 @@ BlockDriverState *bdrv_next(BdrvNextIterator *it)
         if (bs) {
             bdrv_ref(bs);
             bdrv_unref(old_bs);
+            it->bs = bs;
             return bs;
         }
         it->phase = BDRV_NEXT_MONITOR_OWNED;
-    } else {
-        old_bs = it->bs;
     }
 
     /* Then return the monitor-owned BDSes without a BB attached. Ignore all
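
Note: stripped to its essence, the fix applies a generic rule for iterators
that take references: remember exactly which object you referenced and drop
that one later, instead of recomputing it through a mapping that may have
changed in the meantime. A minimal sketch of the pattern, with simplified
types that are not the block layer's:

    struct node {
        int refcnt;
    };

    struct iter {
        struct node *referenced; /* exactly the node we took a ref on */
    };

    static void node_ref(struct node *n)   { if (n) { n->refcnt++; } }
    static void node_unref(struct node *n) { if (n) { n->refcnt--; } }

    /*
     * Advance: reference the new node and drop the one recorded in the
     * iterator -- never a value recomputed from the (possibly changed)
     * graph, which is what the old blk_bs(old_blk) lookup amounted to.
     */
    static struct node *iter_advance(struct iter *it, struct node *next)
    {
        node_ref(next);
        node_unref(it->referenced);
        it->referenced = next;
        return next;
    }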
From bac09b093ebbb79e6a7444c7b979c32ca5540132 Mon Sep 17 00:00:00 2001
From: Fiona Ebner
Date: Fri, 22 Mar 2024 10:50:08 +0100
Subject: [PATCH 5/6] block-backend: fix edge case in bdrv_next_cleanup()
 where BDS associated to BB changes

Same rationale as for commit "block-backend: fix edge case in
bdrv_next() where BDS associated to BB changes". The block graph might
change between the bdrv_next() call and the bdrv_next_cleanup() call,
so the associated BDS might no longer be the same one that was
referenced previously. Instead, rely on bdrv_next() to set it->bs to
the BDS it referenced and unreference that one in any case.

Signed-off-by: Fiona Ebner
Message-ID: <20240322095009.346989-4-f.ebner@proxmox.com>
Reviewed-by: Kevin Wolf
Reviewed-by: Stefan Hajnoczi
Signed-off-by: Kevin Wolf
---
 block/block-backend.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 28af1eb17a..db6f9b92a3 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -663,13 +663,10 @@ void bdrv_next_cleanup(BdrvNextIterator *it)
     /* Must be called from the main loop */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 
-    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
-        if (it->blk) {
-            bdrv_unref(blk_bs(it->blk));
-            blk_unref(it->blk);
-        }
-    } else {
-        bdrv_unref(it->bs);
+    bdrv_unref(it->bs);
+
+    if (it->phase == BDRV_NEXT_BACKEND_ROOTS && it->blk) {
+        blk_unref(it->blk);
     }
 
     bdrv_next_reset(it);
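
Note: bdrv_next_cleanup() is what callers use when they leave the iteration
early, so it has to be safe against graph changes just like bdrv_next()
itself. A sketch of the typical loop shape follows; bdrv_first(),
bdrv_next() and bdrv_next_cleanup() are the real QEMU block-layer API,
while wants_bs() and handle() are hypothetical:

    static void stop_at_first_match(void)
    {
        BdrvNextIterator it;
        BlockDriverState *bs;

        for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
            if (wants_bs(bs)) {
                handle(bs);
                /* Drop the reference the iterator still holds. */
                bdrv_next_cleanup(&it);
                break;
            }
        }
    }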
From 12d7b3bbd3333cededd3b695501d8d247239d769 Mon Sep 17 00:00:00 2001
From: Fiona Ebner
Date: Fri, 22 Mar 2024 10:50:09 +0100
Subject: [PATCH 6/6] iotests: add test for stream job with an unaligned
 prefetch read

Previously, bdrv_pad_request() could not deal with a NULL qiov when a
read needed to be aligned. During prefetch, a stream job will pass a
NULL qiov. Add a test case to cover this scenario.

By accident, also covers a previous race during shutdown, where block
graph changes during iteration in bdrv_flush_all() could lead to
unreferencing the wrong block driver state and an assertion failure
later.

Signed-off-by: Fiona Ebner
Message-ID: <20240322095009.346989-5-f.ebner@proxmox.com>
Reviewed-by: Kevin Wolf
Reviewed-by: Stefan Hajnoczi
Signed-off-by: Kevin Wolf
---
 .../tests/stream-unaligned-prefetch          | 86 +++++++++++++++++++
 .../tests/stream-unaligned-prefetch.out      |  5 ++
 2 files changed, 91 insertions(+)
 create mode 100755 tests/qemu-iotests/tests/stream-unaligned-prefetch
 create mode 100644 tests/qemu-iotests/tests/stream-unaligned-prefetch.out

diff --git a/tests/qemu-iotests/tests/stream-unaligned-prefetch b/tests/qemu-iotests/tests/stream-unaligned-prefetch
new file mode 100755
index 0000000000..546db1d369
--- /dev/null
+++ b/tests/qemu-iotests/tests/stream-unaligned-prefetch
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+# group: rw quick
+#
+# Test what happens when a stream job does an unaligned prefetch read
+# which requires padding while having a NULL qiov.
+#
+# Copyright (C) Proxmox Server Solutions GmbH
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import os
+import iotests
+from iotests import imgfmt, qemu_img_create, qemu_io, QMPTestCase
+
+image_size = 1 * 1024 * 1024
+cluster_size = 64 * 1024
+base = os.path.join(iotests.test_dir, 'base.img')
+top = os.path.join(iotests.test_dir, 'top.img')
+
+class TestStreamUnalignedPrefetch(QMPTestCase):
+    def setUp(self) -> None:
+        """
+        Create two images:
+        - base image {base} with {cluster_size // 2} bytes allocated
+        - top image {top} without any data allocated and coarser
+          cluster size
+
+        Attach a compress filter for the top image, because that
+        requires that the request alignment is the top image's cluster
+        size.
+        """
+        qemu_img_create('-f', imgfmt,
+                        '-o', 'cluster_size={}'.format(cluster_size // 2),
+                        base, str(image_size))
+        qemu_io('-c', f'write 0 {cluster_size // 2}', base)
+        qemu_img_create('-f', imgfmt,
+                        '-o', 'cluster_size={}'.format(cluster_size),
+                        top, str(image_size))
+
+        self.vm = iotests.VM()
+        self.vm.add_blockdev(self.vm.qmp_to_opts({
+            'driver': imgfmt,
+            'node-name': 'base',
+            'file': {
+                'driver': 'file',
+                'filename': base
+            }
+        }))
+        self.vm.add_blockdev(self.vm.qmp_to_opts({
+            'driver': 'compress',
+            'node-name': 'compress-top',
+            'file': {
+                'driver': imgfmt,
+                'node-name': 'top',
+                'file': {
+                    'driver': 'file',
+                    'filename': top
+                },
+                'backing': 'base'
+            }
+        }))
+        self.vm.launch()
+
+    def tearDown(self) -> None:
+        self.vm.shutdown()
+        os.remove(top)
+        os.remove(base)
+
+    def test_stream_unaligned_prefetch(self) -> None:
+        self.vm.cmd('block-stream', job_id='stream', device='compress-top')
+
+
+if __name__ == '__main__':
+    iotests.main(supported_fmts=['qcow2'], supported_protocols=['file'])
diff --git a/tests/qemu-iotests/tests/stream-unaligned-prefetch.out b/tests/qemu-iotests/tests/stream-unaligned-prefetch.out
new file mode 100644
index 0000000000..ae1213e6f8
--- /dev/null
+++ b/tests/qemu-iotests/tests/stream-unaligned-prefetch.out
@@ -0,0 +1,5 @@
+.
+----------------------------------------------------------------------
+Ran 1 tests

+OK