From 4be6a6d118123340f16fb8b3bf45220d0f8ed6d4 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 29 Jun 2018 18:01:31 +0200 Subject: [PATCH 01/24] block: Poll after drain on attaching a node Commit dcf94a23b1 ('block: Don't poll in parent drain callbacks') removed polling in bdrv_child_cb_drained_begin() on the grounds that the original bdrv_drain() already will poll and BdrvChildRole.drained_begin calls must not cause graph changes (and therefore must not call aio_poll() or the recursion through the graph will break. This reasoning is correct for calls through bdrv_do_drained_begin(). However, BdrvChildRole.drained_begin is also called when a node that is already in a drained section (i.e. bdrv_do_drained_begin() has already returned and therefore can't poll any more) is attached to a new parent. In this case, we must explicitly poll to have all requests completed before the drained new child can be attached to the parent. In bdrv_replace_child_noperm(), we know that we're not inside the recursion of bdrv_do_drained_begin() because graph changes are not allowed there, and bdrv_replace_child_noperm() is a graph change. The call of BdrvChildRole.drained_begin() must therefore be followed by a BDRV_POLL_WHILE() that waits for the completion of requests. Reported-by: Max Reitz Signed-off-by: Kevin Wolf --- block.c | 2 +- block/io.c | 26 ++++++++++++++++++++------ include/block/block.h | 8 ++++++++ include/block/block_int.h | 3 +++ 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/block.c b/block.c index ac8b3a3511..a2fe05ea96 100644 --- a/block.c +++ b/block.c @@ -2066,7 +2066,7 @@ static void bdrv_replace_child_noperm(BdrvChild *child, } assert(num >= 0); for (i = 0; i < num; i++) { - child->role->drained_begin(child); + bdrv_parent_drained_begin_single(child, true); } } diff --git a/block/io.c b/block/io.c index 1a2272fad3..038449f81f 100644 --- a/block/io.c +++ b/block/io.c @@ -52,9 +52,7 @@ void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore, if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) { continue; } - if (c->role->drained_begin) { - c->role->drained_begin(c); - } + bdrv_parent_drained_begin_single(c, false); } } @@ -73,6 +71,14 @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore, } } +static bool bdrv_parent_drained_poll_single(BdrvChild *c) +{ + if (c->role->drained_poll) { + return c->role->drained_poll(c); + } + return false; +} + static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore, bool ignore_bds_parents) { @@ -83,14 +89,22 @@ static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore, if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) { continue; } - if (c->role->drained_poll) { - busy |= c->role->drained_poll(c); - } + busy |= bdrv_parent_drained_poll_single(c); } return busy; } +void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll) +{ + if (c->role->drained_begin) { + c->role->drained_begin(c); + } + if (poll) { + BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c)); + } +} + static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) { dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer); diff --git a/include/block/block.h b/include/block/block.h index bc76b1e59f..706ef009ad 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -568,6 +568,14 @@ void bdrv_io_unplug(BlockDriverState *bs); void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore, bool ignore_bds_parents); +/** + * bdrv_parent_drained_begin_single: + * + * Begin a quiesced section for the parent of @c. If @poll is true, wait for + * any pending activity to cease. + */ +void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll); + /** * bdrv_parent_drained_end: * diff --git a/include/block/block_int.h b/include/block/block_int.h index af71b414be..81cd3db7a9 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -606,6 +606,9 @@ struct BdrvChildRole { * requests after returning from .drained_begin() until .drained_end() is * called. * + * These functions must not change the graph (and therefore also must not + * call aio_poll(), which could change the graph indirectly). + * * Note that this can be nested. If drained_begin() was called twice, new * I/O is allowed only after drained_end() was called twice, too. */ From b994c5bc515fe611885113e7cfa7e87817bfd4e2 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 3 Jul 2018 17:55:16 +0200 Subject: [PATCH 02/24] test-bdrv-drain: Test bdrv_append() to drained node Signed-off-by: Kevin Wolf --- tests/test-bdrv-drain.c | 43 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c index 291a050f86..17bb8508ae 100644 --- a/tests/test-bdrv-drain.c +++ b/tests/test-bdrv-drain.c @@ -1250,6 +1250,47 @@ static void test_detach_by_driver_cb(void) test_detach_indirect(false); } +static void test_append_to_drained(void) +{ + BlockBackend *blk; + BlockDriverState *base, *overlay; + BDRVTestState *base_s, *overlay_s; + + blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL); + base = bdrv_new_open_driver(&bdrv_test, "base", BDRV_O_RDWR, &error_abort); + base_s = base->opaque; + blk_insert_bs(blk, base, &error_abort); + + overlay = bdrv_new_open_driver(&bdrv_test, "overlay", BDRV_O_RDWR, + &error_abort); + overlay_s = overlay->opaque; + + do_drain_begin(BDRV_DRAIN, base); + g_assert_cmpint(base->quiesce_counter, ==, 1); + g_assert_cmpint(base_s->drain_count, ==, 1); + g_assert_cmpint(base->in_flight, ==, 0); + + /* Takes ownership of overlay, so we don't have to unref it later */ + bdrv_append(overlay, base, &error_abort); + g_assert_cmpint(base->in_flight, ==, 0); + g_assert_cmpint(overlay->in_flight, ==, 0); + + g_assert_cmpint(base->quiesce_counter, ==, 1); + g_assert_cmpint(base_s->drain_count, ==, 1); + g_assert_cmpint(overlay->quiesce_counter, ==, 1); + g_assert_cmpint(overlay_s->drain_count, ==, 1); + + do_drain_end(BDRV_DRAIN, base); + + g_assert_cmpint(base->quiesce_counter, ==, 0); + g_assert_cmpint(base_s->drain_count, ==, 0); + g_assert_cmpint(overlay->quiesce_counter, ==, 0); + g_assert_cmpint(overlay_s->drain_count, ==, 0); + + bdrv_unref(base); + blk_unref(blk); +} + int main(int argc, char **argv) { int ret; @@ -1308,6 +1349,8 @@ int main(int argc, char **argv) g_test_add_func("/bdrv-drain/detach/parent_cb", test_detach_by_parent_cb); g_test_add_func("/bdrv-drain/detach/driver_cb", test_detach_by_driver_cb); + g_test_add_func("/bdrv-drain/attach/drain", test_append_to_drained); + ret = g_test_run(); qemu_event_destroy(&done_event); return ret; From b0ddcbbb36a66a605eb232b905cb49b1cc72e74e Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 6 Jul 2018 18:41:07 +0200 Subject: [PATCH 03/24] block: Fix copy-on-read crash with partial final cluster If the virtual disk size isn't aligned to full clusters, bdrv_co_do_copy_on_readv() may get pnum == 0 before having the full cluster completed, which will let it run into an assertion failure: qemu-io: block/io.c:1203: bdrv_co_do_copy_on_readv: Assertion `skip_bytes < pnum' failed. Check for EOF, assert that we read at least as much as the read request originally wanted to have (which is true at EOF because otherwise bdrv_check_byte_request() would already have returned an error) and return success early even though we couldn't copy the full cluster. Signed-off-by: Kevin Wolf --- block/io.c | 6 ++++++ tests/qemu-iotests/197 | 9 +++++++++ tests/qemu-iotests/197.out | 8 ++++++++ 3 files changed, 23 insertions(+) diff --git a/block/io.c b/block/io.c index 038449f81f..4c0831149c 100644 --- a/block/io.c +++ b/block/io.c @@ -1200,6 +1200,12 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, pnum = MIN(cluster_bytes, max_transfer); } + /* Stop at EOF if the image ends in the middle of the cluster */ + if (ret == 0 && pnum == 0) { + assert(progress >= bytes); + break; + } + assert(skip_bytes < pnum); if (ret <= 0) { diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197 index 3ae4975eec..0369aa5cff 100755 --- a/tests/qemu-iotests/197 +++ b/tests/qemu-iotests/197 @@ -109,6 +109,15 @@ $QEMU_IO -f qcow2 -c map "$TEST_WRAP" _check_test_img $QEMU_IMG compare -f $IMGFMT -F qcow2 "$TEST_IMG" "$TEST_WRAP" +echo +echo '=== Partial final cluster ===' +echo + +_make_test_img 1024 +$QEMU_IO -f $IMGFMT -C -c 'read 0 1024' "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -f $IMGFMT -c map "$TEST_IMG" +_check_test_img + # success, all done echo '*** done' status=0 diff --git a/tests/qemu-iotests/197.out b/tests/qemu-iotests/197.out index 52b4137d7b..8febda5dea 100644 --- a/tests/qemu-iotests/197.out +++ b/tests/qemu-iotests/197.out @@ -23,4 +23,12 @@ can't open device TEST_DIR/t.wrap.qcow2: Can't use copy-on-read on read-only dev 1023.938 MiB (0x3fff0000) bytes not allocated at offset 3 GiB (0xc0010000) No errors were found on the image. Images are identical. + +=== Partial final cluster === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1024 +read 1024/1024 bytes at offset 0 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +1 KiB (0x400) bytes allocated at offset 0 bytes (0x0) +No errors were found on the image. *** done From e79c4cd1909c05a2cab6517a9c00445bd2d880a6 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 17:38:02 +0800 Subject: [PATCH 04/24] iotests: 222: Don't run with luks Luks needs special parameters to operate the image. Since this test is focusing on image fleecing, skip skip that format. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- tests/qemu-iotests/222 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/qemu-iotests/222 b/tests/qemu-iotests/222 index ff3bfc1470..0ead56d574 100644 --- a/tests/qemu-iotests/222 +++ b/tests/qemu-iotests/222 @@ -25,6 +25,8 @@ import iotests from iotests import log, qemu_img, qemu_io, qemu_io_silent iotests.verify_platform(['linux']) +iotests.verify_image_format(supported_fmts=['qcow2', 'qcow', 'qed', 'vmdk', + 'vhdx', 'raw']) patterns = [("0x5d", "0", "64k"), ("0xd5", "1M", "64k"), From 999658a05e61a8d87b65827da665302bb44ce8c9 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 9 Jul 2018 19:37:16 +0300 Subject: [PATCH 05/24] block/io: fix copy_range Here two things are fixed: 1. Architecture On each recursion step, we go to the child of src or dst, only for one of them. So, it's wrong to create tracked requests for both on each step. It leads to tracked requests duplication. 2. Wait for serializing requests on write path independently of BDRV_REQ_NO_SERIALISING Before commit 9ded4a01149 "backup: Use copy offloading", BDRV_REQ_NO_SERIALISING was used for only one case: read in copy-on-write operation during backup. Also, the flag was handled only on read path (in bdrv_co_preadv and bdrv_aligned_preadv). After 9ded4a01149, flag is used for not waiting serializing operations on backup target (in same case of copy-on-write operation). This behavior change is unsubstantiated and potentially dangerous, let's drop it and add additional asserts and documentation. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 42 +++++++++++++++++++++++++++--------------- include/block/block.h | 12 ++++++++++++ 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/block/io.c b/block/io.c index 4c0831149c..3a321d69d3 100644 --- a/block/io.c +++ b/block/io.c @@ -1592,6 +1592,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), align); + /* BDRV_REQ_NO_SERIALISING is only for read operation */ + assert(!(flags & BDRV_REQ_NO_SERIALISING)); waited = wait_serialising_requests(req); assert(!waited || !req->serialising); assert(req->overlap_offset <= offset); @@ -2916,7 +2918,7 @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, BdrvRequestFlags flags, bool recurse_src) { - BdrvTrackedRequest src_req, dst_req; + BdrvTrackedRequest req; int ret; if (!dst || !dst->bs) { @@ -2943,32 +2945,42 @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, || src->bs->encrypted || dst->bs->encrypted) { return -ENOTSUP; } - bdrv_inc_in_flight(src->bs); - bdrv_inc_in_flight(dst->bs); - tracked_request_begin(&src_req, src->bs, src_offset, - bytes, BDRV_TRACKED_READ); - tracked_request_begin(&dst_req, dst->bs, dst_offset, - bytes, BDRV_TRACKED_WRITE); - if (!(flags & BDRV_REQ_NO_SERIALISING)) { - wait_serialising_requests(&src_req); - wait_serialising_requests(&dst_req); - } if (recurse_src) { + bdrv_inc_in_flight(src->bs); + tracked_request_begin(&req, src->bs, src_offset, bytes, + BDRV_TRACKED_READ); + + if (!(flags & BDRV_REQ_NO_SERIALISING)) { + wait_serialising_requests(&req); + } + ret = src->bs->drv->bdrv_co_copy_range_from(src->bs, src, src_offset, dst, dst_offset, bytes, flags); + + tracked_request_end(&req); + bdrv_dec_in_flight(src->bs); } else { + bdrv_inc_in_flight(dst->bs); + tracked_request_begin(&req, dst->bs, dst_offset, bytes, + BDRV_TRACKED_WRITE); + + /* BDRV_REQ_NO_SERIALISING is only for read operation, + * so we ignore it in flags. + */ + wait_serialising_requests(&req); + ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, src, src_offset, dst, dst_offset, bytes, flags); + + tracked_request_end(&req); + bdrv_dec_in_flight(dst->bs); } - tracked_request_end(&src_req); - tracked_request_end(&dst_req); - bdrv_dec_in_flight(src->bs); - bdrv_dec_in_flight(dst->bs); + return ret; } diff --git a/include/block/block.h b/include/block/block.h index 706ef009ad..f7ddff45b6 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -50,6 +50,18 @@ typedef enum { * opened with BDRV_O_UNMAP. */ BDRV_REQ_MAY_UNMAP = 0x4, + + /* + * The BDRV_REQ_NO_SERIALISING flag is only valid for reads and means that + * we don't want wait_serialising_requests() during the read operation. + * + * This flag is used for backup copy-on-write operations, when we need to + * read old data before write (write notifier triggered). It is okay since + * we already waited for other serializing requests in the initiating write + * (see bdrv_aligned_pwritev), and it is necessary if the initiating write + * is already serializing (without the flag, the read would deadlock + * waiting for the serialising write to complete). + */ BDRV_REQ_NO_SERIALISING = 0x8, BDRV_REQ_FUA = 0x10, BDRV_REQ_WRITE_COMPRESSED = 0x20, From 67b51fb998c697afb5d744066fcbde53e04fe941 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 9 Jul 2018 19:37:17 +0300 Subject: [PATCH 06/24] block: split flags in copy_range Pass read flags and write flags separately. This is needed to handle coming BDRV_REQ_NO_SERIALISING clearly in following patches. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/backup.c | 2 +- block/block-backend.c | 5 ++-- block/file-posix.c | 21 ++++++++++------ block/io.c | 46 ++++++++++++++++++---------------- block/iscsi.c | 9 ++++--- block/qcow2.c | 20 ++++++++------- block/raw-format.c | 24 ++++++++++++------ include/block/block.h | 3 ++- include/block/block_int.h | 14 ++++++++--- include/sysemu/block-backend.h | 3 ++- qemu-img.c | 2 +- 11 files changed, 90 insertions(+), 59 deletions(-) diff --git a/block/backup.c b/block/backup.c index 81895ddbe2..f3e4e814b6 100644 --- a/block/backup.c +++ b/block/backup.c @@ -163,7 +163,7 @@ static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job, hbitmap_reset(job->copy_bitmap, start / job->cluster_size, nr_clusters); ret = blk_co_copy_range(blk, start, job->target, start, nbytes, - is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0); + is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0, 0); if (ret < 0) { trace_backup_do_cow_copy_range_fail(job, start, ret); hbitmap_set(job->copy_bitmap, start / job->cluster_size, diff --git a/block/block-backend.c b/block/block-backend.c index 6b75bca317..ac8c3e0b1c 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2218,7 +2218,8 @@ void blk_unregister_buf(BlockBackend *blk, void *host) int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, BlockBackend *blk_out, int64_t off_out, - int bytes, BdrvRequestFlags flags) + int bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { int r; r = blk_check_byte_request(blk_in, off_in, bytes); @@ -2231,5 +2232,5 @@ int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, } return bdrv_co_copy_range(blk_in->root, off_in, blk_out->root, off_out, - bytes, flags); + bytes, read_flags, write_flags); } diff --git a/block/file-posix.c b/block/file-posix.c index 98987b80f1..4fec8cb53c 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2589,18 +2589,23 @@ static void raw_abort_perm_update(BlockDriverState *bs) raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL); } -static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs, - BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) +static int coroutine_fn raw_co_copy_range_from( + BlockDriverState *bs, BdrvChild *src, uint64_t src_offset, + BdrvChild *dst, uint64_t dst_offset, uint64_t bytes, + BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { - return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, flags); + return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, + read_flags, write_flags); } static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, - BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + BdrvChild *src, + uint64_t src_offset, + BdrvChild *dst, + uint64_t dst_offset, + uint64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { BDRVRawState *s = bs->opaque; BDRVRawState *src_s; diff --git a/block/io.c b/block/io.c index 3a321d69d3..75ab26fd58 100644 --- a/block/io.c +++ b/block/io.c @@ -2910,13 +2910,11 @@ void bdrv_unregister_buf(BlockDriverState *bs, void *host) } } -static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, - uint64_t src_offset, - BdrvChild *dst, - uint64_t dst_offset, - uint64_t bytes, - BdrvRequestFlags flags, - bool recurse_src) +static int coroutine_fn bdrv_co_copy_range_internal( + BdrvChild *src, uint64_t src_offset, BdrvChild *dst, + uint64_t dst_offset, uint64_t bytes, + BdrvRequestFlags read_flags, BdrvRequestFlags write_flags, + bool recurse_src) { BdrvTrackedRequest req; int ret; @@ -2928,8 +2926,8 @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, if (ret) { return ret; } - if (flags & BDRV_REQ_ZERO_WRITE) { - return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, flags); + if (write_flags & BDRV_REQ_ZERO_WRITE) { + return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags); } if (!src || !src->bs) { @@ -2951,14 +2949,15 @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, tracked_request_begin(&req, src->bs, src_offset, bytes, BDRV_TRACKED_READ); - if (!(flags & BDRV_REQ_NO_SERIALISING)) { + if (!(read_flags & BDRV_REQ_NO_SERIALISING)) { wait_serialising_requests(&req); } ret = src->bs->drv->bdrv_co_copy_range_from(src->bs, src, src_offset, dst, dst_offset, - bytes, flags); + bytes, + read_flags, write_flags); tracked_request_end(&req); bdrv_dec_in_flight(src->bs); @@ -2967,15 +2966,15 @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, tracked_request_begin(&req, dst->bs, dst_offset, bytes, BDRV_TRACKED_WRITE); - /* BDRV_REQ_NO_SERIALISING is only for read operation, - * so we ignore it in flags. - */ + /* BDRV_REQ_NO_SERIALISING is only for read operation */ + assert(!(write_flags & BDRV_REQ_NO_SERIALISING)); wait_serialising_requests(&req); ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, src, src_offset, dst, dst_offset, - bytes, flags); + bytes, + read_flags, write_flags); tracked_request_end(&req); bdrv_dec_in_flight(dst->bs); @@ -2990,10 +2989,12 @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, * semantics. */ int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + uint64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, - bytes, flags, true); + bytes, read_flags, write_flags, true); } /* Copy range from @src to @dst. @@ -3002,19 +3003,22 @@ int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, * semantics. */ int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + uint64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, - bytes, flags, false); + bytes, read_flags, write_flags, false); } int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + uint64_t bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { return bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, - bytes, flags); + bytes, read_flags, write_flags); } static void bdrv_parent_cb_resize(BlockDriverState *bs) diff --git a/block/iscsi.c b/block/iscsi.c index ead2bd5aa7..38b917a1e5 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -2193,9 +2193,11 @@ static int coroutine_fn iscsi_co_copy_range_from(BlockDriverState *bs, BdrvChild *dst, uint64_t dst_offset, uint64_t bytes, - BdrvRequestFlags flags) + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { - return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, flags); + return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, + read_flags, write_flags); } static struct scsi_task *iscsi_xcopy_task(int param_len) @@ -2332,7 +2334,8 @@ static int coroutine_fn iscsi_co_copy_range_to(BlockDriverState *bs, BdrvChild *dst, uint64_t dst_offset, uint64_t bytes, - BdrvRequestFlags flags) + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { IscsiLun *dst_lun = dst->bs->opaque; IscsiLun *src_lun; diff --git a/block/qcow2.c b/block/qcow2.c index 33b61b7480..867ce02d50 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -3253,13 +3253,14 @@ static int coroutine_fn qcow2_co_copy_range_from(BlockDriverState *bs, BdrvChild *src, uint64_t src_offset, BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + uint64_t bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { BDRVQcow2State *s = bs->opaque; int ret; unsigned int cur_bytes; /* number of bytes in current iteration */ BdrvChild *child = NULL; - BdrvRequestFlags cur_flags; + BdrvRequestFlags cur_write_flags; assert(!bs->encrypted); qemu_co_mutex_lock(&s->lock); @@ -3268,7 +3269,7 @@ qcow2_co_copy_range_from(BlockDriverState *bs, uint64_t copy_offset = 0; /* prepare next request */ cur_bytes = MIN(bytes, INT_MAX); - cur_flags = flags; + cur_write_flags = write_flags; ret = qcow2_get_cluster_offset(bs, src_offset, &cur_bytes, ©_offset); if (ret < 0) { @@ -3280,20 +3281,20 @@ qcow2_co_copy_range_from(BlockDriverState *bs, if (bs->backing && bs->backing->bs) { int64_t backing_length = bdrv_getlength(bs->backing->bs); if (src_offset >= backing_length) { - cur_flags |= BDRV_REQ_ZERO_WRITE; + cur_write_flags |= BDRV_REQ_ZERO_WRITE; } else { child = bs->backing; cur_bytes = MIN(cur_bytes, backing_length - src_offset); copy_offset = src_offset; } } else { - cur_flags |= BDRV_REQ_ZERO_WRITE; + cur_write_flags |= BDRV_REQ_ZERO_WRITE; } break; case QCOW2_CLUSTER_ZERO_PLAIN: case QCOW2_CLUSTER_ZERO_ALLOC: - cur_flags |= BDRV_REQ_ZERO_WRITE; + cur_write_flags |= BDRV_REQ_ZERO_WRITE; break; case QCOW2_CLUSTER_COMPRESSED: @@ -3317,7 +3318,7 @@ qcow2_co_copy_range_from(BlockDriverState *bs, ret = bdrv_co_copy_range_from(child, copy_offset, dst, dst_offset, - cur_bytes, cur_flags); + cur_bytes, read_flags, cur_write_flags); qemu_co_mutex_lock(&s->lock); if (ret < 0) { goto out; @@ -3338,7 +3339,8 @@ static int coroutine_fn qcow2_co_copy_range_to(BlockDriverState *bs, BdrvChild *src, uint64_t src_offset, BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + uint64_t bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { BDRVQcow2State *s = bs->opaque; int offset_in_cluster; @@ -3382,7 +3384,7 @@ qcow2_co_copy_range_to(BlockDriverState *bs, ret = bdrv_co_copy_range_to(src, src_offset, bs->file, cluster_offset + offset_in_cluster, - cur_bytes, flags); + cur_bytes, read_flags, write_flags); qemu_co_mutex_lock(&s->lock); if (ret < 0) { goto fail; diff --git a/block/raw-format.c b/block/raw-format.c index b78da564d4..a3591985f6 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -498,9 +498,13 @@ static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo) } static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs, - BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + BdrvChild *src, + uint64_t src_offset, + BdrvChild *dst, + uint64_t dst_offset, + uint64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { int ret; @@ -509,13 +513,17 @@ static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs, return ret; } return bdrv_co_copy_range_from(bs->file, src_offset, dst, dst_offset, - bytes, flags); + bytes, read_flags, write_flags); } static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, - BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + BdrvChild *src, + uint64_t src_offset, + BdrvChild *dst, + uint64_t dst_offset, + uint64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { int ret; @@ -524,7 +532,7 @@ static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, return ret; } return bdrv_co_copy_range_to(src, src_offset, bs->file, dst_offset, bytes, - flags); + read_flags, write_flags); } BlockDriver bdrv_raw = { diff --git a/include/block/block.h b/include/block/block.h index f7ddff45b6..e474f2541b 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -699,5 +699,6 @@ void bdrv_unregister_buf(BlockDriverState *bs, void *host); **/ int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags); + uint64_t bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); #endif diff --git a/include/block/block_int.h b/include/block/block_int.h index 81cd3db7a9..920d3d122b 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -218,7 +218,8 @@ struct BlockDriver { BdrvChild *dst, uint64_t dst_offset, uint64_t bytes, - BdrvRequestFlags flags); + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); /* Map [offset, offset + nbytes) range onto a child of bs to copy data to, * and invoke bdrv_co_copy_range_to(child, src, ...), or perform the copy @@ -234,7 +235,8 @@ struct BlockDriver { BdrvChild *dst, uint64_t dst_offset, uint64_t bytes, - BdrvRequestFlags flags); + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); /* * Building block for bdrv_block_status[_above] and @@ -1156,10 +1158,14 @@ void blockdev_close_all_bdrv_states(void); int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags); + uint64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags); + uint64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); int refresh_total_sectors(BlockDriverState *bs, int64_t hint); diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 8d03d493c2..830d873f24 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -234,6 +234,7 @@ void blk_unregister_buf(BlockBackend *blk, void *host); int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, BlockBackend *blk_out, int64_t off_out, - int bytes, BdrvRequestFlags flags); + int bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); #endif diff --git a/qemu-img.c b/qemu-img.c index 7651d8172c..f4074ebf75 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1783,7 +1783,7 @@ static int coroutine_fn convert_co_copy_range(ImgConvertState *s, int64_t sector ret = blk_co_copy_range(blk, offset, s->target, sector_num << BDRV_SECTOR_BITS, - n << BDRV_SECTOR_BITS, 0); + n << BDRV_SECTOR_BITS, 0, 0); if (ret < 0) { return ret; } From 09d2f948462f4979d18f573a0734d1daae8e67a9 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 9 Jul 2018 19:37:18 +0300 Subject: [PATCH 07/24] block: add BDRV_REQ_SERIALISING flag Serialized writes should be used in copy-on-write of backup(sync=none) for image fleecing scheme. We need to change an assert in bdrv_aligned_pwritev, added in 28de2dcd88de. The assert may fail now, because call to wait_serialising_requests here may become first call to it for this request with serializing flag set. It occurs if the request is aligned (otherwise, we should already set serializing flag before calling bdrv_aligned_pwritev and correspondingly waited for all intersecting requests). However, for aligned requests, we should not care about outdating of previously read data, as there no such data. Therefore, let's just update an assert to not care about aligned requests. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 28 +++++++++++++++++++++++++++- include/block/block.h | 14 +++++++++++++- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/block/io.c b/block/io.c index 75ab26fd58..6be9c40f23 100644 --- a/block/io.c +++ b/block/io.c @@ -637,6 +637,18 @@ static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); } +static bool is_request_serialising_and_aligned(BdrvTrackedRequest *req) +{ + /* + * If the request is serialising, overlap_offset and overlap_bytes are set, + * so we can check if the request is aligned. Otherwise, don't care and + * return false. + */ + + return req->serialising && (req->offset == req->overlap_offset) && + (req->bytes == req->overlap_bytes); +} + /** * Round a region to cluster boundaries */ @@ -1311,6 +1323,9 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, mark_request_serialising(req, bdrv_get_cluster_size(bs)); } + /* BDRV_REQ_SERIALISING is only for write operation */ + assert(!(flags & BDRV_REQ_SERIALISING)); + if (!(flags & BDRV_REQ_NO_SERIALISING)) { wait_serialising_requests(req); } @@ -1594,8 +1609,14 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, /* BDRV_REQ_NO_SERIALISING is only for read operation */ assert(!(flags & BDRV_REQ_NO_SERIALISING)); + + if (flags & BDRV_REQ_SERIALISING) { + mark_request_serialising(req, bdrv_get_cluster_size(bs)); + } + waited = wait_serialising_requests(req); - assert(!waited || !req->serialising); + assert(!waited || !req->serialising || + is_request_serialising_and_aligned(req)); assert(req->overlap_offset <= offset); assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); if (flags & BDRV_REQ_WRITE_UNCHANGED) { @@ -2949,6 +2970,8 @@ static int coroutine_fn bdrv_co_copy_range_internal( tracked_request_begin(&req, src->bs, src_offset, bytes, BDRV_TRACKED_READ); + /* BDRV_REQ_SERIALISING is only for write operation */ + assert(!(read_flags & BDRV_REQ_SERIALISING)); if (!(read_flags & BDRV_REQ_NO_SERIALISING)) { wait_serialising_requests(&req); } @@ -2968,6 +2991,9 @@ static int coroutine_fn bdrv_co_copy_range_internal( /* BDRV_REQ_NO_SERIALISING is only for read operation */ assert(!(write_flags & BDRV_REQ_NO_SERIALISING)); + if (write_flags & BDRV_REQ_SERIALISING) { + mark_request_serialising(&req, bdrv_get_cluster_size(dst->bs)); + } wait_serialising_requests(&req); ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, diff --git a/include/block/block.h b/include/block/block.h index e474f2541b..a91f37bedf 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -70,8 +70,20 @@ typedef enum { * content. */ BDRV_REQ_WRITE_UNCHANGED = 0x40, + /* + * BDRV_REQ_SERIALISING forces request serialisation for writes. + * It is used to ensure that writes to the backing file of a backup process + * target cannot race with a read of the backup target that defers to the + * backing file. + * + * Note, that BDRV_REQ_SERIALISING is _not_ opposite in meaning to + * BDRV_REQ_NO_SERIALISING. A more descriptive name for the latter might be + * _DO_NOT_WAIT_FOR_SERIALISING, except that is too long. + */ + BDRV_REQ_SERIALISING = 0x80, + /* Mask of valid flags */ - BDRV_REQ_MASK = 0x7f, + BDRV_REQ_MASK = 0xff, } BdrvRequestFlags; typedef struct BlockSizes { From f8d59dfb40bbc6f5aeea57c8aac1e68c1d2454ee Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 9 Jul 2018 19:37:19 +0300 Subject: [PATCH 08/24] block/backup: fix fleecing scheme: use serialized writes Fleecing scheme works as follows: we want a kind of temporary snapshot of active drive A. We create temporary image B, with B->backing = A. Then we start backup(sync=none) from A to B. From this point, B reads as point-in-time snapshot of A (A continues to be active drive, accepting guest IO). This scheme needs some additional synchronization between reads from B and backup COW operations, otherwise, the following situation is theoretically possible: (assume B is qcow2, client is NBD client, reading from B) 1. client starts reading and take qcow2 mutex in qcow2_co_preadv, and goes up to l2 table loading (assume cache miss) 2) guest write => backup COW => qcow2 write => try to take qcow2 mutex => waiting 3. l2 table loaded, we see that cluster is UNALLOCATED, go to "case QCOW2_CLUSTER_UNALLOCATED" and unlock mutex before bdrv_co_preadv(bs->backing, ...) 4) aha, mutex unlocked, backup COW continues, and we finally finish guest write and change cluster in our active disk A 5. actually, do bdrv_co_preadv(bs->backing, ...) and read _new updated_ data. To avoid this, let's make backup writes serializing, to not intersect with reads from B. Note: we expand range of handled cases from (sync=none and B->backing = A) to just (A in backing chain of B), to finally allow safe reading from B during backup for all cases when A in backing chain of B, i.e. B formally looks like point-in-time snapshot of A. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/backup.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/block/backup.c b/block/backup.c index f3e4e814b6..319fc922e8 100644 --- a/block/backup.c +++ b/block/backup.c @@ -47,6 +47,8 @@ typedef struct BackupBlockJob { HBitmap *copy_bitmap; bool use_copy_range; int64_t copy_range_size; + + bool serialize_target_writes; } BackupBlockJob; static const BlockJobDriver backup_job_driver; @@ -102,6 +104,8 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job, QEMUIOVector qiov; BlockBackend *blk = job->common.blk; int nbytes; + int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0; + int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0; hbitmap_reset(job->copy_bitmap, start / job->cluster_size, 1); nbytes = MIN(job->cluster_size, job->len - start); @@ -112,8 +116,7 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job, iov.iov_len = nbytes; qemu_iovec_init_external(&qiov, &iov, 1); - ret = blk_co_preadv(blk, start, qiov.size, &qiov, - is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0); + ret = blk_co_preadv(blk, start, qiov.size, &qiov, read_flags); if (ret < 0) { trace_backup_do_cow_read_fail(job, start, ret); if (error_is_read) { @@ -124,11 +127,11 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job, if (qemu_iovec_is_zero(&qiov)) { ret = blk_co_pwrite_zeroes(job->target, start, - qiov.size, BDRV_REQ_MAY_UNMAP); + qiov.size, write_flags | BDRV_REQ_MAY_UNMAP); } else { ret = blk_co_pwritev(job->target, start, - qiov.size, &qiov, - job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0); + qiov.size, &qiov, write_flags | + (job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0)); } if (ret < 0) { trace_backup_do_cow_write_fail(job, start, ret); @@ -156,6 +159,8 @@ static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job, int nr_clusters; BlockBackend *blk = job->common.blk; int nbytes; + int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0; + int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0; assert(QEMU_IS_ALIGNED(job->copy_range_size, job->cluster_size)); nbytes = MIN(job->copy_range_size, end - start); @@ -163,7 +168,7 @@ static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job, hbitmap_reset(job->copy_bitmap, start / job->cluster_size, nr_clusters); ret = blk_co_copy_range(blk, start, job->target, start, nbytes, - is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0, 0); + read_flags, write_flags); if (ret < 0) { trace_backup_do_cow_copy_range_fail(job, start, ret); hbitmap_set(job->copy_bitmap, start / job->cluster_size, @@ -701,6 +706,9 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, sync_bitmap : NULL; job->compress = compress; + /* Detect image-fleecing (and similar) schemes */ + job->serialize_target_writes = bdrv_chain_contains(target, bs); + /* If there is no backing file on the target, we cannot rely on COW if our * backup cluster size is smaller than the target cluster size. Even for * targets with a backing file, try to avoid COW if possible. */ From 7769eaa57859362a655b3d3a78d20df7774f16cf Mon Sep 17 00:00:00 2001 From: Ari Sundholm Date: Fri, 6 Jul 2018 17:01:36 +0300 Subject: [PATCH 09/24] qapi/block-core.json: Add missing documentation for blklogwrites log-append option This was accidentally omitted. Thanks to Eric Blake for spotting this. Signed-off-by: Ari Sundholm Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- qapi/block-core.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/qapi/block-core.json b/qapi/block-core.json index 38b31250f9..62a92fa4f4 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3057,6 +3057,8 @@ # @log-sector-size: sector size used in logging writes to @file, determines # granularity of offsets and sizes of writes (default: 512) # +# @log-append: append to an existing log (default: false) +# # @log-super-update-interval: interval of write requests after which the log # super block is updated to disk (default: 4096) # From ba814c82bbf0daf053c52dcdf0eb97b1f6ab0970 Mon Sep 17 00:00:00 2001 From: Ari Sundholm Date: Fri, 6 Jul 2018 15:00:38 +0300 Subject: [PATCH 10/24] block/blklogwrites: Make sure the log sector size is not too small The sector size needs to be large enough to accommodate the data structures for the log super block and log write entries. This was previously not properly checked, which made it possible to cause QEMU to badly misbehave. Signed-off-by: Ari Sundholm Signed-off-by: Kevin Wolf --- block/blklogwrites.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/blklogwrites.c b/block/blklogwrites.c index 63bf6b34a9..efa2c7a66a 100644 --- a/block/blklogwrites.c +++ b/block/blklogwrites.c @@ -89,7 +89,10 @@ static inline uint32_t blk_log_writes_log2(uint32_t value) static inline bool blk_log_writes_sector_size_valid(uint32_t sector_size) { - return sector_size < (1ull << 24) && is_power_of_2(sector_size); + return is_power_of_2(sector_size) && + sector_size >= sizeof(struct log_write_super) && + sector_size >= sizeof(struct log_write_entry) && + sector_size < (1ull << 24); } static uint64_t blk_log_writes_find_cur_log_sector(BdrvChild *log, From 19a49c5637a3d7c2c61ba9d1149a4f6ee419988a Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Fri, 6 Jul 2018 15:06:17 +0200 Subject: [PATCH 11/24] Revert "block: Remove dead deprecation warning code" This reverts commit 6266e900b8083945cb766b45c124fb3c42932cb3. Some deprecated -drive options were still in use by libvirt, only fixed with libvirt commit b340c6c614 ("qemu: format serial and geometry on frontend disk device"), which is not yet in any released version of libvirt. So let's hold off removing the deprecated options for one more QEMU release. Reported-by: Christian Borntraeger Signed-off-by: Cornelia Huck Signed-off-by: Kevin Wolf --- blockdev.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/blockdev.c b/blockdev.c index 72f5347df5..37eb40670b 100644 --- a/blockdev.c +++ b/blockdev.c @@ -775,6 +775,8 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type) const char *filename; Error *local_err = NULL; int i; + const char *deprecated[] = { + }; /* Change legacy command line options into QMP ones */ static const struct { @@ -851,6 +853,16 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type) goto fail; } + /* Other deprecated options */ + if (!qtest_enabled()) { + for (i = 0; i < ARRAY_SIZE(deprecated); i++) { + if (qemu_opt_get(legacy_opts, deprecated[i]) != NULL) { + error_report("'%s' is deprecated, please use the corresponding " + "option of '-device' instead", deprecated[i]); + } + } + } + /* Media type */ value = qemu_opt_get(legacy_opts, "media"); if (value) { From 44e8b4689c6e3aba4df08a1201f02ac7bf3d2fdb Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Fri, 6 Jul 2018 15:06:18 +0200 Subject: [PATCH 12/24] Revert "block: Remove deprecated -drive option serial" This reverts commit b0083267444a5e0f28391f6c2831a539f878d424. Hold off removing this for one more QEMU release (current libvirt release still uses it.) Signed-off-by: Cornelia Huck Signed-off-by: Kevin Wolf --- block/block-backend.c | 1 + blockdev.c | 10 ++++++++++ hw/block/block.c | 13 +++++++++++++ hw/block/nvme.c | 1 + hw/block/virtio-blk.c | 1 + hw/ide/qdev.c | 1 + hw/scsi/scsi-disk.c | 1 + hw/usb/dev-storage.c | 1 + include/hw/block/block.h | 1 + include/sysemu/blockdev.h | 1 + qemu-doc.texi | 5 +++++ qemu-options.hx | 6 +++++- tests/ahci-test.c | 6 +++--- tests/ide-test.c | 8 ++++---- 14 files changed, 48 insertions(+), 8 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index ac8c3e0b1c..e94e3d5929 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -419,6 +419,7 @@ static void drive_info_del(DriveInfo *dinfo) return; } qemu_opts_del(dinfo->opts); + g_free(dinfo->serial); g_free(dinfo); } diff --git a/blockdev.c b/blockdev.c index 37eb40670b..6c530769fd 100644 --- a/blockdev.c +++ b/blockdev.c @@ -730,6 +730,10 @@ QemuOptsList qemu_legacy_drive_opts = { .name = "if", .type = QEMU_OPT_STRING, .help = "interface (ide, scsi, sd, mtd, floppy, pflash, virtio)", + },{ + .name = "serial", + .type = QEMU_OPT_STRING, + .help = "disk serial number", },{ .name = "file", .type = QEMU_OPT_STRING, @@ -772,10 +776,12 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type) const char *werror, *rerror; bool read_only = false; bool copy_on_read; + const char *serial; const char *filename; Error *local_err = NULL; int i; const char *deprecated[] = { + "serial" }; /* Change legacy command line options into QMP ones */ @@ -943,6 +949,9 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type) goto fail; } + /* Serial number */ + serial = qemu_opt_get(legacy_opts, "serial"); + /* no id supplied -> create one */ if (qemu_opts_id(all_opts) == NULL) { char *new_id; @@ -1017,6 +1026,7 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type) dinfo->type = type; dinfo->bus = bus_id; dinfo->unit = unit_id; + dinfo->serial = g_strdup(serial); blk_set_legacy_dinfo(blk, dinfo); diff --git a/hw/block/block.c b/hw/block/block.c index cf0eb826f1..b6c80ab0b7 100644 --- a/hw/block/block.c +++ b/hw/block/block.c @@ -15,6 +15,19 @@ #include "qapi/qapi-types-block.h" #include "qemu/error-report.h" +void blkconf_serial(BlockConf *conf, char **serial) +{ + DriveInfo *dinfo; + + if (!*serial) { + /* try to fall back to value set with legacy -drive serial=... */ + dinfo = blk_legacy_dinfo(conf->blk); + if (dinfo) { + *serial = g_strdup(dinfo->serial); + } + } +} + void blkconf_blocksizes(BlockConf *conf) { BlockBackend *blk = conf->blk; diff --git a/hw/block/nvme.c b/hw/block/nvme.c index fc7dacb816..5e508ab1b3 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1217,6 +1217,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) return; } + blkconf_serial(&n->conf, &n->serial); if (!n->serial) { error_setg(errp, "serial property not set"); return; diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 225fe44b7a..50b5c869e3 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -935,6 +935,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) return; } + blkconf_serial(&conf->conf, &conf->serial); if (!blkconf_apply_backend_options(&conf->conf, blk_is_read_only(conf->conf.blk), true, errp)) { diff --git a/hw/ide/qdev.c b/hw/ide/qdev.c index 573b022e1e..f395d24592 100644 --- a/hw/ide/qdev.c +++ b/hw/ide/qdev.c @@ -188,6 +188,7 @@ static void ide_dev_initfn(IDEDevice *dev, IDEDriveKind kind, Error **errp) return; } + blkconf_serial(&dev->conf, &dev->serial); if (kind != IDE_CD) { if (!blkconf_geometry(&dev->conf, &dev->chs_trans, 65535, 16, 255, errp)) { diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index 32f3f96ff8..d7df357029 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -2378,6 +2378,7 @@ static void scsi_realize(SCSIDevice *dev, Error **errp) return; } + blkconf_serial(&s->qdev.conf, &s->serial); blkconf_blocksizes(&s->qdev.conf); if (s->qdev.conf.logical_block_size > diff --git a/hw/usb/dev-storage.c b/hw/usb/dev-storage.c index cd5551d94f..45a9487cdb 100644 --- a/hw/usb/dev-storage.c +++ b/hw/usb/dev-storage.c @@ -599,6 +599,7 @@ static void usb_msd_storage_realize(USBDevice *dev, Error **errp) return; } + blkconf_serial(&s->conf, &dev->serial); blkconf_blocksizes(&s->conf); if (!blkconf_apply_backend_options(&s->conf, blk_is_read_only(blk), true, errp)) { diff --git a/include/hw/block/block.h b/include/hw/block/block.h index e9f9e2223f..d4f4dfffab 100644 --- a/include/hw/block/block.h +++ b/include/hw/block/block.h @@ -72,6 +72,7 @@ static inline unsigned int get_physical_block_exp(BlockConf *conf) /* Configuration helpers */ +void blkconf_serial(BlockConf *conf, char **serial); bool blkconf_geometry(BlockConf *conf, int *trans, unsigned cyls_max, unsigned heads_max, unsigned secs_max, Error **errp); diff --git a/include/sysemu/blockdev.h b/include/sysemu/blockdev.h index 24954b94e0..c0ae3700ec 100644 --- a/include/sysemu/blockdev.h +++ b/include/sysemu/blockdev.h @@ -35,6 +35,7 @@ struct DriveInfo { bool is_default; /* Added by default_drive() ? */ int media_cd; QemuOpts *opts; + char *serial; QTAILQ_ENTRY(DriveInfo) next; }; diff --git a/qemu-doc.texi b/qemu-doc.texi index d3924e928e..d343affd6d 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -2887,6 +2887,11 @@ with ``-device ...,netdev=x''), or ``-nic user,smb=/some/dir'' (for embedded NICs). The new syntax allows different settings to be provided per NIC. +@subsection -drive serial=... (since 2.10.0) + +The drive serial argument is replaced by the the serial argument +that can be specified with the ``-device'' parameter. + @subsection -usbdevice (since 2.10.0) The ``-usbdevice DEV'' argument is now a synonym for setting diff --git a/qemu-options.hx b/qemu-options.hx index 16208f63f2..381648b9cb 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -805,7 +805,7 @@ ETEXI DEF("drive", HAS_ARG, QEMU_OPTION_drive, "-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n" " [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n" - " [,snapshot=on|off][,rerror=ignore|stop|report]\n" + " [,snapshot=on|off][,serial=s][,rerror=ignore|stop|report]\n" " [,werror=ignore|stop|report|enospc][,id=name][,aio=threads|native]\n" " [,readonly=on|off][,copy-on-read=on|off]\n" " [,discard=ignore|unmap][,detect-zeroes=on|off|unmap]\n" @@ -879,6 +879,10 @@ The default mode is @option{cache=writeback}. Specify which disk @var{format} will be used rather than detecting the format. Can be used to specify format=raw to avoid interpreting an untrusted format header. +@item serial=@var{serial} +This option specifies the serial number to assign to the device. This +parameter is deprecated, use the corresponding parameter of @code{-device} +instead. @item werror=@var{action},rerror=@var{action} Specify which @var{action} to take on write and read errors. Valid actions are: "ignore" (ignore the error and try to continue), "stop" (pause QEMU), diff --git a/tests/ahci-test.c b/tests/ahci-test.c index 937ed2f910..1a7b761304 100644 --- a/tests/ahci-test.c +++ b/tests/ahci-test.c @@ -180,12 +180,12 @@ static AHCIQState *ahci_boot(const char *cli, ...) s = ahci_vboot(cli, ap); va_end(ap); } else { - cli = "-drive if=none,id=drive0,file=%s,cache=writeback,format=%s" + cli = "-drive if=none,id=drive0,file=%s,cache=writeback,serial=%s" + ",format=%s" " -M q35 " "-device ide-hd,drive=drive0 " - "-global ide-hd.serial=%s " "-global ide-hd.ver=%s"; - s = ahci_boot(cli, tmp_path, imgfmt, "testdisk", "version"); + s = ahci_boot(cli, tmp_path, "testdisk", imgfmt, "version"); } return s; diff --git a/tests/ide-test.c b/tests/ide-test.c index f39431b1a9..2384c2c3e2 100644 --- a/tests/ide-test.c +++ b/tests/ide-test.c @@ -529,8 +529,8 @@ static void test_bmdma_no_busmaster(void) static void test_bmdma_setup(void) { ide_test_start( - "-drive file=%s,if=ide,cache=writeback,format=raw " - "-global ide-hd.serial=%s -global ide-hd.ver=%s", + "-drive file=%s,if=ide,serial=%s,cache=writeback,format=raw " + "-global ide-hd.ver=%s", tmp_path, "testdisk", "version"); qtest_irq_intercept_in(global_qtest, "ioapic"); } @@ -561,8 +561,8 @@ static void test_identify(void) int ret; ide_test_start( - "-drive file=%s,if=ide,cache=writeback,format=raw " - "-global ide-hd.serial=%s -global ide-hd.ver=%s", + "-drive file=%s,if=ide,serial=%s,cache=writeback,format=raw " + "-global ide-hd.ver=%s", tmp_path, "testdisk", "version"); dev = get_pci_device(&bmdma_bar, &ide_bar); From 75f4cd297922e1ac352625badf548d4a1bb96089 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Fri, 6 Jul 2018 15:06:19 +0200 Subject: [PATCH 13/24] Revert "block: Remove deprecated -drive option addr" This reverts commit eae3bd1eb7c6b105d30ec06008b3bc3dfc5f45bb. Reverted to avoid conflicts for geometry options revert. Signed-off-by: Cornelia Huck Signed-off-by: Kevin Wolf --- blockdev.c | 17 ++++++++++++++++- device-hotplug.c | 4 ++++ include/sysemu/blockdev.h | 1 + qemu-doc.texi | 5 +++++ qemu-options.hx | 5 ++++- 5 files changed, 30 insertions(+), 2 deletions(-) diff --git a/blockdev.c b/blockdev.c index 6c530769fd..c23587b075 100644 --- a/blockdev.c +++ b/blockdev.c @@ -730,6 +730,10 @@ QemuOptsList qemu_legacy_drive_opts = { .name = "if", .type = QEMU_OPT_STRING, .help = "interface (ide, scsi, sd, mtd, floppy, pflash, virtio)", + },{ + .name = "addr", + .type = QEMU_OPT_STRING, + .help = "pci address (virtio only)", },{ .name = "serial", .type = QEMU_OPT_STRING, @@ -773,6 +777,7 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type) DriveMediaType media = MEDIA_DISK; BlockInterfaceType type; int max_devs, bus_id, unit_id, index; + const char *devaddr; const char *werror, *rerror; bool read_only = false; bool copy_on_read; @@ -781,7 +786,7 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type) Error *local_err = NULL; int i; const char *deprecated[] = { - "serial" + "serial", "addr" }; /* Change legacy command line options into QMP ones */ @@ -971,6 +976,12 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type) } /* Add virtio block device */ + devaddr = qemu_opt_get(legacy_opts, "addr"); + if (devaddr && type != IF_VIRTIO) { + error_report("addr is not supported by this bus type"); + goto fail; + } + if (type == IF_VIRTIO) { QemuOpts *devopts; devopts = qemu_opts_create(qemu_find_opts("device"), NULL, 0, @@ -982,6 +993,9 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type) } qemu_opt_set(devopts, "drive", qdict_get_str(bs_opts, "id"), &error_abort); + if (devaddr) { + qemu_opt_set(devopts, "addr", devaddr, &error_abort); + } } filename = qemu_opt_get(legacy_opts, "file"); @@ -1026,6 +1040,7 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type) dinfo->type = type; dinfo->bus = bus_id; dinfo->unit = unit_id; + dinfo->devaddr = devaddr; dinfo->serial = g_strdup(serial); blk_set_legacy_dinfo(blk, dinfo); diff --git a/device-hotplug.c b/device-hotplug.c index cd427e2c76..23fd6656f1 100644 --- a/device-hotplug.c +++ b/device-hotplug.c @@ -69,6 +69,10 @@ void hmp_drive_add(Monitor *mon, const QDict *qdict) if (!dinfo) { goto err; } + if (dinfo->devaddr) { + monitor_printf(mon, "Parameter addr not supported\n"); + goto err; + } switch (dinfo->type) { case IF_NONE: diff --git a/include/sysemu/blockdev.h b/include/sysemu/blockdev.h index c0ae3700ec..37ea39719e 100644 --- a/include/sysemu/blockdev.h +++ b/include/sysemu/blockdev.h @@ -28,6 +28,7 @@ typedef enum { } BlockInterfaceType; struct DriveInfo { + const char *devaddr; BlockInterfaceType type; int bus; int unit; diff --git a/qemu-doc.texi b/qemu-doc.texi index d343affd6d..ae5531a053 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -2892,6 +2892,11 @@ provided per NIC. The drive serial argument is replaced by the the serial argument that can be specified with the ``-device'' parameter. +@subsection -drive addr=... (since 2.10.0) + +The drive addr argument is replaced by the the addr argument +that can be specified with the ``-device'' parameter. + @subsection -usbdevice (since 2.10.0) The ``-usbdevice DEV'' argument is now a synonym for setting diff --git a/qemu-options.hx b/qemu-options.hx index 381648b9cb..df248d1568 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -805,7 +805,7 @@ ETEXI DEF("drive", HAS_ARG, QEMU_OPTION_drive, "-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n" " [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n" - " [,snapshot=on|off][,serial=s][,rerror=ignore|stop|report]\n" + " [,snapshot=on|off][,serial=s][,addr=A][,rerror=ignore|stop|report]\n" " [,werror=ignore|stop|report|enospc][,id=name][,aio=threads|native]\n" " [,readonly=on|off][,copy-on-read=on|off]\n" " [,discard=ignore|unmap][,detect-zeroes=on|off|unmap]\n" @@ -883,6 +883,9 @@ an untrusted format header. This option specifies the serial number to assign to the device. This parameter is deprecated, use the corresponding parameter of @code{-device} instead. +@item addr=@var{addr} +Specify the controller's PCI address (if=virtio only). This parameter is +deprecated, use the corresponding parameter of @code{-device} instead. @item werror=@var{action},rerror=@var{action} Specify which @var{action} to take on write and read errors. Valid actions are: "ignore" (ignore the error and try to continue), "stop" (pause QEMU), From 6703db131f832d6af58fa629be11c5efa5a6adb8 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Fri, 6 Jul 2018 15:06:20 +0200 Subject: [PATCH 14/24] Revert "block: Remove deprecated -drive geometry options" This reverts commit a7aff6dd10b16b67e8b142d0c94c5d92c3fe88f6. Hold off removing this for one more QEMU release (current libvirt release still uses it.) Signed-off-by: Cornelia Huck Signed-off-by: Kevin Wolf --- blockdev.c | 75 ++++++++++++++++++++++++++++++++++++++- hmp-commands.hx | 1 + hw/block/block.c | 14 ++++++++ include/sysemu/blockdev.h | 1 + qemu-doc.texi | 5 +++ qemu-options.hx | 7 +++- tests/hd-geo-test.c | 37 +++++++++++++++---- 7 files changed, 131 insertions(+), 9 deletions(-) diff --git a/blockdev.c b/blockdev.c index c23587b075..dcf8c8d2ab 100644 --- a/blockdev.c +++ b/blockdev.c @@ -730,6 +730,22 @@ QemuOptsList qemu_legacy_drive_opts = { .name = "if", .type = QEMU_OPT_STRING, .help = "interface (ide, scsi, sd, mtd, floppy, pflash, virtio)", + },{ + .name = "cyls", + .type = QEMU_OPT_NUMBER, + .help = "number of cylinders (ide disk geometry)", + },{ + .name = "heads", + .type = QEMU_OPT_NUMBER, + .help = "number of heads (ide disk geometry)", + },{ + .name = "secs", + .type = QEMU_OPT_NUMBER, + .help = "number of sectors (ide disk geometry)", + },{ + .name = "trans", + .type = QEMU_OPT_STRING, + .help = "chs translation (auto, lba, none)", },{ .name = "addr", .type = QEMU_OPT_STRING, @@ -776,6 +792,7 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type) QemuOpts *legacy_opts; DriveMediaType media = MEDIA_DISK; BlockInterfaceType type; + int cyls, heads, secs, translation; int max_devs, bus_id, unit_id, index; const char *devaddr; const char *werror, *rerror; @@ -786,7 +803,7 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type) Error *local_err = NULL; int i; const char *deprecated[] = { - "serial", "addr" + "serial", "trans", "secs", "heads", "cyls", "addr" }; /* Change legacy command line options into QMP ones */ @@ -915,6 +932,57 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type) type = block_default_type; } + /* Geometry */ + cyls = qemu_opt_get_number(legacy_opts, "cyls", 0); + heads = qemu_opt_get_number(legacy_opts, "heads", 0); + secs = qemu_opt_get_number(legacy_opts, "secs", 0); + + if (cyls || heads || secs) { + if (cyls < 1) { + error_report("invalid physical cyls number"); + goto fail; + } + if (heads < 1) { + error_report("invalid physical heads number"); + goto fail; + } + if (secs < 1) { + error_report("invalid physical secs number"); + goto fail; + } + } + + translation = BIOS_ATA_TRANSLATION_AUTO; + value = qemu_opt_get(legacy_opts, "trans"); + if (value != NULL) { + if (!cyls) { + error_report("'%s' trans must be used with cyls, heads and secs", + value); + goto fail; + } + if (!strcmp(value, "none")) { + translation = BIOS_ATA_TRANSLATION_NONE; + } else if (!strcmp(value, "lba")) { + translation = BIOS_ATA_TRANSLATION_LBA; + } else if (!strcmp(value, "large")) { + translation = BIOS_ATA_TRANSLATION_LARGE; + } else if (!strcmp(value, "rechs")) { + translation = BIOS_ATA_TRANSLATION_RECHS; + } else if (!strcmp(value, "auto")) { + translation = BIOS_ATA_TRANSLATION_AUTO; + } else { + error_report("'%s' invalid translation type", value); + goto fail; + } + } + + if (media == MEDIA_CDROM) { + if (cyls || secs || heads) { + error_report("CHS can't be set with media=cdrom"); + goto fail; + } + } + /* Device address specified by bus/unit or index. * If none was specified, try to find the first free one. */ bus_id = qemu_opt_get_number(legacy_opts, "bus", 0); @@ -1037,6 +1105,11 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type) dinfo = g_malloc0(sizeof(*dinfo)); dinfo->opts = all_opts; + dinfo->cyls = cyls; + dinfo->heads = heads; + dinfo->secs = secs; + dinfo->trans = translation; + dinfo->type = type; dinfo->bus = bus_id; dinfo->unit = unit_id; diff --git a/hmp-commands.hx b/hmp-commands.hx index c1fc747403..91dfe51c37 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1306,6 +1306,7 @@ ETEXI .params = "[-n] [[:]:]\n" "[file=file][,if=type][,bus=n]\n" "[,unit=m][,media=d][,index=i]\n" + "[,cyls=c,heads=h,secs=s[,trans=t]]\n" "[,snapshot=on|off][,cache=on|off]\n" "[,readonly=on|off][,copy-on-read=on|off]", .help = "add drive to PCI storage controller", diff --git a/hw/block/block.c b/hw/block/block.c index b6c80ab0b7..b91e2b6d7e 100644 --- a/hw/block/block.c +++ b/hw/block/block.c @@ -108,6 +108,20 @@ bool blkconf_geometry(BlockConf *conf, int *ptrans, unsigned cyls_max, unsigned heads_max, unsigned secs_max, Error **errp) { + DriveInfo *dinfo; + + if (!conf->cyls && !conf->heads && !conf->secs) { + /* try to fall back to value set with legacy -drive cyls=... */ + dinfo = blk_legacy_dinfo(conf->blk); + if (dinfo) { + conf->cyls = dinfo->cyls; + conf->heads = dinfo->heads; + conf->secs = dinfo->secs; + if (ptrans) { + *ptrans = dinfo->trans; + } + } + } if (!conf->cyls && !conf->heads && !conf->secs) { hd_geometry_guess(conf->blk, &conf->cyls, &conf->heads, &conf->secs, diff --git a/include/sysemu/blockdev.h b/include/sysemu/blockdev.h index 37ea39719e..ac22f2ae1f 100644 --- a/include/sysemu/blockdev.h +++ b/include/sysemu/blockdev.h @@ -35,6 +35,7 @@ struct DriveInfo { int auto_del; /* see blockdev_mark_auto_del() */ bool is_default; /* Added by default_drive() ? */ int media_cd; + int cyls, heads, secs, trans; QemuOpts *opts; char *serial; QTAILQ_ENTRY(DriveInfo) next; diff --git a/qemu-doc.texi b/qemu-doc.texi index ae5531a053..0fbf8b108b 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -2887,6 +2887,11 @@ with ``-device ...,netdev=x''), or ``-nic user,smb=/some/dir'' (for embedded NICs). The new syntax allows different settings to be provided per NIC. +@subsection -drive cyls=...,heads=...,secs=...,trans=... (since 2.10.0) + +The drive geometry arguments are replaced by the the geometry arguments +that can be specified with the ``-device'' parameter. + @subsection -drive serial=... (since 2.10.0) The drive serial argument is replaced by the the serial argument diff --git a/qemu-options.hx b/qemu-options.hx index df248d1568..654e69cc3b 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -804,8 +804,9 @@ ETEXI DEF("drive", HAS_ARG, QEMU_OPTION_drive, "-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n" + " [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n" " [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n" - " [,snapshot=on|off][,serial=s][,addr=A][,rerror=ignore|stop|report]\n" + " [,serial=s][,addr=A][,rerror=ignore|stop|report]\n" " [,werror=ignore|stop|report|enospc][,id=name][,aio=threads|native]\n" " [,readonly=on|off][,copy-on-read=on|off]\n" " [,discard=ignore|unmap][,detect-zeroes=on|off|unmap]\n" @@ -846,6 +847,10 @@ This option defines where is connected the drive by using an index in the list of available connectors of a given interface type. @item media=@var{media} This option defines the type of the media: disk or cdrom. +@item cyls=@var{c},heads=@var{h},secs=@var{s}[,trans=@var{t}] +Force disk physical geometry and the optional BIOS translation (trans=none or +lba). These parameters are deprecated, use the corresponding parameters +of @code{-device} instead. @item snapshot=@var{snapshot} @var{snapshot} is "on" or "off" and controls snapshot mode for the given drive (see @option{-snapshot}). diff --git a/tests/hd-geo-test.c b/tests/hd-geo-test.c index ce665f1f83..24870b38f4 100644 --- a/tests/hd-geo-test.c +++ b/tests/hd-geo-test.c @@ -201,7 +201,7 @@ static void setup_mbr(int img_idx, MBRcontents mbr) static int setup_ide(int argc, char *argv[], int argv_sz, int ide_idx, const char *dev, int img_idx, - MBRcontents mbr) + MBRcontents mbr, const char *opts) { char *s1, *s2, *s3; @@ -216,7 +216,7 @@ static int setup_ide(int argc, char *argv[], int argv_sz, s3 = g_strdup(",media=cdrom"); } argc = append_arg(argc, argv, argv_sz, - g_strdup_printf("%s%s%s", s1, s2, s3)); + g_strdup_printf("%s%s%s%s", s1, s2, s3, opts)); g_free(s1); g_free(s2); g_free(s3); @@ -260,7 +260,7 @@ static void test_ide_mbr(bool use_device, MBRcontents mbr) for (i = 0; i < backend_last; i++) { cur_ide[i] = &hd_chst[i][mbr]; dev = use_device ? (is_hd(cur_ide[i]) ? "ide-hd" : "ide-cd") : NULL; - argc = setup_ide(argc, argv, ARGV_SIZE, i, dev, i, mbr); + argc = setup_ide(argc, argv, ARGV_SIZE, i, dev, i, mbr, ""); } args = g_strjoinv(" ", argv); qtest_start(args); @@ -327,12 +327,16 @@ static void test_ide_drive_user(const char *dev, bool trans) const CHST expected_chst = { secs / (4 * 32) , 4, 32, trans }; argc = setup_common(argv, ARGV_SIZE); - opts = g_strdup_printf("%s,%scyls=%d,heads=%d,secs=%d", - dev, trans ? "bios-chs-trans=lba," : "", + opts = g_strdup_printf("%s,%s%scyls=%d,heads=%d,secs=%d", + dev ?: "", + trans && dev ? "bios-chs-" : "", + trans ? "trans=lba," : "", expected_chst.cyls, expected_chst.heads, expected_chst.secs); cur_ide[0] = &expected_chst; - argc = setup_ide(argc, argv, ARGV_SIZE, 0, opts, backend_small, mbr_chs); + argc = setup_ide(argc, argv, ARGV_SIZE, + 0, dev ? opts : NULL, backend_small, mbr_chs, + dev ? "" : opts); g_free(opts); args = g_strjoinv(" ", argv); qtest_start(args); @@ -342,6 +346,22 @@ static void test_ide_drive_user(const char *dev, bool trans) qtest_end(); } +/* + * Test case: IDE device (if=ide) with explicit CHS + */ +static void test_ide_drive_user_chs(void) +{ + test_ide_drive_user(NULL, false); +} + +/* + * Test case: IDE device (if=ide) with explicit CHS and translation + */ +static void test_ide_drive_user_chst(void) +{ + test_ide_drive_user(NULL, true); +} + /* * Test case: IDE device (if=none) with explicit CHS */ @@ -372,7 +392,8 @@ static void test_ide_drive_cd_0(void) for (i = 0; i <= backend_empty; i++) { ide_idx = backend_empty - i; cur_ide[ide_idx] = &hd_chst[i][mbr_blank]; - argc = setup_ide(argc, argv, ARGV_SIZE, ide_idx, NULL, i, mbr_blank); + argc = setup_ide(argc, argv, ARGV_SIZE, + ide_idx, NULL, i, mbr_blank, ""); } args = g_strjoinv(" ", argv); qtest_start(args); @@ -401,6 +422,8 @@ int main(int argc, char **argv) qtest_add_func("hd-geo/ide/drive/mbr/blank", test_ide_drive_mbr_blank); qtest_add_func("hd-geo/ide/drive/mbr/lba", test_ide_drive_mbr_lba); qtest_add_func("hd-geo/ide/drive/mbr/chs", test_ide_drive_mbr_chs); + qtest_add_func("hd-geo/ide/drive/user/chs", test_ide_drive_user_chs); + qtest_add_func("hd-geo/ide/drive/user/chst", test_ide_drive_user_chst); qtest_add_func("hd-geo/ide/drive/cd_0", test_ide_drive_cd_0); qtest_add_func("hd-geo/ide/device/mbr/blank", test_ide_device_mbr_blank); qtest_add_func("hd-geo/ide/device/mbr/lba", test_ide_device_mbr_lba); From f8a30874ca4dd6560b5827433f07877e60960946 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:15 +0800 Subject: [PATCH 15/24] block: Prefix file driver trace points with "file_" With in one module, trace points usually have a common prefix named after the module name. paio_submit and paio_submit_co are the only two trace points so far in the two file protocol drivers. As we are adding more, having a common prefix here is better so that trace points can be enabled with a glob. Rename them. Suggested-by: Kevin Wolf Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/file-posix.c | 2 +- block/file-win32.c | 2 +- block/trace-events | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 4fec8cb53c..1185c7c5cc 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1743,7 +1743,7 @@ static int paio_submit_co_full(BlockDriverState *bs, int fd, assert(qiov->size == bytes); } - trace_paio_submit_co(offset, bytes, type); + trace_file_paio_submit_co(offset, bytes, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_co(pool, aio_worker, acb); } diff --git a/block/file-win32.c b/block/file-win32.c index 0411fe80fd..f1e2187f3b 100644 --- a/block/file-win32.c +++ b/block/file-win32.c @@ -162,7 +162,7 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, acb->aio_nbytes = count; acb->aio_offset = offset; - trace_paio_submit(acb, opaque, offset, count, type); + trace_file_paio_submit(acb, opaque, offset, count, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } diff --git a/block/trace-events b/block/trace-events index c35287b48a..854d5525af 100644 --- a/block/trace-events +++ b/block/trace-events @@ -55,8 +55,8 @@ qmp_block_stream(void *bs, void *job) "bs %p job %p" # block/file-win32.c # block/file-posix.c -paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d" -paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" +file_paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d" +file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" # block/qcow2.c qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d" From ecc983a507bec9d3130434702d7031bfd372ba74 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:16 +0800 Subject: [PATCH 16/24] block: Add copy offloading trace points A few trace points that can help reveal what is happening in a copy offloading I/O path. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/file-posix.c | 2 ++ block/io.c | 4 ++++ block/iscsi.c | 3 +++ block/trace-events | 6 ++++++ 4 files changed, 15 insertions(+) diff --git a/block/file-posix.c b/block/file-posix.c index 1185c7c5cc..09f6b938f6 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1488,6 +1488,8 @@ static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb) ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off, aiocb->aio_fd2, &out_off, bytes, 0); + trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off, + aiocb->aio_fd2, out_off, bytes, 0, ret); if (ret == 0) { /* No progress (e.g. when beyond EOF), let the caller fall back to * buffer I/O. */ diff --git a/block/io.c b/block/io.c index 6be9c40f23..9b2aa04010 100644 --- a/block/io.c +++ b/block/io.c @@ -3019,6 +3019,8 @@ int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { + trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes, + read_flags, write_flags); return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, bytes, read_flags, write_flags, true); } @@ -3033,6 +3035,8 @@ int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { + trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, + read_flags, write_flags); return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, bytes, read_flags, write_flags, false); } diff --git a/block/iscsi.c b/block/iscsi.c index 38b917a1e5..bb69faf34a 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -44,6 +44,7 @@ #include "qapi/qmp/qstring.h" #include "crypto/secret.h" #include "scsi/utils.h" +#include "trace.h" /* Conflict between scsi/utils.h and libiscsi! :( */ #define SCSI_XFER_NONE ISCSI_XFER_NONE @@ -2399,6 +2400,8 @@ retry: } out_unlock: + + trace_iscsi_xcopy(src_lun, src_offset, dst_lun, dst_offset, bytes, r); g_free(iscsi_task.task); qemu_mutex_unlock(&dst_lun->mutex); g_free(iscsi_task.err_str); diff --git a/block/trace-events b/block/trace-events index 854d5525af..3e8c47bb24 100644 --- a/block/trace-events +++ b/block/trace-events @@ -15,6 +15,8 @@ bdrv_co_preadv(void *bs, int64_t offset, int64_t nbytes, unsigned int flags) "bs bdrv_co_pwritev(void *bs, int64_t offset, int64_t nbytes, unsigned int flags) "bs %p offset %"PRId64" nbytes %"PRId64" flags 0x%x" bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags 0x%x" bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t cluster_offset, int64_t cluster_bytes) "bs %p offset %"PRId64" bytes %u cluster_offset %"PRId64" cluster_bytes %"PRId64 +bdrv_co_copy_range_from(void *src, uint64_t src_offset, void *dst, uint64_t dst_offset, uint64_t bytes, int read_flags, int write_flags) "src %p offset %"PRIu64" dst %p offset %"PRIu64" bytes %"PRIu64" rw flags 0x%x 0x%x" +bdrv_co_copy_range_to(void *src, uint64_t src_offset, void *dst, uint64_t dst_offset, uint64_t bytes, int read_flags, int write_flags) "src %p offset %"PRIu64" dst %p offset %"PRIu64" bytes %"PRIu64" rw flags 0x%x 0x%x" # block/stream.c stream_one_iteration(void *s, int64_t offset, uint64_t bytes, int is_allocated) "s %p offset %" PRId64 " bytes %" PRIu64 " is_allocated %d" @@ -57,6 +59,7 @@ qmp_block_stream(void *bs, void *job) "bs %p job %p" # block/file-posix.c file_paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d" file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" +file_copy_file_range(void *bs, int src, int64_t src_off, int dst, int64_t dst_off, int64_t bytes, int flags, int64_t ret) "bs %p src_fd %d offset %"PRIu64" dst_fd %d offset %"PRIu64" bytes %"PRIu64" flags %d ret %"PRId64 # block/qcow2.c qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d" @@ -150,3 +153,6 @@ nvme_free_req_queue_wait(void *q) "q %p" nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d" nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64 nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d" + +# block/iscsi.c +iscsi_xcopy(void *src_lun, uint64_t src_off, void *dst_lun, uint64_t dst_off, uint64_t bytes, int ret) "src_lun %p offset %"PRIu64" dst_lun %p offset %"PRIu64" bytes %"PRIu64" ret %d" From 0b9fd3f467dc5ac041fa014cd28c949b25b87d25 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:17 +0800 Subject: [PATCH 17/24] block: Use BdrvChild to discard Other I/O functions are already using a BdrvChild pointer in the API, so make discard do the same. It makes it possible to initiate the same permission checks before doing I/O, and much easier to share the helper functions for this, which will be added and used by write, truncate and copy range paths. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/blkdebug.c | 2 +- block/blklogwrites.c | 2 +- block/blkreplay.c | 2 +- block/block-backend.c | 2 +- block/copy-on-read.c | 2 +- block/io.c | 18 +++++++++--------- block/mirror.c | 2 +- block/qcow2-refcount.c | 2 +- block/raw-format.c | 2 +- block/throttle.c | 2 +- include/block/block.h | 4 ++-- 11 files changed, 20 insertions(+), 20 deletions(-) diff --git a/block/blkdebug.c b/block/blkdebug.c index 526af2a808..0457bf5b66 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -625,7 +625,7 @@ static int coroutine_fn blkdebug_co_pdiscard(BlockDriverState *bs, return err; } - return bdrv_co_pdiscard(bs->file->bs, offset, bytes); + return bdrv_co_pdiscard(bs->file, offset, bytes); } static int coroutine_fn blkdebug_co_block_status(BlockDriverState *bs, diff --git a/block/blklogwrites.c b/block/blklogwrites.c index efa2c7a66a..ff98cd5533 100644 --- a/block/blklogwrites.c +++ b/block/blklogwrites.c @@ -486,7 +486,7 @@ static int coroutine_fn blk_log_writes_co_do_file_flush(BlkLogWritesFileReq *fr) static int coroutine_fn blk_log_writes_co_do_file_pdiscard(BlkLogWritesFileReq *fr) { - return bdrv_co_pdiscard(fr->bs->file->bs, fr->offset, fr->bytes); + return bdrv_co_pdiscard(fr->bs->file, fr->offset, fr->bytes); } static int coroutine_fn diff --git a/block/blkreplay.c b/block/blkreplay.c index b016dbeee7..766150ade6 100755 --- a/block/blkreplay.c +++ b/block/blkreplay.c @@ -113,7 +113,7 @@ static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) { uint64_t reqid = blkreplay_next_id(); - int ret = bdrv_co_pdiscard(bs->file->bs, offset, bytes); + int ret = bdrv_co_pdiscard(bs->file, offset, bytes); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); diff --git a/block/block-backend.c b/block/block-backend.c index e94e3d5929..f2f75a977d 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1560,7 +1560,7 @@ int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes) return ret; } - return bdrv_co_pdiscard(blk_bs(blk), offset, bytes); + return bdrv_co_pdiscard(blk->root, offset, bytes); } int blk_co_flush(BlockBackend *blk) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 1dcdaeed69..a19164f9eb 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -116,7 +116,7 @@ static int coroutine_fn cor_co_pwrite_zeroes(BlockDriverState *bs, static int coroutine_fn cor_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) { - return bdrv_co_pdiscard(bs->file->bs, offset, bytes); + return bdrv_co_pdiscard(bs->file, offset, bytes); } diff --git a/block/io.c b/block/io.c index 9b2aa04010..fbcd93304b 100644 --- a/block/io.c +++ b/block/io.c @@ -2633,7 +2633,7 @@ int bdrv_flush(BlockDriverState *bs) } typedef struct DiscardCo { - BlockDriverState *bs; + BdrvChild *child; int64_t offset; int bytes; int ret; @@ -2642,17 +2642,17 @@ static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque) { DiscardCo *rwco = opaque; - rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes); + rwco->ret = bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes); } -int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, - int bytes) +int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes) { BdrvTrackedRequest req; int max_pdiscard, ret; int head, tail, align; + BlockDriverState *bs = child->bs; - if (!bs->drv) { + if (!bs || !bs->drv) { return -ENOMEDIUM; } @@ -2763,11 +2763,11 @@ out: return ret; } -int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) +int bdrv_pdiscard(BdrvChild *child, int64_t offset, int bytes) { Coroutine *co; DiscardCo rwco = { - .bs = bs, + .child = child, .offset = offset, .bytes = bytes, .ret = NOT_DONE, @@ -2778,8 +2778,8 @@ int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) bdrv_pdiscard_co_entry(&rwco); } else { co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco); - bdrv_coroutine_enter(bs, co); - BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE); + bdrv_coroutine_enter(child->bs, co); + BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE); } return rwco.ret; diff --git a/block/mirror.c b/block/mirror.c index 61bd9f3cf1..b48c3f8cf5 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1333,7 +1333,7 @@ static int coroutine_fn bdrv_mirror_top_do_write(BlockDriverState *bs, break; case MIRROR_METHOD_DISCARD: - ret = bdrv_co_pdiscard(bs->backing->bs, offset, bytes); + ret = bdrv_co_pdiscard(bs->backing, offset, bytes); break; default: diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 18c729aa27..4e1589ad7a 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -734,7 +734,7 @@ void qcow2_process_discards(BlockDriverState *bs, int ret) /* Discard is optional, ignore the return value */ if (ret >= 0) { - bdrv_pdiscard(bs->file->bs, d->offset, d->bytes); + bdrv_pdiscard(bs->file, d->offset, d->bytes); } g_free(d); diff --git a/block/raw-format.c b/block/raw-format.c index a3591985f6..dee262875a 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -297,7 +297,7 @@ static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs, if (ret) { return ret; } - return bdrv_co_pdiscard(bs->file->bs, offset, bytes); + return bdrv_co_pdiscard(bs->file, offset, bytes); } static int64_t raw_getlength(BlockDriverState *bs) diff --git a/block/throttle.c b/block/throttle.c index f617f23a12..636c9764aa 100644 --- a/block/throttle.c +++ b/block/throttle.c @@ -149,7 +149,7 @@ static int coroutine_fn throttle_co_pdiscard(BlockDriverState *bs, ThrottleGroupMember *tgm = bs->opaque; throttle_group_co_io_limits_intercept(tgm, bytes, true); - return bdrv_co_pdiscard(bs->file->bs, offset, bytes); + return bdrv_co_pdiscard(bs->file, offset, bytes); } static int throttle_co_flush(BlockDriverState *bs) diff --git a/include/block/block.h b/include/block/block.h index a91f37bedf..f85e3a6ed3 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -418,8 +418,8 @@ AioWait *bdrv_get_aio_wait(BlockDriverState *bs); bdrv_get_aio_context(bs_), \ cond); }) -int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes); -int bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes); +int bdrv_pdiscard(BdrvChild *child, int64_t offset, int bytes); +int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes); int bdrv_has_zero_init_1(BlockDriverState *bs); int bdrv_has_zero_init(BlockDriverState *bs); bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs); From 22931a15336e8b7726965c699981fd108620014b Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:18 +0800 Subject: [PATCH 18/24] block: Use uint64_t for BdrvTrackedRequest byte fields This matches the types used for bytes in the rest parts of block layer. In the case of bdrv_co_truncate, new_bytes can be the image size which probably doesn't fit in a 32 bit int. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 8 +++++--- include/block/block_int.h | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/block/io.c b/block/io.c index fbcd93304b..6293612e73 100644 --- a/block/io.c +++ b/block/io.c @@ -601,9 +601,11 @@ static void tracked_request_end(BdrvTrackedRequest *req) static void tracked_request_begin(BdrvTrackedRequest *req, BlockDriverState *bs, int64_t offset, - unsigned int bytes, + uint64_t bytes, enum BdrvTrackedRequestType type) { + assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes); + *req = (BdrvTrackedRequest){ .bs = bs, .offset = offset, @@ -625,7 +627,7 @@ static void tracked_request_begin(BdrvTrackedRequest *req, static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) { int64_t overlap_offset = req->offset & ~(align - 1); - unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) + uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align) - overlap_offset; if (!req->serialising) { @@ -683,7 +685,7 @@ static int bdrv_get_cluster_size(BlockDriverState *bs) } static bool tracked_request_overlaps(BdrvTrackedRequest *req, - int64_t offset, unsigned int bytes) + int64_t offset, uint64_t bytes) { /* aaaa bbbb */ if (offset >= req->overlap_offset + req->overlap_bytes) { diff --git a/include/block/block_int.h b/include/block/block_int.h index 920d3d122b..903b9c1034 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -69,12 +69,12 @@ enum BdrvTrackedRequestType { typedef struct BdrvTrackedRequest { BlockDriverState *bs; int64_t offset; - unsigned int bytes; + uint64_t bytes; enum BdrvTrackedRequestType type; bool serialising; int64_t overlap_offset; - unsigned int overlap_bytes; + uint64_t overlap_bytes; QLIST_ENTRY(BdrvTrackedRequest) list; Coroutine *co; /* owner, used for deadlock detection */ From 85fe24796ddf70f2b6a5045952826aedffa55ca2 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:19 +0800 Subject: [PATCH 19/24] block: Extract common write req handling As a mechanical refactoring patch, this is the first step towards unified and more correct write code paths. This is helpful because multiple BlockDriverState fields need to be updated after modifying image data, and it's hard to maintain in multiple places such as copy offload, discard and truncate. Suggested-by: Kevin Wolf Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 91 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 34 deletions(-) diff --git a/block/io.c b/block/io.c index 6293612e73..313fe0776c 100644 --- a/block/io.c +++ b/block/io.c @@ -1575,6 +1575,61 @@ fail: return ret; } +static inline int coroutine_fn +bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, + BdrvTrackedRequest *req, int flags) +{ + BlockDriverState *bs = child->bs; + bool waited; + int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); + + if (bs->read_only) { + return -EPERM; + } + + /* BDRV_REQ_NO_SERIALISING is only for read operation */ + assert(!(flags & BDRV_REQ_NO_SERIALISING)); + assert(!(bs->open_flags & BDRV_O_INACTIVE)); + assert((bs->open_flags & BDRV_O_NO_IO) == 0); + assert(!(flags & ~BDRV_REQ_MASK)); + + if (flags & BDRV_REQ_SERIALISING) { + mark_request_serialising(req, bdrv_get_cluster_size(bs)); + } + + waited = wait_serialising_requests(req); + + assert(!waited || !req->serialising || + is_request_serialising_and_aligned(req)); + assert(req->overlap_offset <= offset); + assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); + + if (flags & BDRV_REQ_WRITE_UNCHANGED) { + assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); + } else { + assert(child->perm & BLK_PERM_WRITE); + } + assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); + return notifier_with_return_list_notify(&bs->before_write_notifiers, req); +} + +static inline void coroutine_fn +bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes, + BdrvTrackedRequest *req, int ret) +{ + int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); + BlockDriverState *bs = child->bs; + + atomic_inc(&bs->write_gen); + bdrv_set_dirty(bs, offset, bytes); + + stat64_max(&bs->wr_highest_offset, offset + bytes); + + if (ret == 0) { + bs->total_sectors = MAX(bs->total_sectors, end_sector); + } +} + /* * Forwards an already correctly aligned write request to the BlockDriver, * after possibly fragmenting it. @@ -1585,10 +1640,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, { BlockDriverState *bs = child->bs; BlockDriver *drv = bs->drv; - bool waited; int ret; - int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); uint64_t bytes_remaining = bytes; int max_transfer; @@ -1604,31 +1657,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, assert((offset & (align - 1)) == 0); assert((bytes & (align - 1)) == 0); assert(!qiov || bytes == qiov->size); - assert((bs->open_flags & BDRV_O_NO_IO) == 0); - assert(!(flags & ~BDRV_REQ_MASK)); max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), align); - /* BDRV_REQ_NO_SERIALISING is only for read operation */ - assert(!(flags & BDRV_REQ_NO_SERIALISING)); - - if (flags & BDRV_REQ_SERIALISING) { - mark_request_serialising(req, bdrv_get_cluster_size(bs)); - } - - waited = wait_serialising_requests(req); - assert(!waited || !req->serialising || - is_request_serialising_and_aligned(req)); - assert(req->overlap_offset <= offset); - assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); - if (flags & BDRV_REQ_WRITE_UNCHANGED) { - assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); - } else { - assert(child->perm & BLK_PERM_WRITE); - } - assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); - - ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); + ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags); if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && @@ -1677,15 +1709,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, } bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); - atomic_inc(&bs->write_gen); - bdrv_set_dirty(bs, offset, bytes); - - stat64_max(&bs->wr_highest_offset, offset + bytes); - if (ret >= 0) { - bs->total_sectors = MAX(bs->total_sectors, end_sector); ret = 0; } + bdrv_co_write_req_finish(child, offset, bytes, req, ret); return ret; } @@ -1800,10 +1827,6 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child, if (!bs->drv) { return -ENOMEDIUM; } - if (bs->read_only) { - return -EPERM; - } - assert(!(bs->open_flags & BDRV_O_INACTIVE)); ret = bdrv_check_byte_request(bs, offset, bytes); if (ret < 0) { From 7f8f03ef6d5c82fee1c22be06fc9de4a16ad7059 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:20 +0800 Subject: [PATCH 20/24] block: Fix handling of image enlarging write Two problems exist when a write request that enlarges the image (i.e. write beyond EOF) finishes: 1) parent is not notified about size change; 2) dirty bitmap is not resized although we try to set the dirty bits; Fix them just like how bdrv_co_truncate works. Reported-by: Kevin Wolf Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/block/io.c b/block/io.c index 313fe0776c..c88b2b48d7 100644 --- a/block/io.c +++ b/block/io.c @@ -40,6 +40,7 @@ static AioWait drain_all_aio_wait; +static void bdrv_parent_cb_resize(BlockDriverState *bs); static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, BdrvRequestFlags flags); @@ -1621,13 +1622,16 @@ bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes, BlockDriverState *bs = child->bs; atomic_inc(&bs->write_gen); - bdrv_set_dirty(bs, offset, bytes); stat64_max(&bs->wr_highest_offset, offset + bytes); - if (ret == 0) { - bs->total_sectors = MAX(bs->total_sectors, end_sector); + if (ret == 0 && + end_sector > bs->total_sectors) { + bs->total_sectors = end_sector; + bdrv_parent_cb_resize(bs); + bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS); } + bdrv_set_dirty(bs, offset, bytes); } /* From 00695c27a00a4b3333604aeac50c43269cc151a3 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:21 +0800 Subject: [PATCH 21/24] block: Use common req handling for discard Reuse the new bdrv_co_write_req_prepare/finish helpers. The variation here is that discard requests don't affect bs->wr_highest_offset, and it cannot extend the image. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/block/io.c b/block/io.c index c88b2b48d7..72cfd9bd16 100644 --- a/block/io.c +++ b/block/io.c @@ -1623,15 +1623,32 @@ bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes, atomic_inc(&bs->write_gen); - stat64_max(&bs->wr_highest_offset, offset + bytes); - + /* + * Discard cannot extend the image, but in error handling cases, such as + * when reverting a qcow2 cluster allocation, the discarded range can pass + * the end of image file, so we cannot assert about BDRV_TRACKED_DISCARD + * here. Instead, just skip it, since semantically a discard request + * beyond EOF cannot expand the image anyway. + */ if (ret == 0 && - end_sector > bs->total_sectors) { + end_sector > bs->total_sectors && + req->type != BDRV_TRACKED_DISCARD) { bs->total_sectors = end_sector; bdrv_parent_cb_resize(bs); bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS); } - bdrv_set_dirty(bs, offset, bytes); + if (req->bytes) { + switch (req->type) { + case BDRV_TRACKED_WRITE: + stat64_max(&bs->wr_highest_offset, offset + bytes); + /* fall through, to set dirty bits */ + case BDRV_TRACKED_DISCARD: + bdrv_set_dirty(bs, offset, bytes); + break; + default: + break; + } + } } /* @@ -2692,10 +2709,7 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes) ret = bdrv_check_byte_request(bs, offset, bytes); if (ret < 0) { return ret; - } else if (bs->read_only) { - return -EPERM; } - assert(!(bs->open_flags & BDRV_O_INACTIVE)); /* Do nothing if disabled. */ if (!(bs->open_flags & BDRV_O_UNMAP)) { @@ -2719,7 +2733,7 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes) bdrv_inc_in_flight(bs); tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); - ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); + ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0); if (ret < 0) { goto out; } @@ -2785,8 +2799,7 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes) } ret = 0; out: - atomic_inc(&bs->write_gen); - bdrv_set_dirty(bs, req.offset, req.bytes); + bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret); tracked_request_end(&req); bdrv_dec_in_flight(bs); return ret; From 0eb1e891126f0cde52e88384696ad67076bdc341 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:22 +0800 Subject: [PATCH 22/24] block: Use common req handling in copy offloading This brings the request handling logic inline with write and discard, fixing write_gen, resize_cb, dirty bitmaps and image size refreshing. The last of these issues broke iotest case 222, which is now fixed. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/block/io.c b/block/io.c index 72cfd9bd16..2832214db4 100644 --- a/block/io.c +++ b/block/io.c @@ -3030,20 +3030,16 @@ static int coroutine_fn bdrv_co_copy_range_internal( bdrv_inc_in_flight(dst->bs); tracked_request_begin(&req, dst->bs, dst_offset, bytes, BDRV_TRACKED_WRITE); - - /* BDRV_REQ_NO_SERIALISING is only for read operation */ - assert(!(write_flags & BDRV_REQ_NO_SERIALISING)); - if (write_flags & BDRV_REQ_SERIALISING) { - mark_request_serialising(&req, bdrv_get_cluster_size(dst->bs)); + ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req, + write_flags); + if (!ret) { + ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, + src, src_offset, + dst, dst_offset, + bytes, + read_flags, write_flags); } - wait_serialising_requests(&req); - - ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, - src, src_offset, - dst, dst_offset, - bytes, - read_flags, write_flags); - + bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret); tracked_request_end(&req); bdrv_dec_in_flight(dst->bs); } From 5416a11eb55d46c376dde97c429b89e9b4e1a94f Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:23 +0800 Subject: [PATCH 23/24] block: Fix bdrv_co_truncate overlap check If we are growing the image and potentially using preallocation for the new area, we need to make sure that no write requests are made to the "preallocated" area which is [@old_size, @offset), not [@offset, offset * 2 - @old_size). Signed-off-by: Fam Zheng Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- block/io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/io.c b/block/io.c index 2832214db4..77d38ca1d3 100644 --- a/block/io.c +++ b/block/io.c @@ -3136,7 +3136,8 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, } bdrv_inc_in_flight(bs); - tracked_request_begin(&req, bs, offset, new_bytes, BDRV_TRACKED_TRUNCATE); + tracked_request_begin(&req, bs, offset - new_bytes, new_bytes, + BDRV_TRACKED_TRUNCATE); /* If we are growing the image and potentially using preallocation for the * new area, we need to make sure that no write requests are made to it From cd47d792d7a27a57f4b621e2ff1ed8f4e83de1e9 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:24 +0800 Subject: [PATCH 24/24] block: Use common write req handling in truncate Truncation is the last to convert from open coded req handling to reusing helpers. This time the permission check in prepare has to adapt to the new caller: it checks a different permission bit, and doesn't trigger the before write notifier. Also, truncation should always trigger a bs->total_sectors update and in turn call parent resize_cb. Update the condition in finish helper, too. It's intended to do a duplicated bs->read_only check before calling bdrv_co_write_req_prepare() so that we can be more informative with the error message, as bdrv_co_write_req_prepare() doesn't have Error parameter. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 57 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/block/io.c b/block/io.c index 77d38ca1d3..7100344c7b 100644 --- a/block/io.c +++ b/block/io.c @@ -1604,14 +1604,24 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, is_request_serialising_and_aligned(req)); assert(req->overlap_offset <= offset); assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); - - if (flags & BDRV_REQ_WRITE_UNCHANGED) { - assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); - } else { - assert(child->perm & BLK_PERM_WRITE); - } assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); - return notifier_with_return_list_notify(&bs->before_write_notifiers, req); + + switch (req->type) { + case BDRV_TRACKED_WRITE: + case BDRV_TRACKED_DISCARD: + if (flags & BDRV_REQ_WRITE_UNCHANGED) { + assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); + } else { + assert(child->perm & BLK_PERM_WRITE); + } + return notifier_with_return_list_notify(&bs->before_write_notifiers, + req); + case BDRV_TRACKED_TRUNCATE: + assert(child->perm & BLK_PERM_RESIZE); + return 0; + default: + abort(); + } } static inline void coroutine_fn @@ -1631,8 +1641,9 @@ bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes, * beyond EOF cannot expand the image anyway. */ if (ret == 0 && - end_sector > bs->total_sectors && - req->type != BDRV_TRACKED_DISCARD) { + (req->type == BDRV_TRACKED_TRUNCATE || + end_sector > bs->total_sectors) && + req->type != BDRV_TRACKED_DISCARD) { bs->total_sectors = end_sector; bdrv_parent_cb_resize(bs); bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS); @@ -3111,7 +3122,6 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, int64_t old_size, new_bytes; int ret; - assert(child->perm & BLK_PERM_RESIZE); /* if bs->drv == NULL, bs is closed, so there's nothing to do here */ if (!drv) { @@ -3144,7 +3154,18 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, * concurrently or they might be overwritten by preallocation. */ if (new_bytes) { mark_request_serialising(&req, 1); - wait_serialising_requests(&req); + } + if (bs->read_only) { + error_setg(errp, "Image is read-only"); + ret = -EACCES; + goto out; + } + ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req, + 0); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Failed to prepare request for truncation"); + goto out; } if (!drv->bdrv_co_truncate) { @@ -3156,13 +3177,6 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, ret = -ENOTSUP; goto out; } - if (bs->read_only) { - error_setg(errp, "Image is read-only"); - ret = -EACCES; - goto out; - } - - assert(!(bs->open_flags & BDRV_O_INACTIVE)); ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp); if (ret < 0) { @@ -3174,9 +3188,10 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, } else { offset = bs->total_sectors * BDRV_SECTOR_SIZE; } - bdrv_dirty_bitmap_truncate(bs, offset); - bdrv_parent_cb_resize(bs); - atomic_inc(&bs->write_gen); + /* It's possible that truncation succeeded but refresh_total_sectors + * failed, but the latter doesn't affect how we should finish the request. + * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */ + bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0); out: tracked_request_end(&req);