From d35ff5e6b3aa3a706b0aa3bcf11400fac945b67a Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 4 Apr 2017 17:29:03 +0200 Subject: [PATCH 01/10] block: Ignore guest dev permissions during incoming migration Usually guest devices don't like other writers to the same image, so they use blk_set_perm() to prevent this from happening. In the migration phase before the VM is actually running, though, they don't have a problem with writes to the image. On the other hand, storage migration needs to be able to write to the image in this phase, so the restrictive blk_set_perm() call of qdev devices breaks it. This patch flags all BlockBackends with a qdev device as blk->disable_perm during incoming migration, which means that the requested permissions are stored in the BlockBackend, but not actually applied to its root node yet. Once migration has finished and the VM should be resumed, the permissions are applied. If they cannot be applied (e.g. because the NBD server used for block migration hasn't been shut down), resuming the VM fails. Signed-off-by: Kevin Wolf Tested-by: Kashyap Chamarthy --- block/block-backend.c | 40 +++++++++++++++++++++++++++++++++++++++- include/block/block.h | 2 ++ migration/migration.c | 8 ++++++++ qmp.c | 6 ++++++ 4 files changed, 55 insertions(+), 1 deletion(-) diff --git a/block/block-backend.c b/block/block-backend.c index 0b6377332c..18ece99c6e 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -61,6 +61,7 @@ struct BlockBackend { uint64_t perm; uint64_t shared_perm; + bool disable_perm; bool allow_write_beyond_eof; @@ -578,7 +579,7 @@ int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm, { int ret; - if (blk->root) { + if (blk->root && !blk->disable_perm) { ret = bdrv_child_try_set_perm(blk->root, perm, shared_perm, errp); if (ret < 0) { return ret; @@ -597,15 +598,52 @@ void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm) *shared_perm = blk->shared_perm; } +/* + * Notifies the user of all BlockBackends that migration has completed. qdev + * devices can tighten their permissions in response (specifically revoke + * shared write permissions that we needed for storage migration). + * + * If an error is returned, the VM cannot be allowed to be resumed. + */ +void blk_resume_after_migration(Error **errp) +{ + BlockBackend *blk; + Error *local_err = NULL; + + for (blk = blk_all_next(NULL); blk; blk = blk_all_next(blk)) { + if (!blk->disable_perm) { + continue; + } + + blk->disable_perm = false; + + blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err); + if (local_err) { + error_propagate(errp, local_err); + blk->disable_perm = true; + return; + } + } +} + static int blk_do_attach_dev(BlockBackend *blk, void *dev) { if (blk->dev) { return -EBUSY; } + + /* While migration is still incoming, we don't need to apply the + * permissions of guest device BlockBackends. We might still have a block + * job or NBD server writing to the image for storage migration. */ + if (runstate_check(RUN_STATE_INMIGRATE)) { + blk->disable_perm = true; + } + blk_ref(blk); blk->dev = dev; blk->legacy_dev = false; blk_iostatus_reset(blk); + return 0; } diff --git a/include/block/block.h b/include/block/block.h index 5149260827..3e09222f5f 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -366,6 +366,8 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp); void bdrv_invalidate_cache_all(Error **errp); int bdrv_inactivate_all(void); +void blk_resume_after_migration(Error **errp); + /* Ensure contents are flushed to disk. */ int bdrv_flush(BlockDriverState *bs); int coroutine_fn bdrv_co_flush(BlockDriverState *bs); diff --git a/migration/migration.c b/migration/migration.c index 54060f749a..ad4036fdb1 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -349,6 +349,14 @@ static void process_incoming_migration_bh(void *opaque) exit(EXIT_FAILURE); } + /* If we get an error here, just don't restart the VM yet. */ + blk_resume_after_migration(&local_err); + if (local_err) { + error_free(local_err); + local_err = NULL; + autostart = false; + } + /* * This must happen after all error conditions are dealt with and * we're sure the VM is going to be running on this host. diff --git a/qmp.c b/qmp.c index fa82b598c6..a744e44ac6 100644 --- a/qmp.c +++ b/qmp.c @@ -207,6 +207,12 @@ void qmp_cont(Error **errp) } } + blk_resume_after_migration(&local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + if (runstate_check(RUN_STATE_INMIGRATE)) { autostart = 1; } else { From 02be4aeb9343d9a4993b1de6cebe6280e82d7254 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 6 Apr 2017 19:05:07 +0200 Subject: [PATCH 02/10] commit: Set commit_top_bs->aio_context The filter driver that is inserted by the commit job needs to use the same AioContext as its parent and child nodes. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Fam Zheng --- block/commit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/commit.c b/block/commit.c index 28324820a4..4c3822080e 100644 --- a/block/commit.c +++ b/block/commit.c @@ -335,6 +335,7 @@ void commit_start(const char *job_id, BlockDriverState *bs, if (commit_top_bs == NULL) { goto fail; } + bdrv_set_aio_context(commit_top_bs, bdrv_get_aio_context(top)); bdrv_set_backing_hd(commit_top_bs, top, &local_err); if (local_err) { @@ -482,6 +483,7 @@ int bdrv_commit(BlockDriverState *bs) error_report_err(local_err); goto ro_cleanup; } + bdrv_set_aio_context(commit_top_bs, bdrv_get_aio_context(backing_file_bs)); bdrv_set_backing_hd(commit_top_bs, backing_file_bs, &error_abort); bdrv_set_backing_hd(bs, commit_top_bs, &error_abort); From 0d0676a1040d34339731a4e26a9b39b78c8a1fdf Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 6 Apr 2017 19:07:14 +0200 Subject: [PATCH 03/10] commit: Set commit_top_bs->total_sectors Like in the mirror filter driver, we also need to set the image size for the commit filter driver. This is less likely to be a problem in practice than for the mirror because we're not at the active layer here, but attaching new parents to a node in the middle of the chain is possible, so the size needs to be correct anyway. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Fam Zheng --- block/commit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/commit.c b/block/commit.c index 4c3822080e..91d2c344f6 100644 --- a/block/commit.c +++ b/block/commit.c @@ -335,6 +335,7 @@ void commit_start(const char *job_id, BlockDriverState *bs, if (commit_top_bs == NULL) { goto fail; } + commit_top_bs->total_sectors = top->total_sectors; bdrv_set_aio_context(commit_top_bs, bdrv_get_aio_context(top)); bdrv_set_backing_hd(commit_top_bs, top, &local_err); From 7a25fcd056ddd34ee7abc906c957d252e2889461 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 3 Apr 2017 19:51:49 +0200 Subject: [PATCH 04/10] block/mirror: Fix use-after-free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If @bs does not have any parents, the only reference to @mirror_top_bs will be held by the BlockJob object after the bdrv_unref() following block_job_create(). However, if block_job_create() fails, this reference will not exist and @mirror_top_bs will have been deleted when we goto fail. The issue comes back at all later entries to the fail label: We delete the BlockJob object before rolling back our changes to the node graph. This means that we will delete @mirror_top_bs in the process. All in all, whenever @bs does not have any parents and we go down the fail path we will dereference @mirror_top_bs after it has been deleted. Fix this by invoking bdrv_unref() only when block_job_create() was successful and by bdrv_ref()'ing @mirror_top_bs in the fail path before deleting the BlockJob object. Finally, bdrv_unref() it at the end of the fail path after we actually no longer need it. Signed-off-by: Max Reitz Reviewed-by: John Snow Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Kevin Wolf --- block/mirror.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 9e2fecc15e..46ecd38ef0 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1150,7 +1150,7 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs, mirror_top_bs->total_sectors = bs->total_sectors; /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep - * it alive until block_job_create() even if bs has no parent. */ + * it alive until block_job_create() succeeds even if bs has no parent. */ bdrv_ref(mirror_top_bs); bdrv_drained_begin(bs); bdrv_append(mirror_top_bs, bs, &local_err); @@ -1168,10 +1168,12 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD, speed, creation_flags, cb, opaque, errp); - bdrv_unref(mirror_top_bs); if (!s) { goto fail; } + /* The block job now has a reference to this node */ + bdrv_unref(mirror_top_bs); + s->source = bs; s->mirror_top_bs = mirror_top_bs; @@ -1242,6 +1244,10 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs, fail: if (s) { + /* Make sure this BDS does not go away until we have completed the graph + * changes below */ + bdrv_ref(mirror_top_bs); + g_free(s->replaces); blk_unref(s->target); block_job_unref(&s->common); @@ -1250,6 +1256,8 @@ fail: bdrv_child_try_set_perm(mirror_top_bs->backing, 0, BLK_PERM_ALL, &error_abort); bdrv_replace_node(mirror_top_bs, backing_bs(mirror_top_bs), &error_abort); + + bdrv_unref(mirror_top_bs); } void mirror_start(const char *job_id, BlockDriverState *bs, From 5694923ad1c60a86383244793663c0715bd88e83 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 3 Apr 2017 19:51:50 +0200 Subject: [PATCH 05/10] iotests: Add mirror tests for orphaned source Signed-off-by: Max Reitz Reviewed-by: John Snow Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- tests/qemu-iotests/041 | 46 +++++++++++++++++++++++++++++++++++ tests/qemu-iotests/041.out | 4 +-- tests/qemu-iotests/iotests.py | 15 ++++++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 index bc6cf782fe..2f54986434 100755 --- a/tests/qemu-iotests/041 +++ b/tests/qemu-iotests/041 @@ -966,5 +966,51 @@ class TestRepairQuorum(iotests.QMPTestCase): # to check that this file is really driven by quorum self.vm.shutdown() +# Test mirroring with a source that does not have any parents (not even a +# BlockBackend) +class TestOrphanedSource(iotests.QMPTestCase): + def setUp(self): + blk0 = { 'node-name': 'src', + 'driver': 'null-co' } + + blk1 = { 'node-name': 'dest', + 'driver': 'null-co' } + + blk2 = { 'node-name': 'dest-ro', + 'driver': 'null-co', + 'read-only': 'on' } + + self.vm = iotests.VM() + self.vm.add_blockdev(self.qmp_to_opts(blk0)) + self.vm.add_blockdev(self.qmp_to_opts(blk1)) + self.vm.add_blockdev(self.qmp_to_opts(blk2)) + self.vm.launch() + + def tearDown(self): + self.vm.shutdown() + + def test_no_job_id(self): + self.assert_no_active_block_jobs() + + result = self.vm.qmp('blockdev-mirror', device='src', sync='full', + target='dest') + self.assert_qmp(result, 'error/class', 'GenericError') + + def test_success(self): + self.assert_no_active_block_jobs() + + result = self.vm.qmp('blockdev-mirror', job_id='job', device='src', + sync='full', target='dest') + self.assert_qmp(result, 'return', {}) + + self.complete_and_wait('job') + + def test_failing_permissions(self): + self.assert_no_active_block_jobs() + + result = self.vm.qmp('blockdev-mirror', device='src', sync='full', + target='dest-ro') + self.assert_qmp(result, 'error/class', 'GenericError') + if __name__ == '__main__': iotests.main(supported_fmts=['qcow2', 'qed']) diff --git a/tests/qemu-iotests/041.out b/tests/qemu-iotests/041.out index b67d0504a6..e30fd3b05b 100644 --- a/tests/qemu-iotests/041.out +++ b/tests/qemu-iotests/041.out @@ -1,5 +1,5 @@ -............................................................................ +............................................................................... ---------------------------------------------------------------------- -Ran 76 tests +Ran 79 tests OK diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index bec8eb4b8d..abcf3c10e2 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -177,6 +177,14 @@ class VM(qtest.QEMUQtestMachine): self._num_drives += 1 return self + def add_blockdev(self, opts): + self._args.append('-blockdev') + if isinstance(opts, str): + self._args.append(opts) + else: + self._args.append(','.join(opts)) + return self + def pause_drive(self, drive, event=None): '''Pause drive r/w operations''' if not event: @@ -235,6 +243,13 @@ class QMPTestCase(unittest.TestCase): output[basestr[:-1]] = obj # Strip trailing '.' return output + def qmp_to_opts(self, obj): + obj = self.flatten_qmp_object(obj) + output_list = list() + for key in obj: + output_list += [key + '=' + obj[key]] + return ','.join(output_list) + def assert_qmp_absent(self, d, path): try: result = self.dictpath(d, path) From 89aa0465b9135d70682420918b28b6e38a0f6322 Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Thu, 6 Apr 2017 13:45:42 -0400 Subject: [PATCH 06/10] qemu-img: img_create does not support image-opts, fix docs The documentation and help for qemu-img claims that 'qemu-img create' will take the '--image-opts' argument. This is not true, so this patch removes those claims. Signed-off-by: Jeff Cody Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- qemu-img-cmds.hx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index 9c9702cc62..8ac78222af 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -22,9 +22,9 @@ STEXI ETEXI DEF("create", img_create, - "create [-q] [--object objectdef] [--image-opts] [-f fmt] [-o options] filename [size]") + "create [-q] [--object objectdef] [-f fmt] [-o options] filename [size]") STEXI -@item create [--object @var{objectdef}] [--image-opts] [-q] [-f @var{fmt}] [-o @var{options}] @var{filename} [@var{size}] +@item create [--object @var{objectdef}] [-q] [-f @var{fmt}] [-o @var{options}] @var{filename} [@var{size}] ETEXI DEF("commit", img_commit, From 1bf03e66fd03af46ff0f98dd04b6e28f432ac1e3 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 7 Apr 2017 12:29:05 +0200 Subject: [PATCH 07/10] block: Don't check permissions for copy on read The assertion is currently failing. We can't require callers to have write permissions when all they are doing is a read, so comment it out. Add a FIXME comment in the code so that the check is re-enabled when copy on read is refactored into its own filter driver. Reported-by: Richard W.M. Jones Signed-off-by: Kevin Wolf Reviewed-by: Richard W.M. Jones --- block/io.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/io.c b/block/io.c index 2709a7007f..7321ddab3d 100644 --- a/block/io.c +++ b/block/io.c @@ -945,7 +945,14 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, size_t skip_bytes; int ret; - assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); + /* FIXME We cannot require callers to have write permissions when all they + * are doing is a read request. If we did things right, write permissions + * would be obtained anyway, but internally by the copy-on-read code. As + * long as it is implemented here rather than in a separat filter driver, + * the copy-on-read code doesn't have its own BdrvChild, however, for which + * it could request permissions. Therefore we have to bypass the permission + * system for the moment. */ + // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); /* Cover entire cluster so no additional backing file I/O is required when * allocating cluster in the image file. From c26a5ab71338a53340257233bd172bbe22c06b16 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Fri, 7 Apr 2017 14:54:09 +0800 Subject: [PATCH 08/10] block: Fix unpaired aio_disable_external in external snapshot bdrv_replace_child_noperm tries to hand over the quiesce_counter state from old bs to the new one, but if they are not on the same aio context this causes unbalance. Fix this by setting the correct aio context before calling bdrv_append(). Reported-by: Ed Swierk Reviewed-by: Eric Blake Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- blockdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/blockdev.c b/blockdev.c index 040c152512..4927914ce3 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1772,6 +1772,8 @@ static void external_snapshot_prepare(BlkActionState *common, return; } + bdrv_set_aio_context(state->new_bs, state->aio_context); + /* This removes our old bs and adds the new bs. This is an operation that * can fail, so we need to do it in .prepare; undoing it for abort is * always possible. */ @@ -1789,8 +1791,6 @@ static void external_snapshot_commit(BlkActionState *common) ExternalSnapshotState *state = DO_UPCAST(ExternalSnapshotState, common, common); - bdrv_set_aio_context(state->new_bs, state->aio_context); - /* We don't need (or want) to use the transactional * bdrv_reopen_multiple() across all the entries at once, because we * don't want to abort all of them if one of them fails the reopen */ From bb2614e991d88166fccf4a26c7180ee39f39a48d Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Fri, 7 Apr 2017 14:54:10 +0800 Subject: [PATCH 09/10] block: Assert attached child node has right aio context Suggested-by: Kevin Wolf Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block.c b/block.c index 927ba89eb7..b8a30115e1 100644 --- a/block.c +++ b/block.c @@ -1752,6 +1752,9 @@ static void bdrv_replace_child_noperm(BdrvChild *child, { BlockDriverState *old_bs = child->bs; + if (old_bs && new_bs) { + assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs)); + } if (old_bs) { if (old_bs->quiesce_counter && child->role->drained_end) { child->role->drained_end(child); @@ -1852,6 +1855,7 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, bdrv_get_cumulative_perm(parent_bs, &perm, &shared_perm); assert(parent_bs->drv); + assert(bdrv_get_aio_context(parent_bs) == bdrv_get_aio_context(child_bs)); parent_bs->drv->bdrv_child_perm(parent_bs, NULL, child_role, perm, shared_perm, &perm, &shared_perm); From 19dd29e8a77cd820515de5289f566508e0ed4926 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Fri, 7 Apr 2017 14:54:11 +0800 Subject: [PATCH 10/10] mirror: Fix aio context of mirror_top_bs It should be moved to the same context as source, before inserting to the graph. Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/mirror.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/mirror.c b/block/mirror.c index 46ecd38ef0..164438f422 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1148,6 +1148,7 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs, return; } mirror_top_bs->total_sectors = bs->total_sectors; + bdrv_set_aio_context(mirror_top_bs, bdrv_get_aio_context(bs)); /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep * it alive until block_job_create() succeeds even if bs has no parent. */