From e24154d878c11688202628755f8a47e3cba0f52a Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Thu, 5 Aug 2021 16:36:03 +0200 Subject: [PATCH 01/32] gluster: Align block-status tail gluster's block-status implementation is basically a copy of that in block/file-posix.c, there is only one thing missing, and that is aligning trailing data extents to the request alignment (as added by commit 9c3db310ff0). Note that 9c3db310ff0 mentions that "there seems to be no other block driver that sets request_alignment and [...]", but while block/gluster.c does indeed not set request_alignment, block/io.c's bdrv_refresh_limits() will still default to an alignment of 512 because block/gluster.c does not provide a byte-aligned read function. Therefore, unaligned tails can conceivably occur, and so we should apply the change from 9c3db310ff0 to gluster's block-status implementation. Reported-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Max Reitz Message-Id: <20210805143603.59503-1-mreitz@redhat.com> Reviewed-by: Eric Blake Signed-off-by: Hanna Reitz --- block/gluster.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/block/gluster.c b/block/gluster.c index e8ee14c8e9..48a04417cf 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -1477,6 +1477,8 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs, off_t data = 0, hole = 0; int ret = -EINVAL; + assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment)); + if (!s->fd) { return ret; } @@ -1501,6 +1503,20 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs, /* On a data extent, compute bytes to the end of the extent, * possibly including a partial sector at EOF. */ *pnum = MIN(bytes, hole - offset); + + /* + * We are not allowed to return partial sectors, though, so + * round up if necessary. + */ + if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) { + int64_t file_length = qemu_gluster_getlength(bs); + if (file_length > 0) { + /* Ignore errors, this is just a safeguard */ + assert(hole == file_length); + } + *pnum = ROUND_UP(*pnum, bs->bl.request_alignment); + } + ret = BDRV_BLOCK_DATA; } else { /* On a hole, compute bytes to the beginning of the next extent. */ From 33ff4c9e081acdded2c90c8c1963f3403e16082b Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Thu, 12 Aug 2021 10:41:43 +0200 Subject: [PATCH 02/32] block: Drop BDS comment regarding bdrv_append() There is a comment above the BDS definition stating care must be taken to consider handling newly added fields in bdrv_append(). Actually, this comment should have said "bdrv_swap()" as of 4ddc07cac (nine years ago), and in any case, bdrv_swap() was dropped in 8e419aefa (six years ago). So no such care is necessary anymore. Signed-off-by: Hanna Reitz Reviewed-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Kevin Wolf Message-Id: <20210812084148.14458-2-hreitz@redhat.com> --- include/block/block_int.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/block/block_int.h b/include/block/block_int.h index f1a54db0f8..12e5750fe8 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -839,12 +839,6 @@ struct BdrvChild { QLIST_ENTRY(BdrvChild) next_parent; }; -/* - * Note: the function bdrv_append() copies and swaps contents of - * BlockDriverStates, so if you add new fields to this struct, please - * inspect bdrv_append() to determine if the new fields need to be - * copied as well. - */ struct BlockDriverState { /* Protected by big QEMU lock or read-only after opening. No special * locking needed during I/O... From 0bc329fbb009f8601cec23bf2bc48ead0c5a5fa2 Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Thu, 12 Aug 2021 10:41:44 +0200 Subject: [PATCH 03/32] block: block-status cache for data regions As we have attempted before (https://lists.gnu.org/archive/html/qemu-devel/2019-01/msg06451.html, "file-posix: Cache lseek result for data regions"; https://lists.nongnu.org/archive/html/qemu-block/2021-02/msg00934.html, "file-posix: Cache next hole"), this patch seeks to reduce the number of SEEK_DATA/HOLE operations the file-posix driver has to perform. The main difference is that this time it is implemented as part of the general block layer code. The problem we face is that on some filesystems or in some circumstances, SEEK_DATA/HOLE is unreasonably slow. Given the implementation is outside of qemu, there is little we can do about its performance. We have already introduced the want_zero parameter to bdrv_co_block_status() to reduce the number of SEEK_DATA/HOLE calls unless we really want zero information; but sometimes we do want that information, because for files that consist largely of zero areas, special-casing those areas can give large performance boosts. So the real problem is with files that consist largely of data, so that inquiring the block status does not gain us much performance, but where such an inquiry itself takes a lot of time. To address this, we want to cache data regions. Most of the time, when bad performance is reported, it is in places where the image is iterated over from start to end (qemu-img convert or the mirror job), so a simple yet effective solution is to cache only the current data region. (Note that only caching data regions but not zero regions means that returning false information from the cache is not catastrophic: Treating zeroes as data is fine. While we try to invalidate the cache on zero writes and discards, such incongruences may still occur when there are other processes writing to the image.) We only use the cache for nodes without children (i.e. protocol nodes), because that is where the problem is: Drivers that rely on block-status implementations outside of qemu (e.g. SEEK_DATA/HOLE). Resolves: https://gitlab.com/qemu-project/qemu/-/issues/307 Signed-off-by: Hanna Reitz Message-Id: <20210812084148.14458-3-hreitz@redhat.com> Reviewed-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy [hreitz: Added `local_file == bs` assertion, as suggested by Vladimir] Signed-off-by: Hanna Reitz --- block.c | 80 +++++++++++++++++++++++++++++++++++++++ block/io.c | 68 +++++++++++++++++++++++++++++++-- include/block/block_int.h | 50 ++++++++++++++++++++++++ 3 files changed, 195 insertions(+), 3 deletions(-) diff --git a/block.c b/block.c index b2b66263f9..00982edf45 100644 --- a/block.c +++ b/block.c @@ -49,6 +49,8 @@ #include "qemu/timer.h" #include "qemu/cutils.h" #include "qemu/id.h" +#include "qemu/range.h" +#include "qemu/rcu.h" #include "block/coroutines.h" #ifdef CONFIG_BSD @@ -401,6 +403,9 @@ BlockDriverState *bdrv_new(void) qemu_co_queue_init(&bs->flush_queue); + qemu_co_mutex_init(&bs->bsc_modify_lock); + bs->block_status_cache = g_new0(BdrvBlockStatusCache, 1); + for (i = 0; i < bdrv_drain_all_count; i++) { bdrv_drained_begin(bs); } @@ -4694,6 +4699,8 @@ static void bdrv_close(BlockDriverState *bs) bs->explicit_options = NULL; qobject_unref(bs->full_open_options); bs->full_open_options = NULL; + g_free(bs->block_status_cache); + bs->block_status_cache = NULL; bdrv_release_named_dirty_bitmaps(bs); assert(QLIST_EMPTY(&bs->dirty_bitmaps)); @@ -7684,3 +7691,76 @@ BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs) { return bdrv_skip_filters(bdrv_cow_bs(bdrv_skip_filters(bs))); } + +/** + * Check whether [offset, offset + bytes) overlaps with the cached + * block-status data region. + * + * If so, and @pnum is not NULL, set *pnum to `bsc.data_end - offset`, + * which is what bdrv_bsc_is_data()'s interface needs. + * Otherwise, *pnum is not touched. + */ +static bool bdrv_bsc_range_overlaps_locked(BlockDriverState *bs, + int64_t offset, int64_t bytes, + int64_t *pnum) +{ + BdrvBlockStatusCache *bsc = qatomic_rcu_read(&bs->block_status_cache); + bool overlaps; + + overlaps = + qatomic_read(&bsc->valid) && + ranges_overlap(offset, bytes, bsc->data_start, + bsc->data_end - bsc->data_start); + + if (overlaps && pnum) { + *pnum = bsc->data_end - offset; + } + + return overlaps; +} + +/** + * See block_int.h for this function's documentation. + */ +bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum) +{ + RCU_READ_LOCK_GUARD(); + + return bdrv_bsc_range_overlaps_locked(bs, offset, 1, pnum); +} + +/** + * See block_int.h for this function's documentation. + */ +void bdrv_bsc_invalidate_range(BlockDriverState *bs, + int64_t offset, int64_t bytes) +{ + RCU_READ_LOCK_GUARD(); + + if (bdrv_bsc_range_overlaps_locked(bs, offset, bytes, NULL)) { + qatomic_set(&bs->block_status_cache->valid, false); + } +} + +/** + * See block_int.h for this function's documentation. + */ +void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes) +{ + BdrvBlockStatusCache *new_bsc = g_new(BdrvBlockStatusCache, 1); + BdrvBlockStatusCache *old_bsc; + + *new_bsc = (BdrvBlockStatusCache) { + .valid = true, + .data_start = offset, + .data_end = offset + bytes, + }; + + QEMU_LOCK_GUARD(&bs->bsc_modify_lock); + + old_bsc = qatomic_rcu_read(&bs->block_status_cache); + qatomic_rcu_set(&bs->block_status_cache, new_bsc); + if (old_bsc) { + g_free_rcu(old_bsc, rcu); + } +} diff --git a/block/io.c b/block/io.c index a19942718b..99ee182ca4 100644 --- a/block/io.c +++ b/block/io.c @@ -1883,6 +1883,9 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, return -ENOTSUP; } + /* Invalidate the cached block-status data range if this write overlaps */ + bdrv_bsc_invalidate_range(bs, offset, bytes); + assert(alignment % bs->bl.request_alignment == 0); head = offset % alignment; tail = (offset + bytes) % alignment; @@ -2447,9 +2450,65 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; if (bs->drv->bdrv_co_block_status) { - ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, - aligned_bytes, pnum, &local_map, - &local_file); + /* + * Use the block-status cache only for protocol nodes: Format + * drivers are generally quick to inquire the status, but protocol + * drivers often need to get information from outside of qemu, so + * we do not have control over the actual implementation. There + * have been cases where inquiring the status took an unreasonably + * long time, and we can do nothing in qemu to fix it. + * This is especially problematic for images with large data areas, + * because finding the few holes in them and giving them special + * treatment does not gain much performance. Therefore, we try to + * cache the last-identified data region. + * + * Second, limiting ourselves to protocol nodes allows us to assume + * the block status for data regions to be DATA | OFFSET_VALID, and + * that the host offset is the same as the guest offset. + * + * Note that it is possible that external writers zero parts of + * the cached regions without the cache being invalidated, and so + * we may report zeroes as data. This is not catastrophic, + * however, because reporting zeroes as data is fine. + */ + if (QLIST_EMPTY(&bs->children) && + bdrv_bsc_is_data(bs, aligned_offset, pnum)) + { + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; + local_file = bs; + local_map = aligned_offset; + } else { + ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, + aligned_bytes, pnum, &local_map, + &local_file); + + /* + * Note that checking QLIST_EMPTY(&bs->children) is also done when + * the cache is queried above. Technically, we do not need to check + * it here; the worst that can happen is that we fill the cache for + * non-protocol nodes, and then it is never used. However, filling + * the cache requires an RCU update, so double check here to avoid + * such an update if possible. + */ + if (ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) && + QLIST_EMPTY(&bs->children)) + { + /* + * When a protocol driver reports BLOCK_OFFSET_VALID, the + * returned local_map value must be the same as the offset we + * have passed (aligned_offset), and local_bs must be the node + * itself. + * Assert this, because we follow this rule when reading from + * the cache (see the `local_file = bs` and + * `local_map = aligned_offset` assignments above), and the + * result the cache delivers must be the same as the driver + * would deliver. + */ + assert(local_file == bs); + assert(local_map == aligned_offset); + bdrv_bsc_fill(bs, aligned_offset, *pnum); + } + } } else { /* Default code for filters */ @@ -3002,6 +3061,9 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, return 0; } + /* Invalidate the cached block-status data range if this discard overlaps */ + bdrv_bsc_invalidate_range(bs, offset, bytes); + /* Discard is advisory, but some devices track and coalesce * unaligned requests, so we must pass everything down rather than * round here. Still, most devices will just silently ignore diff --git a/include/block/block_int.h b/include/block/block_int.h index 12e5750fe8..437d746733 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -34,6 +34,7 @@ #include "qemu/hbitmap.h" #include "block/snapshot.h" #include "qemu/throttle.h" +#include "qemu/rcu.h" #define BLOCK_FLAG_LAZY_REFCOUNTS 8 @@ -839,6 +840,24 @@ struct BdrvChild { QLIST_ENTRY(BdrvChild) next_parent; }; +/* + * Allows bdrv_co_block_status() to cache one data region for a + * protocol node. + * + * @valid: Whether the cache is valid (should be accessed with atomic + * functions so this can be reset by RCU readers) + * @data_start: Offset where we know (or strongly assume) is data + * @data_end: Offset where the data region ends (which is not necessarily + * the start of a zeroed region) + */ +typedef struct BdrvBlockStatusCache { + struct rcu_head rcu; + + bool valid; + int64_t data_start; + int64_t data_end; +} BdrvBlockStatusCache; + struct BlockDriverState { /* Protected by big QEMU lock or read-only after opening. No special * locking needed during I/O... @@ -1004,6 +1023,11 @@ struct BlockDriverState { /* BdrvChild links to this node may never be frozen */ bool never_freeze; + + /* Lock for block-status cache RCU writers */ + CoMutex bsc_modify_lock; + /* Always non-NULL, but must only be dereferenced under an RCU read guard */ + BdrvBlockStatusCache *block_status_cache; }; struct BlockBackendRootState { @@ -1429,4 +1453,30 @@ static inline BlockDriverState *bdrv_primary_bs(BlockDriverState *bs) */ void bdrv_drain_all_end_quiesce(BlockDriverState *bs); +/** + * Check whether the given offset is in the cached block-status data + * region. + * + * If it is, and @pnum is not NULL, *pnum is set to + * `bsc.data_end - offset`, i.e. how many bytes, starting from + * @offset, are data (according to the cache). + * Otherwise, *pnum is not touched. + */ +bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum); + +/** + * If [offset, offset + bytes) overlaps with the currently cached + * block-status region, invalidate the cache. + * + * (To be used by I/O paths that cause data regions to be zero or + * holes.) + */ +void bdrv_bsc_invalidate_range(BlockDriverState *bs, + int64_t offset, int64_t bytes); + +/** + * Mark the range [offset, offset + bytes) as a data region. + */ +void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes); + #endif /* BLOCK_INT_H */ From 5a1cfd2150596bddbf429f81097cbd8b861515c0 Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Thu, 12 Aug 2021 10:41:45 +0200 Subject: [PATCH 04/32] block: Clarify that @bytes is no limit on *pnum .bdrv_co_block_status() implementations are free to return a *pnum that exceeds @bytes, because bdrv_co_block_status() in block/io.c will clamp *pnum as necessary. On the other hand, if drivers' implementations return values for *pnum that are as large as possible, our recently introduced block-status cache will become more effective. So, make a note in block_int.h that @bytes is no upper limit for *pnum. Suggested-by: Eric Blake Signed-off-by: Hanna Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210812084148.14458-4-hreitz@redhat.com> Reviewed-by: Eric Blake --- include/block/block_int.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/block/block_int.h b/include/block/block_int.h index 437d746733..5451f89b8d 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -348,6 +348,15 @@ struct BlockDriver { * clamped to bdrv_getlength() and aligned to request_alignment, * as well as non-NULL pnum, map, and file; in turn, the driver * must return an error or set pnum to an aligned non-zero value. + * + * Note that @bytes is just a hint on how big of a region the + * caller wants to inspect. It is not a limit on *pnum. + * Implementations are free to return larger values of *pnum if + * doing so does not incur a performance penalty. + * + * block/io.c's bdrv_co_block_status() will utilize an unclamped + * *pnum value for the block-status cache on protocol nodes, prior + * to clamping *pnum for return to its caller. */ int coroutine_fn (*bdrv_co_block_status)(BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum, From 869e7ee827ebf9392ed100119cc7e21a81311536 Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Thu, 12 Aug 2021 10:41:46 +0200 Subject: [PATCH 05/32] block/file-posix: Do not force-cap *pnum bdrv_co_block_status() does it for us, we do not need to do it here. The advantage of not capping *pnum is that bdrv_co_block_status() can cache larger data regions than requested by its caller. Signed-off-by: Hanna Reitz Reviewed-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Kevin Wolf Message-Id: <20210812084148.14458-5-hreitz@redhat.com> --- block/file-posix.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index cb9bffe047..9f35e5631a 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2744,7 +2744,8 @@ static int find_allocation(BlockDriverState *bs, off_t start, * the specified offset) that are known to be in the same * allocated/unallocated state. * - * 'bytes' is the max value 'pnum' should be set to. + * 'bytes' is a soft cap for 'pnum'. If the information is free, 'pnum' may + * well exceed it. */ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, bool want_zero, @@ -2782,7 +2783,7 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, } else if (data == offset) { /* On a data extent, compute bytes to the end of the extent, * possibly including a partial sector at EOF. */ - *pnum = MIN(bytes, hole - offset); + *pnum = hole - offset; /* * We are not allowed to return partial sectors, though, so @@ -2801,7 +2802,7 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, } else { /* On a hole, compute bytes to the beginning of the next extent. */ assert(hole == offset); - *pnum = MIN(bytes, data - offset); + *pnum = data - offset; ret = BDRV_BLOCK_ZERO; } *map = offset; From 72b4cabe5e92b131dcb954691f579e59eb9103e3 Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Thu, 12 Aug 2021 10:41:47 +0200 Subject: [PATCH 06/32] block/gluster: Do not force-cap *pnum bdrv_co_block_status() does it for us, we do not need to do it here. The advantage of not capping *pnum is that bdrv_co_block_status() can cache larger data regions than requested by its caller. Signed-off-by: Hanna Reitz Reviewed-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Kevin Wolf Message-Id: <20210812084148.14458-6-hreitz@redhat.com> --- block/gluster.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/block/gluster.c b/block/gluster.c index 48a04417cf..d51938e447 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -1461,7 +1461,8 @@ exit: * the specified offset) that are known to be in the same * allocated/unallocated state. * - * 'bytes' is the max value 'pnum' should be set to. + * 'bytes' is a soft cap for 'pnum'. If the information is free, 'pnum' may + * well exceed it. * * (Based on raw_co_block_status() from file-posix.c.) */ @@ -1502,7 +1503,7 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs, } else if (data == offset) { /* On a data extent, compute bytes to the end of the extent, * possibly including a partial sector at EOF. */ - *pnum = MIN(bytes, hole - offset); + *pnum = hole - offset; /* * We are not allowed to return partial sectors, though, so @@ -1521,7 +1522,7 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs, } else { /* On a hole, compute bytes to the beginning of the next extent. */ assert(hole == offset); - *pnum = MIN(bytes, data - offset); + *pnum = data - offset; ret = BDRV_BLOCK_ZERO; } From 9dbf6455f4f6bc59e68af033c5813c82d8a1ffff Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Thu, 12 Aug 2021 10:41:48 +0200 Subject: [PATCH 07/32] block/iscsi: Do not force-cap *pnum bdrv_co_block_status() does it for us, we do not need to do it here. The advantage of not capping *pnum is that bdrv_co_block_status() can cache larger data regions than requested by its caller. Signed-off-by: Hanna Reitz Reviewed-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Kevin Wolf Message-Id: <20210812084148.14458-7-hreitz@redhat.com> --- block/iscsi.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index 4d2a416ce7..852384086b 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -781,9 +781,6 @@ retry: iscsi_allocmap_set_allocated(iscsilun, offset, *pnum); } - if (*pnum > bytes) { - *pnum = bytes; - } out_unlock: qemu_mutex_unlock(&iscsilun->mutex); g_free(iTask.err_str); From 81dcb9ca1ff7f5588ed94824f13bc4c5232aca95 Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Tue, 24 Aug 2021 17:35:39 +0200 Subject: [PATCH 08/32] iotests: Fix unspecified-encoding pylint warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As of recently, pylint complains when `open()` calls are missing an `encoding=` specified. Everything we have should be UTF-8 (and in fact, everything should be UTF-8, period (exceptions apply)), so use that. Signed-off-by: Hanna Reitz Message-Id: <20210824153540.177128-2-hreitz@redhat.com> Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: John Snow --- tests/qemu-iotests/297 | 2 +- tests/qemu-iotests/iotests.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/qemu-iotests/297 b/tests/qemu-iotests/297 index 345b617b34..1ee15dd866 100755 --- a/tests/qemu-iotests/297 +++ b/tests/qemu-iotests/297 @@ -46,7 +46,7 @@ def is_python_file(filename): if filename.endswith('.py'): return True - with open(filename) as f: + with open(filename, encoding='utf-8') as f: try: first_line = f.readline() return re.match('^#!.*python', first_line) is not None diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index 11276f380a..d8c64d4c11 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -610,7 +610,7 @@ class VM(qtest.QEMUQtestMachine): return valgrind_filename = f"{test_dir}/{self._popen.pid}.valgrind" if self.exitcode() == 99: - with open(valgrind_filename) as f: + with open(valgrind_filename, encoding='utf-8') as f: print(f.read()) else: os.remove(valgrind_filename) @@ -1121,7 +1121,8 @@ def notrun(reason): # Each test in qemu-iotests has a number ("seq") seq = os.path.basename(sys.argv[0]) - with open('%s/%s.notrun' % (output_dir, seq), 'w') as outfile: + with open('%s/%s.notrun' % (output_dir, seq), 'w', encoding='utf-8') \ + as outfile: outfile.write(reason + '\n') logger.warning("%s not run: %s", seq, reason) sys.exit(0) @@ -1135,7 +1136,8 @@ def case_notrun(reason): # Each test in qemu-iotests has a number ("seq") seq = os.path.basename(sys.argv[0]) - with open('%s/%s.casenotrun' % (output_dir, seq), 'a') as outfile: + with open('%s/%s.casenotrun' % (output_dir, seq), 'a', encoding='utf-8') \ + as outfile: outfile.write(' [case not run] ' + reason + '\n') def _verify_image_format(supported_fmts: Sequence[str] = (), From cc16153f1fdec4bc841f0bcb9317c4639ab42530 Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Tue, 24 Aug 2021 17:35:40 +0200 Subject: [PATCH 09/32] iotests: Fix use-{list,dict}-literal warnings pylint proposes using `[]` instead of `list()` and `{}` instead of `dict()`, because it is faster. That seems simple enough, so heed its advice. Signed-off-by: Hanna Reitz Message-Id: <20210824153540.177128-3-hreitz@redhat.com> Reviewed-by: Vladimir Sementsov-Ogievskiy --- tests/qemu-iotests/iotests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index d8c64d4c11..ce06cf5630 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -703,7 +703,7 @@ class VM(qtest.QEMUQtestMachine): def flatten_qmp_object(self, obj, output=None, basestr=''): if output is None: - output = dict() + output = {} if isinstance(obj, list): for i, item in enumerate(obj): self.flatten_qmp_object(item, output, basestr + str(i) + '.') @@ -716,7 +716,7 @@ class VM(qtest.QEMUQtestMachine): def qmp_to_opts(self, obj): obj = self.flatten_qmp_object(obj) - output_list = list() + output_list = [] for key in obj: output_list += [key + '=' + obj[key]] return ','.join(output_list) From 26db7b23ce57f12b8324111a2ed3b2a4ba3db4a4 Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Thu, 2 Sep 2021 11:40:13 +0200 Subject: [PATCH 10/32] iotests/297: Drop 169 and 199 from the skip list 169 and 199 have been renamed and moved to tests/ (commit a44be0334be: "iotests: rename and move 169 and 199 tests"), so we can drop them from the skip list. Signed-off-by: Hanna Reitz Reviewed-by: Willian Rampazzo Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Kevin Wolf Message-Id: <20210902094017.32902-2-hreitz@redhat.com> --- tests/qemu-iotests/297 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qemu-iotests/297 b/tests/qemu-iotests/297 index 1ee15dd866..a0c0cf9e5d 100755 --- a/tests/qemu-iotests/297 +++ b/tests/qemu-iotests/297 @@ -29,7 +29,7 @@ import iotests SKIP_FILES = ( '030', '040', '041', '044', '045', '055', '056', '057', '065', '093', '096', '118', '124', '132', '136', '139', '147', '148', '149', - '151', '152', '155', '163', '165', '169', '194', '196', '199', '202', + '151', '152', '155', '163', '165', '194', '196', '202', '203', '205', '206', '207', '208', '210', '211', '212', '213', '216', '218', '219', '224', '228', '234', '235', '236', '237', '238', '240', '242', '245', '246', '248', '255', '256', '257', '258', '260', From e2ad17a62d9da45fbcddc3faa3d6ced519a9453a Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Thu, 2 Sep 2021 11:40:14 +0200 Subject: [PATCH 11/32] migrate-bitmaps-postcopy-test: Fix pylint warnings pylint complains that discards1_sha256 and all_discards_sha256 are first set in non-__init__ methods. These variables are not really class-variables anyway, so let them instead be returned by start_postcopy(), thus silencing pylint. Suggested-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Hanna Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210902094017.32902-3-hreitz@redhat.com> --- .../tests/migrate-bitmaps-postcopy-test | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test b/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test index 584062b412..00ebb5c251 100755 --- a/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test +++ b/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test @@ -132,10 +132,10 @@ class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): result = self.vm_a.qmp('x-debug-block-dirty-bitmap-sha256', node='drive0', name='bitmap0') - self.discards1_sha256 = result['return']['sha256'] + discards1_sha256 = result['return']['sha256'] # Check, that updating the bitmap by discards works - assert self.discards1_sha256 != empty_sha256 + assert discards1_sha256 != empty_sha256 # We want to calculate resulting sha256. Do it in bitmap0, so, disable # other bitmaps @@ -148,7 +148,7 @@ class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): result = self.vm_a.qmp('x-debug-block-dirty-bitmap-sha256', node='drive0', name='bitmap0') - self.all_discards_sha256 = result['return']['sha256'] + all_discards_sha256 = result['return']['sha256'] # Now, enable some bitmaps, to be updated during migration for i in range(2, nb_bitmaps, 2): @@ -173,10 +173,11 @@ class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): event_resume = self.vm_b.event_wait('RESUME') self.vm_b_events.append(event_resume) - return event_resume + return (event_resume, discards1_sha256, all_discards_sha256) def test_postcopy_success(self): - event_resume = self.start_postcopy() + event_resume, discards1_sha256, all_discards_sha256 = \ + self.start_postcopy() # enabled bitmaps should be updated apply_discards(self.vm_b, discards2) @@ -217,7 +218,7 @@ class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): for i in range(0, nb_bitmaps, 5): result = self.vm_b.qmp('x-debug-block-dirty-bitmap-sha256', node='drive0', name='bitmap{}'.format(i)) - sha = self.discards1_sha256 if i % 2 else self.all_discards_sha256 + sha = discards1_sha256 if i % 2 else all_discards_sha256 self.assert_qmp(result, 'return/sha256', sha) def test_early_shutdown_destination(self): From d8c2e47dbe998d00dcfe0ca5fc0766a98b69a591 Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Thu, 2 Sep 2021 11:40:15 +0200 Subject: [PATCH 12/32] migrate-bitmaps-test: Fix pylint warnings There are a couple of things pylint takes issue with: - The "time" import is unused - The import order (iotests should come last) - get_bitmap_hash() doesn't use @self and so should be a function - Semicolons at the end of some lines - Parentheses after "if" - Some lines are too long (80 characters instead of 79) - inject_test_case()'s @name parameter shadows a top-level @name variable - "lambda self: mc(self)" were equivalent to just "mc", but in inject_test_case(), it is not equivalent, so add a comment and disable the warning locally - Always put two empty lines after a function - f'exec: cat > /dev/null' does not need to be an f-string Fix them. Signed-off-by: Hanna Reitz Message-Id: <20210902094017.32902-4-hreitz@redhat.com> Reviewed-by: Vladimir Sementsov-Ogievskiy --- tests/qemu-iotests/tests/migrate-bitmaps-test | 43 +++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/tests/qemu-iotests/tests/migrate-bitmaps-test b/tests/qemu-iotests/tests/migrate-bitmaps-test index a5c7bc83e0..dc431c35b3 100755 --- a/tests/qemu-iotests/tests/migrate-bitmaps-test +++ b/tests/qemu-iotests/tests/migrate-bitmaps-test @@ -20,11 +20,10 @@ # import os -import iotests -import time import itertools import operator import re +import iotests from iotests import qemu_img, qemu_img_create, Timeout @@ -37,6 +36,12 @@ mig_cmd = 'exec: cat > ' + mig_file incoming_cmd = 'exec: cat ' + mig_file +def get_bitmap_hash(vm): + result = vm.qmp('x-debug-block-dirty-bitmap-sha256', + node='drive0', name='bitmap0') + return result['return']['sha256'] + + class TestDirtyBitmapMigration(iotests.QMPTestCase): def tearDown(self): self.vm_a.shutdown() @@ -62,21 +67,16 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): params['persistent'] = True result = vm.qmp('block-dirty-bitmap-add', **params) - self.assert_qmp(result, 'return', {}); - - def get_bitmap_hash(self, vm): - result = vm.qmp('x-debug-block-dirty-bitmap-sha256', - node='drive0', name='bitmap0') - return result['return']['sha256'] + self.assert_qmp(result, 'return', {}) def check_bitmap(self, vm, sha256): result = vm.qmp('x-debug-block-dirty-bitmap-sha256', node='drive0', name='bitmap0') if sha256: - self.assert_qmp(result, 'return/sha256', sha256); + self.assert_qmp(result, 'return/sha256', sha256) else: self.assert_qmp(result, 'error/desc', - "Dirty bitmap 'bitmap0' not found"); + "Dirty bitmap 'bitmap0' not found") def do_test_migration_resume_source(self, persistent, migrate_bitmaps): granularity = 512 @@ -97,7 +97,7 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): self.add_bitmap(self.vm_a, granularity, persistent) for r in regions: self.vm_a.hmp_qemu_io('drive0', 'write %d %d' % r) - sha256 = self.get_bitmap_hash(self.vm_a) + sha256 = get_bitmap_hash(self.vm_a) result = self.vm_a.qmp('migrate', uri=mig_cmd) while True: @@ -106,7 +106,7 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): break while True: result = self.vm_a.qmp('query-status') - if (result['return']['status'] == 'postmigrate'): + if result['return']['status'] == 'postmigrate': break # test that bitmap is still here @@ -164,7 +164,7 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): self.add_bitmap(self.vm_a, granularity, persistent) for r in regions: self.vm_a.hmp_qemu_io('drive0', 'write %d %d' % r) - sha256 = self.get_bitmap_hash(self.vm_a) + sha256 = get_bitmap_hash(self.vm_a) if pre_shutdown: self.vm_a.shutdown() @@ -214,16 +214,22 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): self.check_bitmap(self.vm_b, sha256 if persistent else False) -def inject_test_case(klass, name, method, *args, **kwargs): +def inject_test_case(klass, suffix, method, *args, **kwargs): mc = operator.methodcaller(method, *args, **kwargs) - setattr(klass, 'test_' + method + name, lambda self: mc(self)) + # We want to add a function attribute to `klass`, so that it is + # correctly converted to a method on instantiation. The + # methodcaller object `mc` is a callable, not a function, so we + # need the lambda to turn it into a function. + # pylint: disable=unnecessary-lambda + setattr(klass, 'test_' + method + suffix, lambda self: mc(self)) + for cmb in list(itertools.product((True, False), repeat=5)): name = ('_' if cmb[0] else '_not_') + 'persistent_' name += ('_' if cmb[1] else '_not_') + 'migbitmap_' name += '_online' if cmb[2] else '_offline' name += '_shared' if cmb[3] else '_nonshared' - if (cmb[4]): + if cmb[4]: name += '__pre_shutdown' inject_test_case(TestDirtyBitmapMigration, name, 'do_test_migration', @@ -270,7 +276,8 @@ class TestDirtyBitmapBackingMigration(iotests.QMPTestCase): self.assert_qmp(result, 'return', {}) # Check that the bitmaps are there - for node in self.vm.qmp('query-named-block-nodes', flat=True)['return']: + nodes = self.vm.qmp('query-named-block-nodes', flat=True)['return'] + for node in nodes: if 'node0' in node['node-name']: self.assert_qmp(node, 'dirty-bitmaps[0]/name', 'bmap0') @@ -287,7 +294,7 @@ class TestDirtyBitmapBackingMigration(iotests.QMPTestCase): """ Continue the source after migration. """ - result = self.vm.qmp('migrate', uri=f'exec: cat > /dev/null') + result = self.vm.qmp('migrate', uri='exec: cat > /dev/null') self.assert_qmp(result, 'return', {}) with Timeout(10, 'Migration timeout'): From b90d7a18b615be1e521ee5bf99c816a45099a29b Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Thu, 2 Sep 2021 11:40:16 +0200 Subject: [PATCH 13/32] mirror-top-perms: Fix AbnormalShutdown path The AbnormalShutdown exception class is not in qemu.machine, but in qemu.machine.machine. (qemu.machine.AbnormalShutdown was enough for Python to find it in order to run this test, but pylint complains about it.) Signed-off-by: Hanna Reitz Message-Id: <20210902094017.32902-5-hreitz@redhat.com> Reviewed-by: Vladimir Sementsov-Ogievskiy --- tests/qemu-iotests/tests/mirror-top-perms | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qemu-iotests/tests/mirror-top-perms b/tests/qemu-iotests/tests/mirror-top-perms index 451a0666f8..2fc8dd66e0 100755 --- a/tests/qemu-iotests/tests/mirror-top-perms +++ b/tests/qemu-iotests/tests/mirror-top-perms @@ -47,7 +47,7 @@ class TestMirrorTopPerms(iotests.QMPTestCase): def tearDown(self): try: self.vm.shutdown() - except qemu.machine.AbnormalShutdown: + except qemu.machine.machine.AbnormalShutdown: pass if self.vm_b is not None: From 098d983ea5ff1e2b504902d6081074f6b3bf36b8 Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Thu, 2 Sep 2021 11:40:17 +0200 Subject: [PATCH 14/32] iotests/297: Cover tests/ 297 so far does not check the named tests, which reside in the tests/ directory (i.e. full path tests/qemu-iotests/tests). Fix it. Thanks to the previous two commits, all named tests pass its scrutiny, so we do not have to add anything to SKIP_FILES. Signed-off-by: Hanna Reitz Reviewed-by: Willian Rampazzo Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Kevin Wolf Message-Id: <20210902094017.32902-6-hreitz@redhat.com> --- tests/qemu-iotests/297 | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/297 b/tests/qemu-iotests/297 index a0c0cf9e5d..b04cba5366 100755 --- a/tests/qemu-iotests/297 +++ b/tests/qemu-iotests/297 @@ -55,8 +55,9 @@ def is_python_file(filename): def run_linters(): - files = [filename for filename in (set(os.listdir('.')) - set(SKIP_FILES)) - if is_python_file(filename)] + named_tests = [f'tests/{entry}' for entry in os.listdir('tests')] + check_tests = set(os.listdir('.') + named_tests) - set(SKIP_FILES) + files = [filename for filename in check_tests if is_python_file(filename)] iotests.logger.debug('Files to be checked:') iotests.logger.debug(', '.join(sorted(files))) From 66fed30c9cd11854fc878a4eceb507e915d7c9cd Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 10 Sep 2021 14:45:33 +0200 Subject: [PATCH 15/32] block/mirror: fix NULL pointer dereference in mirror_wait_on_conflicts() In mirror_iteration() we call mirror_wait_on_conflicts() with `self` parameter set to NULL. Starting from commit d44dae1a7c we dereference `self` pointer in mirror_wait_on_conflicts() without checks if it is not NULL. Backtrace: Program terminated with signal SIGSEGV, Segmentation fault. #0 mirror_wait_on_conflicts (self=0x0, s=, offset=, bytes=) at ../block/mirror.c:172 172 self->waiting_for_op = op; [Current thread is 1 (Thread 0x7f0908931ec0 (LWP 380249))] (gdb) bt #0 mirror_wait_on_conflicts (self=0x0, s=, offset=, bytes=) at ../block/mirror.c:172 #1 0x00005610c5d9d631 in mirror_run (job=0x5610c76a2c00, errp=) at ../block/mirror.c:491 #2 0x00005610c5d58726 in job_co_entry (opaque=0x5610c76a2c00) at ../job.c:917 #3 0x00005610c5f046c6 in coroutine_trampoline (i0=, i1=) at ../util/coroutine-ucontext.c:173 #4 0x00007f0909975820 in ?? () at ../sysdeps/unix/sysv/linux/x86_64/__start_context.S:91 from /usr/lib64/libc.so.6 Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2001404 Fixes: d44dae1a7c ("block/mirror: fix active mirror dead-lock in mirror_wait_on_conflicts") Signed-off-by: Stefano Garzarella Message-Id: <20210910124533.288318-1-sgarzare@redhat.com> Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Hanna Reitz --- block/mirror.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 98fc66eabf..85b781bc21 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -160,18 +160,25 @@ static void coroutine_fn mirror_wait_on_conflicts(MirrorOp *self, if (ranges_overlap(self_start_chunk, self_nb_chunks, op_start_chunk, op_nb_chunks)) { - /* - * If the operation is already (indirectly) waiting for us, or - * will wait for us as soon as it wakes up, then just go on - * (instead of producing a deadlock in the former case). - */ - if (op->waiting_for_op) { - continue; + if (self) { + /* + * If the operation is already (indirectly) waiting for us, + * or will wait for us as soon as it wakes up, then just go + * on (instead of producing a deadlock in the former case). + */ + if (op->waiting_for_op) { + continue; + } + + self->waiting_for_op = op; } - self->waiting_for_op = op; qemu_co_queue_wait(&op->waiting_requests, NULL); - self->waiting_for_op = NULL; + + if (self) { + self->waiting_for_op = NULL; + } + break; } } From 2f43482733352e4423fcbddadd09265de92ddf58 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Sat, 11 Sep 2021 15:00:26 +0300 Subject: [PATCH 16/32] tests: add migrate-during-backup Add a simple test which tries to run migration during backup. bdrv_inactivate_all() should fail. But due to bug (see next commit with fix) it doesn't, nodes are inactivated and continued backup crashes on assertion "assert(!(bs->open_flags & BDRV_O_INACTIVE));" in bdrv_co_write_req_prepare(). Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210911120027.8063-2-vsementsov@virtuozzo.com> Signed-off-by: Hanna Reitz --- .../qemu-iotests/tests/migrate-during-backup | 97 +++++++++++++++++++ .../tests/migrate-during-backup.out | 5 + 2 files changed, 102 insertions(+) create mode 100755 tests/qemu-iotests/tests/migrate-during-backup create mode 100644 tests/qemu-iotests/tests/migrate-during-backup.out diff --git a/tests/qemu-iotests/tests/migrate-during-backup b/tests/qemu-iotests/tests/migrate-during-backup new file mode 100755 index 0000000000..1046904c5a --- /dev/null +++ b/tests/qemu-iotests/tests/migrate-during-backup @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +# group: migration disabled +# +# Copyright (c) 2021 Virtuozzo International GmbH +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import os +import iotests +from iotests import qemu_img_create, qemu_io + + +disk_a = os.path.join(iotests.test_dir, 'disk_a') +disk_b = os.path.join(iotests.test_dir, 'disk_b') +size = '1M' +mig_file = os.path.join(iotests.test_dir, 'mig_file') +mig_cmd = 'exec: cat > ' + mig_file + + +class TestMigrateDuringBackup(iotests.QMPTestCase): + def tearDown(self): + self.vm.shutdown() + os.remove(disk_a) + os.remove(disk_b) + os.remove(mig_file) + + def setUp(self): + qemu_img_create('-f', iotests.imgfmt, disk_a, size) + qemu_img_create('-f', iotests.imgfmt, disk_b, size) + qemu_io('-c', f'write 0 {size}', disk_a) + + self.vm = iotests.VM().add_drive(disk_a) + self.vm.launch() + result = self.vm.qmp('blockdev-add', { + 'node-name': 'target', + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': disk_b + } + }) + self.assert_qmp(result, 'return', {}) + + def test_migrate(self): + result = self.vm.qmp('blockdev-backup', device='drive0', + target='target', sync='full', + speed=1, x_perf={ + 'max-workers': 1, + 'max-chunk': 64 * 1024 + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('job-pause', id='drive0') + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('migrate-set-capabilities', + capabilities=[{'capability': 'events', + 'state': True}]) + self.assert_qmp(result, 'return', {}) + result = self.vm.qmp('migrate', uri=mig_cmd) + self.assert_qmp(result, 'return', {}) + + e = self.vm.events_wait((('MIGRATION', + {'data': {'status': 'completed'}}), + ('MIGRATION', + {'data': {'status': 'failed'}}))) + + # Don't assert that e is 'failed' now: this way we'll miss + # possible crash when backup continues :) + + result = self.vm.qmp('block-job-set-speed', device='drive0', + speed=0) + self.assert_qmp(result, 'return', {}) + result = self.vm.qmp('job-resume', id='drive0') + self.assert_qmp(result, 'return', {}) + + # For future: if something changes so that both migration + # and backup pass, let's not miss that moment, as it may + # be a bug as well as improvement. + self.assert_qmp(e, 'data/status', 'failed') + + +if __name__ == '__main__': + iotests.main(supported_fmts=['qcow2'], + supported_protocols=['file']) diff --git a/tests/qemu-iotests/tests/migrate-during-backup.out b/tests/qemu-iotests/tests/migrate-during-backup.out new file mode 100644 index 0000000000..ae1213e6f8 --- /dev/null +++ b/tests/qemu-iotests/tests/migrate-during-backup.out @@ -0,0 +1,5 @@ +. +---------------------------------------------------------------------- +Ran 1 tests + +OK From a13de40a05478e64726dd9861135d344837f3c30 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Sat, 11 Sep 2021 15:00:27 +0300 Subject: [PATCH 17/32] block: bdrv_inactivate_recurse(): check for permissions and fix crash We must not inactivate child when parent has write permissions on it. Calling .bdrv_inactivate() doesn't help: actually only qcow2 has this handler and it is used to flush caches, not for permission manipulations. So, let's simply check cumulative parent permissions before inactivating the node. This commit fixes a crash when we do migration during backup: prior to the commit nothing prevents all nodes inactivation at migration finish and following backup write to the target crashes on assertion "assert(!(bs->open_flags & BDRV_O_INACTIVE));" in bdrv_co_write_req_prepare(). After the commit, we rely on the fact that copy-before-write filter keeps write permission on target node to be able to write to it. So inactivation fails and migration fails as expected. Corresponding test now passes, so, enable it. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Hanna Reitz Message-Id: <20210911120027.8063-3-vsementsov@virtuozzo.com> Signed-off-by: Hanna Reitz --- block.c | 8 ++++++++ tests/qemu-iotests/tests/migrate-during-backup | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index 00982edf45..5ce08a79fd 100644 --- a/block.c +++ b/block.c @@ -6326,6 +6326,7 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs) { BdrvChild *child, *parent; int ret; + uint64_t cumulative_perms, cumulative_shared_perms; if (!bs->drv) { return -ENOMEDIUM; @@ -6356,6 +6357,13 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs) } } + bdrv_get_cumulative_perm(bs, &cumulative_perms, + &cumulative_shared_perms); + if (cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) { + /* Our inactive parents still need write access. Inactivation failed. */ + return -EPERM; + } + bs->open_flags |= BDRV_O_INACTIVE; /* diff --git a/tests/qemu-iotests/tests/migrate-during-backup b/tests/qemu-iotests/tests/migrate-during-backup index 1046904c5a..34103229ee 100755 --- a/tests/qemu-iotests/tests/migrate-during-backup +++ b/tests/qemu-iotests/tests/migrate-during-backup @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# group: migration disabled +# group: migration # # Copyright (c) 2021 Virtuozzo International GmbH # From 5b3f7daaec02704b340ebd8ec4011e19098a2b3c Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 24 Aug 2021 13:15:15 +0300 Subject: [PATCH 18/32] simplebench: add img_bench_templater.py Add simple grammar-parsing template benchmark. New tool consume test template written in bash with some special grammar injections and produces multiple tests, run them and finally print a performance comparison table of different tests produced from one template. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210824101517.59802-2-vsementsov@virtuozzo.com> Reviewed-by: Hanna Reitz Signed-off-by: Hanna Reitz --- scripts/simplebench/img_bench_templater.py | 95 ++++++++++++++++++++++ scripts/simplebench/table_templater.py | 62 ++++++++++++++ 2 files changed, 157 insertions(+) create mode 100755 scripts/simplebench/img_bench_templater.py create mode 100644 scripts/simplebench/table_templater.py diff --git a/scripts/simplebench/img_bench_templater.py b/scripts/simplebench/img_bench_templater.py new file mode 100755 index 0000000000..f8e1540ada --- /dev/null +++ b/scripts/simplebench/img_bench_templater.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +# +# Process img-bench test templates +# +# Copyright (c) 2021 Virtuozzo International GmbH. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + + +import sys +import subprocess +import re +import json + +import simplebench +from results_to_text import results_to_text +from table_templater import Templater + + +def bench_func(env, case): + test = templater.gen(env['data'], case['data']) + + p = subprocess.run(test, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, universal_newlines=True) + + if p.returncode == 0: + try: + m = re.search(r'Run completed in (\d+.\d+) seconds.', p.stdout) + return {'seconds': float(m.group(1))} + except Exception: + return {'error': f'failed to parse qemu-img output: {p.stdout}'} + else: + return {'error': f'qemu-img failed: {p.returncode}: {p.stdout}'} + + +if __name__ == '__main__': + if len(sys.argv) > 1: + print(""" +Usage: img_bench_templater.py < path/to/test-template.sh + +This script generates performance tests from a test template (example below), +runs them, and displays the results in a table. The template is read from +stdin. It must be written in bash and end with a `qemu-img bench` invocation +(whose result is parsed to get the test instance’s result). + +Use the following syntax in the template to create the various different test +instances: + + column templating: {var1|var2|...} - test will use different values in + different columns. You may use several {} constructions in the test, in this + case product of all choice-sets will be used. + + row templating: [var1|var2|...] - similar thing to define rows (test-cases) + +Test template example: + +Assume you want to compare two qemu-img binaries, called qemu-img-old and +qemu-img-new in your build directory in two test-cases with 4K writes and 64K +writes. The template may look like this: + +qemu_img=/path/to/qemu/build/qemu-img-{old|new} +$qemu_img create -f qcow2 /ssd/x.qcow2 1G +$qemu_img bench -c 100 -d 8 [-s 4K|-s 64K] -w -t none -n /ssd/x.qcow2 + +When passing this to stdin of img_bench_templater.py, the resulting comparison +table will contain two columns (for two binaries) and two rows (for two +test-cases). + +In addition to displaying the results, script also stores results in JSON +format into results.json file in current directory. +""") + sys.exit() + + templater = Templater(sys.stdin.read()) + + envs = [{'id': ' / '.join(x), 'data': x} for x in templater.columns] + cases = [{'id': ' / '.join(x), 'data': x} for x in templater.rows] + + result = simplebench.bench(bench_func, envs, cases, count=5, + initial_run=False) + print(results_to_text(result)) + with open('results.json', 'w') as f: + json.dump(result, f, indent=4) diff --git a/scripts/simplebench/table_templater.py b/scripts/simplebench/table_templater.py new file mode 100644 index 0000000000..950f3b3024 --- /dev/null +++ b/scripts/simplebench/table_templater.py @@ -0,0 +1,62 @@ +# Parser for test templates +# +# Copyright (c) 2021 Virtuozzo International GmbH. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import itertools +from lark import Lark + +grammar = """ +start: ( text | column_switch | row_switch )+ + +column_switch: "{" text ["|" text]+ "}" +row_switch: "[" text ["|" text]+ "]" +text: /[^|{}\[\]]+/ +""" + +parser = Lark(grammar) + +class Templater: + def __init__(self, template): + self.tree = parser.parse(template) + + c_switches = [] + r_switches = [] + for x in self.tree.children: + if x.data == 'column_switch': + c_switches.append([el.children[0].value for el in x.children]) + elif x.data == 'row_switch': + r_switches.append([el.children[0].value for el in x.children]) + + self.columns = list(itertools.product(*c_switches)) + self.rows = list(itertools.product(*r_switches)) + + def gen(self, column, row): + i = 0 + j = 0 + result = [] + + for x in self.tree.children: + if x.data == 'text': + result.append(x.children[0].value) + elif x.data == 'column_switch': + result.append(column[i]) + i += 1 + elif x.data == 'row_switch': + result.append(row[j]) + j += 1 + + return ''.join(result) From 6d207d3501b3e6c46c71d782df413d3f4cad2dd8 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 24 Aug 2021 13:15:16 +0300 Subject: [PATCH 19/32] qcow2: refactor handle_dependencies() loop body No logic change, just prepare for the following commit. While being here do also small grammar fix in a comment. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Reviewed-by: Hanna Reitz Message-Id: <20210824101517.59802-3-vsementsov@virtuozzo.com> Signed-off-by: Hanna Reitz --- block/qcow2-cluster.c | 47 +++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index bd0597842f..9917e5c28c 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1400,29 +1400,36 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, if (end <= old_start || start >= old_end) { /* No intersection */ + continue; + } + + /* Conflict */ + + if (start < old_start) { + /* Stop at the start of a running allocation */ + bytes = old_start - start; } else { - if (start < old_start) { - /* Stop at the start of a running allocation */ - bytes = old_start - start; - } else { - bytes = 0; - } + bytes = 0; + } - /* Stop if already an l2meta exists. After yielding, it wouldn't - * be valid any more, so we'd have to clean up the old L2Metas - * and deal with requests depending on them before starting to - * gather new ones. Not worth the trouble. */ - if (bytes == 0 && *m) { - *cur_bytes = 0; - return 0; - } + /* + * Stop if an l2meta already exists. After yielding, it wouldn't + * be valid any more, so we'd have to clean up the old L2Metas + * and deal with requests depending on them before starting to + * gather new ones. Not worth the trouble. + */ + if (bytes == 0 && *m) { + *cur_bytes = 0; + return 0; + } - if (bytes == 0) { - /* Wait for the dependency to complete. We need to recheck - * the free/allocated clusters when we continue. */ - qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); - return -EAGAIN; - } + if (bytes == 0) { + /* + * Wait for the dependency to complete. We need to recheck + * the free/allocated clusters when we continue. + */ + qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); + return -EAGAIN; } } From ff812c55634f767fb0f7f58b942af1a1c44e95cf Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 24 Aug 2021 13:15:17 +0300 Subject: [PATCH 20/32] qcow2: handle_dependencies(): relax conflict detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no conflict and no dependency if we have parallel writes to different subclusters of one cluster when the cluster itself is already allocated. So, relax extra dependency. Measure performance: First, prepare build/qemu-img-old and build/qemu-img-new images. cd scripts/simplebench ./img_bench_templater.py Paste the following to stdin of running script: qemu_img=../../build/qemu-img-{old|new} $qemu_img create -f qcow2 -o extended_l2=on /ssd/x.qcow2 1G $qemu_img bench -c 100000 -d 8 [-s 2K|-s 2K -o 512|-s $((1024*2+512))] \ -w -t none -n /ssd/x.qcow2 The result: All results are in seconds ------------------ --------- --------- old new -s 2K 6.7 ± 15% 6.2 ± 12% -7% -s 2K -o 512 13 ± 3% 11 ± 5% -16% -s $((1024*2+512)) 9.5 ± 4% 8.4 -12% ------------------ --------- --------- So small writes are more independent now and that helps to keep deeper io queue which improves performance. 271 iotest output becomes racy for three allocation in one cluster. Second and third writes may finish in different order. Second and third requests don't depend on each other any more. Still they both depend on first request anyway. Filter out second and third write offsets to cover both possible outputs. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210824101517.59802-4-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Reviewed-by: Hanna Reitz [hreitz: s/ an / and /] Signed-off-by: Hanna Reitz --- block/qcow2-cluster.c | 11 +++++++++++ tests/qemu-iotests/271 | 5 ++++- tests/qemu-iotests/271.out | 4 ++-- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 9917e5c28c..c1c43a891b 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1403,6 +1403,17 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, continue; } + if (old_alloc->keep_old_clusters && + (end <= l2meta_cow_start(old_alloc) || + start >= l2meta_cow_end(old_alloc))) + { + /* + * Clusters intersect but COW areas don't. And cluster itself is + * already allocated. So, there is no actual conflict. + */ + continue; + } + /* Conflict */ if (start < old_start) { diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271 index 599b849cc6..2775b4d130 100755 --- a/tests/qemu-iotests/271 +++ b/tests/qemu-iotests/271 @@ -893,7 +893,10 @@ EOF } _make_test_img -o extended_l2=on 1M -_concurrent_io | $QEMU_IO | _filter_qemu_io +# Second and third writes in _concurrent_io() are independent and may finish in +# different order. So, filter offset out to match both possible variants. +_concurrent_io | $QEMU_IO | _filter_qemu_io | \ + $SED -e 's/\(20480\|40960\)/OFFSET/' _concurrent_verify | $QEMU_IO | _filter_qemu_io # success, all done diff --git a/tests/qemu-iotests/271.out b/tests/qemu-iotests/271.out index 81043ba4d7..5be780de76 100644 --- a/tests/qemu-iotests/271.out +++ b/tests/qemu-iotests/271.out @@ -719,8 +719,8 @@ blkdebug: Suspended request 'A' blkdebug: Resuming request 'A' wrote 2048/2048 bytes at offset 30720 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -wrote 2048/2048 bytes at offset 20480 +wrote 2048/2048 bytes at offset OFFSET 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -wrote 2048/2048 bytes at offset 40960 +wrote 2048/2048 bytes at offset OFFSET 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) *** done From a1c62436a4f053766892355da0ca35c5df19fc22 Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Thu, 19 Aug 2021 12:12:00 +0200 Subject: [PATCH 21/32] qemu-img: Allow target be aligned to sector size We cannot write to images opened with O_DIRECT unless we allow them to be resized so they are aligned to the sector size: Since 9c60a5d1978, bdrv_node_refresh_perm() ensures that for nodes whose length is not aligned to the request alignment and where someone has taken a WRITE permission, the RESIZE permission is taken, too). Let qemu-img convert pass the BDRV_O_RESIZE flag (which causes blk_new_open() to take the RESIZE permission) when using cache=none for the target, so that when writing to it, it can be aligned to the target sector size. Without this patch, an error is returned: $ qemu-img convert -f raw -O raw -t none foo.img /mnt/tmp/foo.img qemu-img: Could not open '/mnt/tmp/foo.img': Cannot get 'write' permission without 'resize': Image size is not a multiple of request alignment Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1994266 Signed-off-by: Hanna Reitz Message-Id: <20210819101200.64235-1-hreitz@redhat.com> Reviewed-by: Vladimir Sementsov-Ogievskiy --- qemu-img.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/qemu-img.c b/qemu-img.c index d77f3e76a9..e43a71a794 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -2628,6 +2628,14 @@ static int img_convert(int argc, char **argv) goto out; } + if (flags & BDRV_O_NOCACHE) { + /* + * If we open the target with O_DIRECT, it may be necessary to + * extend its size to align to the physical sector size. + */ + flags |= BDRV_O_RESIZE; + } + if (skip_create) { s.target = img_open(tgt_image_opts, out_filename, out_fmt, flags, writethrough, s.quiet, false); From 786c22d9c2c890f7c7fcd2600571a2ef429d65db Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 14 Sep 2021 15:24:45 +0300 Subject: [PATCH 22/32] qcow2-refcount: improve style of check_refcounts_l2() - don't use same name for size in bytes and in entries - use g_autofree for l2_table - add whitespace - fix block comment style Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Reviewed-by: Hanna Reitz Message-Id: <20210914122454.141075-2-vsementsov@virtuozzo.com> Signed-off-by: Hanna Reitz --- block/qcow2-refcount.c | 47 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 8e649b008e..2734338625 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -1601,23 +1601,22 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, int flags, BdrvCheckMode fix, bool active) { BDRVQcow2State *s = bs->opaque; - uint64_t *l2_table, l2_entry; + uint64_t l2_entry; uint64_t next_contiguous_offset = 0; - int i, l2_size, nb_csectors, ret; + int i, nb_csectors, ret; + size_t l2_size_bytes = s->l2_size * l2_entry_size(s); + g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes); /* Read L2 table from disk */ - l2_size = s->l2_size * l2_entry_size(s); - l2_table = g_malloc(l2_size); - - ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size); + ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size_bytes); if (ret < 0) { fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); res->check_errors++; - goto fail; + return ret; } /* Do the actual checks */ - for(i = 0; i < s->l2_size; i++) { + for (i = 0; i < s->l2_size; i++) { l2_entry = get_l2_entry(s, l2_table, i); switch (qcow2_get_cluster_type(bs, l2_entry)) { @@ -1647,14 +1646,15 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, l2_entry & QCOW2_COMPRESSED_SECTOR_MASK, nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE); if (ret < 0) { - goto fail; + return ret; } if (flags & CHECK_FRAG_INFO) { res->bfi.allocated_clusters++; res->bfi.compressed_clusters++; - /* Compressed clusters are fragmented by nature. Since they + /* + * Compressed clusters are fragmented by nature. Since they * take up sub-sector space but we only have sector granularity * I/O we need to re-read the same sectors even for adjacent * compressed clusters. @@ -1700,9 +1700,11 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, if (ret < 0) { fprintf(stderr, "ERROR: Overlap check failed\n"); res->check_errors++; - /* Something is seriously wrong, so abort checking - * this L2 table */ - goto fail; + /* + * Something is seriously wrong, so abort checking + * this L2 table. + */ + return ret; } ret = bdrv_pwrite_sync(bs->file, l2e_offset, @@ -1712,13 +1714,17 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, fprintf(stderr, "ERROR: Failed to overwrite L2 " "table entry: %s\n", strerror(-ret)); res->check_errors++; - /* Do not abort, continue checking the rest of this - * L2 table's entries */ + /* + * Do not abort, continue checking the rest of this + * L2 table's entries. + */ } else { res->corruptions--; res->corruptions_fixed++; - /* Skip marking the cluster as used - * (it is unused now) */ + /* + * Skip marking the cluster as used + * (it is unused now). + */ continue; } } @@ -1743,7 +1749,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, refcount_table_size, offset, s->cluster_size); if (ret < 0) { - goto fail; + return ret; } } break; @@ -1758,12 +1764,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, } } - g_free(l2_table); return 0; - -fail: - g_free(l2_table); - return ret; } /* From 9a3978a46bc12e0c49b7114983103b07d90cfa1c Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 14 Sep 2021 15:24:46 +0300 Subject: [PATCH 23/32] qcow2: compressed read: simplify cluster descriptor passing Let's pass the whole L2 entry and not bother with L2E_COMPRESSED_OFFSET_SIZE_MASK. It also helps further refactoring that adds generic qcow2_parse_compressed_l2_entry() helper. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Reviewed-by: Alberto Garcia Reviewed-by: Hanna Reitz Message-Id: <20210914122454.141075-3-vsementsov@virtuozzo.com> Signed-off-by: Hanna Reitz --- block/qcow2-cluster.c | 5 ++--- block/qcow2.c | 12 +++++++----- block/qcow2.h | 1 - 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index c1c43a891b..3d53657c26 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -556,8 +556,7 @@ static int coroutine_fn do_perform_cow_write(BlockDriverState *bs, * offset needs to be aligned to a cluster boundary. * * If the cluster is unallocated then *host_offset will be 0. - * If the cluster is compressed then *host_offset will contain the - * complete compressed cluster descriptor. + * If the cluster is compressed then *host_offset will contain the l2 entry. * * On entry, *bytes is the maximum number of contiguous bytes starting at * offset that we are interested in. @@ -660,7 +659,7 @@ int qcow2_get_host_offset(BlockDriverState *bs, uint64_t offset, ret = -EIO; goto fail; } - *host_offset = l2_entry & L2E_COMPRESSED_OFFSET_SIZE_MASK; + *host_offset = l2_entry; break; case QCOW2_SUBCLUSTER_ZERO_PLAIN: case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN: diff --git a/block/qcow2.c b/block/qcow2.c index 9f1b6461c8..e5d8ab679e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -74,7 +74,7 @@ typedef struct { static int coroutine_fn qcow2_co_preadv_compressed(BlockDriverState *bs, - uint64_t cluster_descriptor, + uint64_t l2_entry, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, @@ -2205,7 +2205,7 @@ typedef struct Qcow2AioTask { BlockDriverState *bs; QCow2SubclusterType subcluster_type; /* only for read */ - uint64_t host_offset; /* or full descriptor in compressed clusters */ + uint64_t host_offset; /* or l2_entry for compressed read */ uint64_t offset; uint64_t bytes; QEMUIOVector *qiov; @@ -4693,7 +4693,7 @@ qcow2_co_pwritev_compressed_part(BlockDriverState *bs, static int coroutine_fn qcow2_co_preadv_compressed(BlockDriverState *bs, - uint64_t cluster_descriptor, + uint64_t l2_entry, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, @@ -4705,8 +4705,10 @@ qcow2_co_preadv_compressed(BlockDriverState *bs, uint8_t *buf, *out_buf; int offset_in_cluster = offset_into_cluster(s, offset); - coffset = cluster_descriptor & s->cluster_offset_mask; - nb_csectors = ((cluster_descriptor >> s->csize_shift) & s->csize_mask) + 1; + assert(qcow2_get_cluster_type(bs, l2_entry) == QCOW2_CLUSTER_COMPRESSED); + + coffset = l2_entry & s->cluster_offset_mask; + nb_csectors = ((l2_entry >> s->csize_shift) & s->csize_mask) + 1; csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE - (coffset & ~QCOW2_COMPRESSED_SECTOR_MASK); diff --git a/block/qcow2.h b/block/qcow2.h index 0fe5f74ed3..42a0058ab7 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -588,7 +588,6 @@ typedef enum QCow2MetadataOverlap { #define L1E_OFFSET_MASK 0x00fffffffffffe00ULL #define L2E_OFFSET_MASK 0x00fffffffffffe00ULL -#define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL #define REFT_OFFSET_MASK 0xfffffffffffffe00ULL From a6e098462ba6d8585a6f21729b406ab51a70eb03 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 14 Sep 2021 15:24:47 +0300 Subject: [PATCH 24/32] qcow2: introduce qcow2_parse_compressed_l2_entry() helper Add helper to parse compressed l2_entry and use it everywhere instead of open-coding. Note, that in most places we move to precise coffset/csize instead of sector-aligned. Still it should work good enough for updating refcounts. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Reviewed-by: Hanna Reitz Message-Id: <20210914122454.141075-4-vsementsov@virtuozzo.com> Signed-off-by: Hanna Reitz --- block/qcow2-cluster.c | 15 +++++++++++++++ block/qcow2-refcount.c | 36 +++++++++++++++++------------------- block/qcow2.c | 9 ++------- block/qcow2.h | 3 ++- 4 files changed, 36 insertions(+), 27 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 3d53657c26..4ebb49a087 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -2480,3 +2480,18 @@ fail: g_free(l1_table); return ret; } + +void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry, + uint64_t *coffset, int *csize) +{ + BDRVQcow2State *s = bs->opaque; + int nb_csectors; + + assert(qcow2_get_cluster_type(bs, l2_entry) == QCOW2_CLUSTER_COMPRESSED); + + *coffset = l2_entry & s->cluster_offset_mask; + + nb_csectors = ((l2_entry >> s->csize_shift) & s->csize_mask) + 1; + *csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE - + (*coffset & (QCOW2_COMPRESSED_SECTOR_SIZE - 1)); +} diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 2734338625..66cbb94ef9 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -1177,11 +1177,11 @@ void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry, switch (ctype) { case QCOW2_CLUSTER_COMPRESSED: { - int64_t offset = (l2_entry & s->cluster_offset_mask) - & QCOW2_COMPRESSED_SECTOR_MASK; - int size = QCOW2_COMPRESSED_SECTOR_SIZE * - (((l2_entry >> s->csize_shift) & s->csize_mask) + 1); - qcow2_free_clusters(bs, offset, size, type); + uint64_t coffset; + int csize; + + qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize); + qcow2_free_clusters(bs, coffset, csize, type); } break; case QCOW2_CLUSTER_NORMAL: @@ -1247,7 +1247,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, bool l1_allocated = false; int64_t old_entry, old_l2_offset; unsigned slice, slice_size2, n_slices; - int i, j, l1_modified = 0, nb_csectors; + int i, j, l1_modified = 0; int ret; assert(addend >= -1 && addend <= 1); @@ -1318,14 +1318,14 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, switch (qcow2_get_cluster_type(bs, entry)) { case QCOW2_CLUSTER_COMPRESSED: - nb_csectors = ((entry >> s->csize_shift) & - s->csize_mask) + 1; if (addend != 0) { - uint64_t coffset = (entry & s->cluster_offset_mask) - & QCOW2_COMPRESSED_SECTOR_MASK; + uint64_t coffset; + int csize; + + qcow2_parse_compressed_l2_entry(bs, entry, + &coffset, &csize); ret = update_refcount( - bs, coffset, - nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE, + bs, coffset, csize, abs(addend), addend < 0, QCOW2_DISCARD_SNAPSHOT); if (ret < 0) { @@ -1603,7 +1603,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, BDRVQcow2State *s = bs->opaque; uint64_t l2_entry; uint64_t next_contiguous_offset = 0; - int i, nb_csectors, ret; + int i, ret; size_t l2_size_bytes = s->l2_size * l2_entry_size(s); g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes); @@ -1617,6 +1617,8 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, /* Do the actual checks */ for (i = 0; i < s->l2_size; i++) { + uint64_t coffset; + int csize; l2_entry = get_l2_entry(s, l2_table, i); switch (qcow2_get_cluster_type(bs, l2_entry)) { @@ -1638,13 +1640,9 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, } /* Mark cluster as used */ - nb_csectors = ((l2_entry >> s->csize_shift) & - s->csize_mask) + 1; - l2_entry &= s->cluster_offset_mask; + qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize); ret = qcow2_inc_refcounts_imrt( - bs, res, refcount_table, refcount_table_size, - l2_entry & QCOW2_COMPRESSED_SECTOR_MASK, - nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE); + bs, res, refcount_table, refcount_table_size, coffset, csize); if (ret < 0) { return ret; } diff --git a/block/qcow2.c b/block/qcow2.c index e5d8ab679e..02f9f3e636 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -4700,17 +4700,12 @@ qcow2_co_preadv_compressed(BlockDriverState *bs, size_t qiov_offset) { BDRVQcow2State *s = bs->opaque; - int ret = 0, csize, nb_csectors; + int ret = 0, csize; uint64_t coffset; uint8_t *buf, *out_buf; int offset_in_cluster = offset_into_cluster(s, offset); - assert(qcow2_get_cluster_type(bs, l2_entry) == QCOW2_CLUSTER_COMPRESSED); - - coffset = l2_entry & s->cluster_offset_mask; - nb_csectors = ((l2_entry >> s->csize_shift) & s->csize_mask) + 1; - csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE - - (coffset & ~QCOW2_COMPRESSED_SECTOR_MASK); + qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize); buf = g_try_malloc(csize); if (!buf) { diff --git a/block/qcow2.h b/block/qcow2.h index 42a0058ab7..c0e1e83796 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -110,7 +110,6 @@ /* Defined in the qcow2 spec (compressed cluster descriptor) */ #define QCOW2_COMPRESSED_SECTOR_SIZE 512U -#define QCOW2_COMPRESSED_SECTOR_MASK (~(QCOW2_COMPRESSED_SECTOR_SIZE - 1ULL)) /* Must be at least 2 to cover COW */ #define MIN_L2_CACHE_SIZE 2 /* cache entries */ @@ -913,6 +912,8 @@ int qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset, int compressed_size, uint64_t *host_offset); +void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry, + uint64_t *coffset, int *csize); int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m); From a2debf6506266cc65014a1964c71059dce7b0118 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 14 Sep 2021 15:24:48 +0300 Subject: [PATCH 25/32] qcow2-refcount: introduce fix_l2_entry_by_zero() Split fix_l2_entry_by_zero() out of check_refcounts_l2() to be reused in further patch. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Reviewed-by: Hanna Reitz Message-Id: <20210914122454.141075-5-vsementsov@virtuozzo.com> Signed-off-by: Hanna Reitz --- block/qcow2-refcount.c | 87 +++++++++++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 27 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 66cbb94ef9..184b96ad63 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -1587,6 +1587,54 @@ enum { CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */ }; +/* + * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN. + * + * This function decrements res->corruptions on success, so the caller is + * responsible to increment res->corruptions prior to the call. + * + * On failure in-memory @l2_table may be modified. + */ +static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res, + uint64_t l2_offset, + uint64_t *l2_table, int l2_index, bool active, + bool *metadata_overlap) +{ + BDRVQcow2State *s = bs->opaque; + int ret; + int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t)); + uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s); + int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2; + uint64_t l2_entry = has_subclusters(s) ? 0 : QCOW_OFLAG_ZERO; + + set_l2_entry(s, l2_table, l2_index, l2_entry); + ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s), + false); + if (metadata_overlap) { + *metadata_overlap = ret < 0; + } + if (ret < 0) { + fprintf(stderr, "ERROR: Overlap check failed\n"); + goto fail; + } + + ret = bdrv_pwrite_sync(bs->file, l2e_offset, &l2_table[idx], + l2_entry_size(s)); + if (ret < 0) { + fprintf(stderr, "ERROR: Failed to overwrite L2 " + "table entry: %s\n", strerror(-ret)); + goto fail; + } + + res->corruptions--; + res->corruptions_fixed++; + return 0; + +fail: + res->check_errors++; + return ret; +} + /* * Increases the refcount in the given refcount table for the all clusters * referenced in the L2 table. While doing so, performs some checks on L2 @@ -1606,6 +1654,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, int i, ret; size_t l2_size_bytes = s->l2_size * l2_entry_size(s); g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes); + bool metadata_overlap; /* Read L2 table from disk */ ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size_bytes); @@ -1685,19 +1734,10 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", offset); if (fix & BDRV_FIX_ERRORS) { - int idx = i * (l2_entry_size(s) / sizeof(uint64_t)); - uint64_t l2e_offset = - l2_offset + (uint64_t)i * l2_entry_size(s); - int ign = active ? QCOW2_OL_ACTIVE_L2 : - QCOW2_OL_INACTIVE_L2; - - l2_entry = has_subclusters(s) ? 0 : QCOW_OFLAG_ZERO; - set_l2_entry(s, l2_table, i, l2_entry); - ret = qcow2_pre_write_overlap_check(bs, ign, - l2e_offset, l2_entry_size(s), false); - if (ret < 0) { - fprintf(stderr, "ERROR: Overlap check failed\n"); - res->check_errors++; + ret = fix_l2_entry_by_zero(bs, res, l2_offset, + l2_table, i, active, + &metadata_overlap); + if (metadata_overlap) { /* * Something is seriously wrong, so abort checking * this L2 table. @@ -1705,26 +1745,19 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, return ret; } - ret = bdrv_pwrite_sync(bs->file, l2e_offset, - &l2_table[idx], - l2_entry_size(s)); - if (ret < 0) { - fprintf(stderr, "ERROR: Failed to overwrite L2 " - "table entry: %s\n", strerror(-ret)); - res->check_errors++; - /* - * Do not abort, continue checking the rest of this - * L2 table's entries. - */ - } else { - res->corruptions--; - res->corruptions_fixed++; + if (ret == 0) { /* * Skip marking the cluster as used * (it is unused now). */ continue; } + + /* + * Failed to fix. + * Do not abort, continue checking the rest of this + * L2 table's entries. + */ } } else { fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is " From 5c3216c0460cbedd12994e584f93e0ea63a026ec Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 14 Sep 2021 15:24:49 +0300 Subject: [PATCH 26/32] qcow2-refcount: fix_l2_entry_by_zero(): also zero L2 entry bitmap We'll reuse the function to fix wrong L2 entry bitmap. Support it now. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Reviewed-by: Hanna Reitz Message-Id: <20210914122454.141075-6-vsementsov@virtuozzo.com> Signed-off-by: Hanna Reitz --- block/qcow2-refcount.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 184b96ad63..f48c5e1b5d 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -1588,7 +1588,8 @@ enum { }; /* - * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN. + * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN (or making all its present + * subclusters QCOW2_SUBCLUSTER_ZERO_PLAIN). * * This function decrements res->corruptions on success, so the caller is * responsible to increment res->corruptions prior to the call. @@ -1605,9 +1606,20 @@ static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res, int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t)); uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s); int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2; - uint64_t l2_entry = has_subclusters(s) ? 0 : QCOW_OFLAG_ZERO; - set_l2_entry(s, l2_table, l2_index, l2_entry); + if (has_subclusters(s)) { + uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, l2_index); + + /* Allocated subclusters become zero */ + l2_bitmap |= l2_bitmap << 32; + l2_bitmap &= QCOW_L2_BITMAP_ALL_ZEROES; + + set_l2_bitmap(s, l2_table, l2_index, l2_bitmap); + set_l2_entry(s, l2_table, l2_index, 0); + } else { + set_l2_entry(s, l2_table, l2_index, QCOW_OFLAG_ZERO); + } + ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s), false); if (metadata_overlap) { From 9631c7822ec60eff0701ebf151bd8b9bd5c1d5d4 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 14 Sep 2021 15:24:50 +0300 Subject: [PATCH 27/32] qcow2-refcount: check_refcounts_l2(): check l2_bitmap Check subcluster bitmap of the l2 entry for different types of clusters: - for compressed it must be zero - for allocated check consistency of two parts of the bitmap - for unallocated all subclusters should be unallocated (or zero-plain) Signed-off-by: Vladimir Sementsov-Ogievskiy Tested-by: Kirill Tkhai Message-Id: <20210914122454.141075-7-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Reviewed-by: Hanna Reitz Signed-off-by: Hanna Reitz --- block/qcow2-refcount.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index f48c5e1b5d..9a5ae3cac4 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -1661,7 +1661,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, int flags, BdrvCheckMode fix, bool active) { BDRVQcow2State *s = bs->opaque; - uint64_t l2_entry; + uint64_t l2_entry, l2_bitmap; uint64_t next_contiguous_offset = 0; int i, ret; size_t l2_size_bytes = s->l2_size * l2_entry_size(s); @@ -1681,6 +1681,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, uint64_t coffset; int csize; l2_entry = get_l2_entry(s, l2_table, i); + l2_bitmap = get_l2_bitmap(s, l2_table, i); switch (qcow2_get_cluster_type(bs, l2_entry)) { case QCOW2_CLUSTER_COMPRESSED: @@ -1700,6 +1701,14 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, break; } + if (l2_bitmap) { + fprintf(stderr, "ERROR compressed cluster %d with non-zero " + "subcluster allocation bitmap, entry=0x%" PRIx64 "\n", + i, l2_entry); + res->corruptions++; + break; + } + /* Mark cluster as used */ qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize); ret = qcow2_inc_refcounts_imrt( @@ -1727,13 +1736,19 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, { uint64_t offset = l2_entry & L2E_OFFSET_MASK; + if ((l2_bitmap >> 32) & l2_bitmap) { + res->corruptions++; + fprintf(stderr, "ERROR offset=%" PRIx64 ": Allocated " + "cluster has corrupted subcluster allocation bitmap\n", + offset); + } + /* Correct offsets are cluster aligned */ if (offset_into_cluster(s, offset)) { bool contains_data; res->corruptions++; if (has_subclusters(s)) { - uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, i); contains_data = (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC); } else { contains_data = !(l2_entry & QCOW_OFLAG_ZERO); @@ -1799,7 +1814,16 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, } case QCOW2_CLUSTER_ZERO_PLAIN: + /* Impossible when image has subclusters */ + assert(!l2_bitmap); + break; + case QCOW2_CLUSTER_UNALLOCATED: + if (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC) { + res->corruptions++; + fprintf(stderr, "ERROR: Unallocated " + "cluster has non-zero subcluster allocation map\n"); + } break; default: From 289ef5f219d1a94b8225c459dc65821b37637a4f Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 14 Sep 2021 15:24:51 +0300 Subject: [PATCH 28/32] qcow2-refcount: check_refcounts_l2(): check reserved bits Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Tested-by: Kirill Tkhai Reviewed-by: Hanna Reitz Message-Id: <20210914122454.141075-8-vsementsov@virtuozzo.com> [hreitz: Separated `type` declaration from statements] Signed-off-by: Hanna Reitz --- block/qcow2-refcount.c | 14 +++++++++++++- block/qcow2.h | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 9a5ae3cac4..bdac7b1780 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -1680,10 +1680,22 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, for (i = 0; i < s->l2_size; i++) { uint64_t coffset; int csize; + QCow2ClusterType type; + l2_entry = get_l2_entry(s, l2_table, i); l2_bitmap = get_l2_bitmap(s, l2_table, i); + type = qcow2_get_cluster_type(bs, l2_entry); - switch (qcow2_get_cluster_type(bs, l2_entry)) { + if (type != QCOW2_CLUSTER_COMPRESSED) { + /* Check reserved bits of Standard Cluster Descriptor */ + if (l2_entry & L2E_STD_RESERVED_MASK) { + fprintf(stderr, "ERROR found l2 entry with reserved bits set: " + "%" PRIx64 "\n", l2_entry); + res->corruptions++; + } + } + + switch (type) { case QCOW2_CLUSTER_COMPRESSED: /* Compressed clusters don't have QCOW_OFLAG_COPIED */ if (l2_entry & QCOW_OFLAG_COPIED) { diff --git a/block/qcow2.h b/block/qcow2.h index c0e1e83796..b8b1093b61 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -587,6 +587,7 @@ typedef enum QCow2MetadataOverlap { #define L1E_OFFSET_MASK 0x00fffffffffffe00ULL #define L2E_OFFSET_MASK 0x00fffffffffffe00ULL +#define L2E_STD_RESERVED_MASK 0x3f000000000001feULL #define REFT_OFFSET_MASK 0xfffffffffffffe00ULL From cd6efd60e9aaa0665ae76c0ec91298eeac2f4c25 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 14 Sep 2021 15:24:52 +0300 Subject: [PATCH 29/32] qcow2-refcount: improve style of check_refcounts_l1() - use g_autofree for l1_table - better name for size in bytes variable - reduce code blocks nesting - whitespaces, braces, newlines Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Hanna Reitz Message-Id: <20210914122454.141075-9-vsementsov@virtuozzo.com> Signed-off-by: Hanna Reitz --- block/qcow2-refcount.c | 98 +++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 48 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index bdac7b1780..3c89e09599 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -1862,71 +1862,73 @@ static int check_refcounts_l1(BlockDriverState *bs, int flags, BdrvCheckMode fix, bool active) { BDRVQcow2State *s = bs->opaque; - uint64_t *l1_table = NULL, l2_offset, l1_size2; + size_t l1_size_bytes = l1_size * L1E_SIZE; + g_autofree uint64_t *l1_table = NULL; + uint64_t l2_offset; int i, ret; - l1_size2 = l1_size * L1E_SIZE; + if (!l1_size) { + return 0; + } /* Mark L1 table as used */ ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size, - l1_table_offset, l1_size2); + l1_table_offset, l1_size_bytes); if (ret < 0) { - goto fail; + return ret; + } + + l1_table = g_try_malloc(l1_size_bytes); + if (l1_table == NULL) { + res->check_errors++; + return -ENOMEM; } /* Read L1 table entries from disk */ - if (l1_size2 > 0) { - l1_table = g_try_malloc(l1_size2); - if (l1_table == NULL) { - ret = -ENOMEM; - res->check_errors++; - goto fail; - } - ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); - if (ret < 0) { - fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); - res->check_errors++; - goto fail; - } - for(i = 0;i < l1_size; i++) - be64_to_cpus(&l1_table[i]); + ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size_bytes); + if (ret < 0) { + fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); + res->check_errors++; + return ret; + } + + for (i = 0; i < l1_size; i++) { + be64_to_cpus(&l1_table[i]); } /* Do the actual checks */ - for(i = 0; i < l1_size; i++) { - l2_offset = l1_table[i]; - if (l2_offset) { - /* Mark L2 table as used */ - l2_offset &= L1E_OFFSET_MASK; - ret = qcow2_inc_refcounts_imrt(bs, res, - refcount_table, refcount_table_size, - l2_offset, s->cluster_size); - if (ret < 0) { - goto fail; - } + for (i = 0; i < l1_size; i++) { + if (!l1_table[i]) { + continue; + } - /* L2 tables are cluster aligned */ - if (offset_into_cluster(s, l2_offset)) { - fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " - "cluster aligned; L1 entry corrupted\n", l2_offset); - res->corruptions++; - } + l2_offset = l1_table[i] & L1E_OFFSET_MASK; - /* Process and check L2 entries */ - ret = check_refcounts_l2(bs, res, refcount_table, - refcount_table_size, l2_offset, flags, - fix, active); - if (ret < 0) { - goto fail; - } + /* Mark L2 table as used */ + ret = qcow2_inc_refcounts_imrt(bs, res, + refcount_table, refcount_table_size, + l2_offset, s->cluster_size); + if (ret < 0) { + return ret; + } + + /* L2 tables are cluster aligned */ + if (offset_into_cluster(s, l2_offset)) { + fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " + "cluster aligned; L1 entry corrupted\n", l2_offset); + res->corruptions++; + } + + /* Process and check L2 entries */ + ret = check_refcounts_l2(bs, res, refcount_table, + refcount_table_size, l2_offset, flags, + fix, active); + if (ret < 0) { + return ret; } } - g_free(l1_table); - return 0; -fail: - g_free(l1_table); - return ret; + return 0; } /* From 98bc07d6cd525910c9aec30989fc82f70ddd620c Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 14 Sep 2021 15:24:53 +0300 Subject: [PATCH 30/32] qcow2-refcount: check_refcounts_l1(): check reserved bits Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Tested-by: Kirill Tkhai Reviewed-by: Hanna Reitz Message-Id: <20210914122454.141075-10-vsementsov@virtuozzo.com> Signed-off-by: Hanna Reitz --- block/qcow2-refcount.c | 6 ++++++ block/qcow2.h | 1 + 2 files changed, 7 insertions(+) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 3c89e09599..1c246b9227 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -1902,6 +1902,12 @@ static int check_refcounts_l1(BlockDriverState *bs, continue; } + if (l1_table[i] & L1E_RESERVED_MASK) { + fprintf(stderr, "ERROR found L1 entry with reserved bits set: " + "%" PRIx64 "\n", l1_table[i]); + res->corruptions++; + } + l2_offset = l1_table[i] & L1E_OFFSET_MASK; /* Mark L2 table as used */ diff --git a/block/qcow2.h b/block/qcow2.h index b8b1093b61..58fd7f1678 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -586,6 +586,7 @@ typedef enum QCow2MetadataOverlap { (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2) #define L1E_OFFSET_MASK 0x00fffffffffffe00ULL +#define L1E_RESERVED_MASK 0x7f000000000001ffULL #define L2E_OFFSET_MASK 0x00fffffffffffe00ULL #define L2E_STD_RESERVED_MASK 0x3f000000000001feULL From 8fba39515170752c3bcdbb95551d778c94095271 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 14 Sep 2021 15:24:54 +0300 Subject: [PATCH 31/32] qcow2-refcount: check_refblocks(): add separate message for reserved Split checking for reserved bits out of aligned offset check. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Tested-by: Kirill Tkhai Reviewed-by: Hanna Reitz Message-Id: <20210914122454.141075-11-vsementsov@virtuozzo.com> Signed-off-by: Hanna Reitz --- block/qcow2-refcount.c | 10 +++++++++- block/qcow2.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 1c246b9227..4614572252 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -2089,9 +2089,17 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, for(i = 0; i < s->refcount_table_size; i++) { uint64_t offset, cluster; - offset = s->refcount_table[i]; + offset = s->refcount_table[i] & REFT_OFFSET_MASK; cluster = offset >> s->cluster_bits; + if (s->refcount_table[i] & REFT_RESERVED_MASK) { + fprintf(stderr, "ERROR refcount table entry %" PRId64 " has " + "reserved bits set\n", i); + res->corruptions++; + *rebuild = true; + continue; + } + /* Refcount blocks are cluster aligned */ if (offset_into_cluster(s, offset)) { fprintf(stderr, "ERROR refcount block %" PRId64 " is not " diff --git a/block/qcow2.h b/block/qcow2.h index 58fd7f1678..fd48a89d45 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -591,6 +591,7 @@ typedef enum QCow2MetadataOverlap { #define L2E_STD_RESERVED_MASK 0x3f000000000001feULL #define REFT_OFFSET_MASK 0xfffffffffffffe00ULL +#define REFT_RESERVED_MASK 0x1ffULL #define INV_OFFSET (-1ULL) From 1899bf47375ad40555dcdff12ba49b4b8b82df38 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Mon, 13 Sep 2021 08:17:35 -0500 Subject: [PATCH 32/32] qemu-img: Add -F shorthand to convert Although we have long supported 'qemu-img convert -o backing_file=foo,backing_fmt=bar', the fact that we have a shortcut -B for backing_file but none for backing_fmt has made it more likely that users accidentally run into: qemu-img: warning: Deprecated use of backing file without explicit backing format when using -B instead of -o. For similarity with other qemu-img commands, such as create and compare, add '-F $fmt' as the shorthand for '-o backing_fmt=$fmt'. Update iotest 122 for coverage of both spellings. Signed-off-by: Eric Blake Message-Id: <20210913131735.1948339-1-eblake@redhat.com> Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Maxim Levitsky Signed-off-by: Hanna Reitz --- docs/tools/qemu-img.rst | 4 ++-- qemu-img-cmds.hx | 2 +- qemu-img.c | 10 +++++++--- tests/qemu-iotests/122 | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst index fe6c30d509..d58980aef8 100644 --- a/docs/tools/qemu-img.rst +++ b/docs/tools/qemu-img.rst @@ -415,7 +415,7 @@ Command description: 4 Error on reading data -.. option:: convert [--object OBJECTDEF] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps [--skip-broken-bitmaps]] [-U] [-C] [-c] [-p] [-q] [-n] [-f FMT] [-t CACHE] [-T SRC_CACHE] [-O OUTPUT_FMT] [-B BACKING_FILE] [-o OPTIONS] [-l SNAPSHOT_PARAM] [-S SPARSE_SIZE] [-r RATE_LIMIT] [-m NUM_COROUTINES] [-W] FILENAME [FILENAME2 [...]] OUTPUT_FILENAME +.. option:: convert [--object OBJECTDEF] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps [--skip-broken-bitmaps]] [-U] [-C] [-c] [-p] [-q] [-n] [-f FMT] [-t CACHE] [-T SRC_CACHE] [-O OUTPUT_FMT] [-B BACKING_FILE [-F backing_fmt]] [-o OPTIONS] [-l SNAPSHOT_PARAM] [-S SPARSE_SIZE] [-r RATE_LIMIT] [-m NUM_COROUTINES] [-W] FILENAME [FILENAME2 [...]] OUTPUT_FILENAME Convert the disk image *FILENAME* or a snapshot *SNAPSHOT_PARAM* to disk image *OUTPUT_FILENAME* using format *OUTPUT_FMT*. It can @@ -439,7 +439,7 @@ Command description: You can use the *BACKING_FILE* option to force the output image to be created as a copy on write image of the specified base image; the *BACKING_FILE* should have the same content as the input's base image, - however the path, image format, etc may differ. + however the path, image format (as given by *BACKING_FMT*), etc may differ. If a relative path name is given, the backing file is looked up relative to the directory containing *OUTPUT_FILENAME*. diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index b3620f29e5..4c4d94ab22 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -46,7 +46,7 @@ SRST ERST DEF("convert", img_convert, - "convert [--object objectdef] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-B backing_file] [-o options] [-l snapshot_param] [-S sparse_size] [-r rate_limit] [-m num_coroutines] [-W] [--salvage] filename [filename2 [...]] output_filename") + "convert [--object objectdef] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-B backing_file [-F backing_fmt]] [-o options] [-l snapshot_param] [-S sparse_size] [-r rate_limit] [-m num_coroutines] [-W] [--salvage] filename [filename2 [...]] output_filename") SRST .. option:: convert [--object OBJECTDEF] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f FMT] [-t CACHE] [-T SRC_CACHE] [-O OUTPUT_FMT] [-B BACKING_FILE] [-o OPTIONS] [-l SNAPSHOT_PARAM] [-S SPARSE_SIZE] [-r RATE_LIMIT] [-m NUM_COROUTINES] [-W] [--salvage] FILENAME [FILENAME2 [...]] OUTPUT_FILENAME ERST diff --git a/qemu-img.c b/qemu-img.c index e43a71a794..f036a1d428 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -2183,7 +2183,8 @@ static int img_convert(int argc, char **argv) int c, bs_i, flags, src_flags = BDRV_O_NO_SHARE; const char *fmt = NULL, *out_fmt = NULL, *cache = "unsafe", *src_cache = BDRV_DEFAULT_CACHE, *out_baseimg = NULL, - *out_filename, *out_baseimg_param, *snapshot_name = NULL; + *out_filename, *out_baseimg_param, *snapshot_name = NULL, + *backing_fmt = NULL; BlockDriver *drv = NULL, *proto_drv = NULL; BlockDriverInfo bdi; BlockDriverState *out_bs; @@ -2223,7 +2224,7 @@ static int img_convert(int argc, char **argv) {"skip-broken-bitmaps", no_argument, 0, OPTION_SKIP_BROKEN}, {0, 0, 0, 0} }; - c = getopt_long(argc, argv, ":hf:O:B:Cco:l:S:pt:T:qnm:WUr:", + c = getopt_long(argc, argv, ":hf:O:B:CcF:o:l:S:pt:T:qnm:WUr:", long_options, NULL); if (c == -1) { break; @@ -2253,6 +2254,9 @@ static int img_convert(int argc, char **argv) case 'c': s.compressed = true; break; + case 'F': + backing_fmt = optarg; + break; case 'o': if (accumulate_options(&options, optarg) < 0) { goto fail_getopt; @@ -2521,7 +2525,7 @@ static int img_convert(int argc, char **argv) qemu_opt_set_number(opts, BLOCK_OPT_SIZE, s.total_sectors * BDRV_SECTOR_SIZE, &error_abort); - ret = add_old_style_options(out_fmt, opts, out_baseimg, NULL); + ret = add_old_style_options(out_fmt, opts, out_baseimg, backing_fmt); if (ret < 0) { goto out; } diff --git a/tests/qemu-iotests/122 b/tests/qemu-iotests/122 index 5d550ed13e..efb260d822 100755 --- a/tests/qemu-iotests/122 +++ b/tests/qemu-iotests/122 @@ -67,7 +67,7 @@ echo _make_test_img -b "$TEST_IMG".base -F $IMGFMT $QEMU_IO -c "write -P 0 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir -$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base -o backing_fmt=$IMGFMT \ +$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base -F $IMGFMT \ "$TEST_IMG" "$TEST_IMG".orig $QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir $QEMU_IMG convert -O $IMGFMT -c -B "$TEST_IMG".base -o backing_fmt=$IMGFMT \