From 14a58a4e0c2e98a7d9232e1c229a531ca231133b Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 10 Feb 2015 15:02:31 -0500 Subject: [PATCH 01/12] qcow2: Respect new_block in alloc_refcount_block() When choosing a new place for the refcount table, alloc_refcount_block() tries to infer the number of clusters used so far from its argument cluster_index (which comes from the idea that if any cluster with an index greater than cluster_index was in use, the refcount table would have to be big enough already to describe cluster_index). However, there is a cluster that may be at or after cluster_index, and which is not covered by the refcount structures, and that is the new refcount block new_block. Therefore, it should be taken into account for the blocks_used calculation. Also, because new_block already describes (or is intended to describe) cluster_index, we may not put the new refcount structures there. Signed-off-by: Max Reitz Message-id: 1423598552-24301-2-git-send-email-mreitz@redhat.com Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Signed-off-by: Max Reitz --- block/qcow2-refcount.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index dc8d186a82..6cbae1d205 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -466,8 +466,20 @@ static int alloc_refcount_block(BlockDriverState *bs, */ BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW); - /* Calculate the number of refcount blocks needed so far */ - uint64_t blocks_used = DIV_ROUND_UP(cluster_index, s->refcount_block_size); + /* Calculate the number of refcount blocks needed so far; this will be the + * basis for calculating the index of the first cluster used for the + * self-describing refcount structures which we are about to create. + * + * Because we reached this point, there cannot be any refcount entries for + * cluster_index or higher indices yet. However, because new_block has been + * allocated to describe that cluster (and it will assume this role later + * on), we cannot use that index; also, new_block may actually have a higher + * cluster index than cluster_index, so it needs to be taken into account + * here (and 1 needs to be added to its value because that cluster is used). + */ + uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1, + (new_block >> s->cluster_bits) + 1), + s->refcount_block_size); if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) { return -EFBIG; From 0e8a371468ce24513b15a9ae362f12822e1973a3 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 10 Feb 2015 15:02:32 -0500 Subject: [PATCH 02/12] iotests: Add tests for refcount table growth Signed-off-by: Max Reitz Message-id: 1423598552-24301-3-git-send-email-mreitz@redhat.com Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Signed-off-by: Max Reitz --- tests/qemu-iotests/121 | 102 +++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/121.out | 23 +++++++++ tests/qemu-iotests/group | 1 + 3 files changed, 126 insertions(+) create mode 100755 tests/qemu-iotests/121 create mode 100644 tests/qemu-iotests/121.out diff --git a/tests/qemu-iotests/121 b/tests/qemu-iotests/121 new file mode 100755 index 0000000000..0912c3f0cb --- /dev/null +++ b/tests/qemu-iotests/121 @@ -0,0 +1,102 @@ +#!/bin/bash +# +# Test cases for qcow2 refcount table growth +# +# Copyright (C) 2015 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +# creator +owner=mreitz@redhat.com + +seq="$(basename $0)" +echo "QA output created by $seq" + +here="$PWD" +tmp=/tmp/$$ +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter + +_supported_fmt qcow2 +_supported_proto file +_supported_os Linux + +echo +echo '=== New refcount structures may not conflict with existing structures ===' + +echo +echo '--- Test 1 ---' +echo + +# Preallocation speeds up the write operation, but preallocating everything will +# destroy the purpose of the write; so preallocate one KB less than what would +# cause a reftable growth... +IMGOPTS='preallocation=metadata,cluster_size=1k' _make_test_img 64512K +# ...and make the image the desired size afterwards. +$QEMU_IMG resize "$TEST_IMG" 65M + +# The first write results in a growth of the refcount table during an allocation +# which has precisely the required size so that the new refcount block allocated +# in alloc_refcount_block() is right after cluster_index; this did lead to a +# different refcount block being written to disk (a zeroed cluster) than what is +# cached (a refblock with one entry having a refcount of 1), and the second +# write would then result in that cached cluster being marked dirty and then +# in it being written to disk. +# This should not happen, the new refcount structures may not conflict with +# new_block. +# (Note that for some reason, 'write 63M 1K' does not trigger the problem) +$QEMU_IO -c 'write 62M 1025K' -c 'write 64M 1M' "$TEST_IMG" | _filter_qemu_io + +_check_test_img + + +echo +echo '--- Test 2 ---' +echo + +IMGOPTS='preallocation=metadata,cluster_size=1k' _make_test_img 64513K +# This results in an L1 table growth which in turn results in some clusters at +# the start of the image becoming free +$QEMU_IMG resize "$TEST_IMG" 65M + +# This write results in a refcount table growth; but the refblock allocated +# immediately before that (new_block) takes cluster index 4 (which is now free) +# and is thus not self-describing (in contrast to test 1, where new_block was +# self-describing). The refcount table growth algorithm then used to place the +# new refcount structures at cluster index 65536 (which is the same as the +# cluster_index parameter in this case), allocating a new refcount block for +# that cluster while new_block already existed, leaking new_block. +# Therefore, the new refcount structures may not be put at cluster_index +# (because new_block already describes that cluster, and the new structures try +# to be self-describing). +$QEMU_IO -c 'write 63M 130K' "$TEST_IMG" | _filter_qemu_io + +_check_test_img + + +# success, all done +echo +echo '*** done' +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/121.out b/tests/qemu-iotests/121.out new file mode 100644 index 0000000000..ff18e2c618 --- /dev/null +++ b/tests/qemu-iotests/121.out @@ -0,0 +1,23 @@ +QA output created by 121 + +=== New refcount structures may not conflict with existing structures === + +--- Test 1 --- + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=66060288 preallocation='metadata' +Image resized. +wrote 1049600/1049600 bytes at offset 65011712 +1.001 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1048576/1048576 bytes at offset 67108864 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +No errors were found on the image. + +--- Test 2 --- + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=66061312 preallocation='metadata' +Image resized. +wrote 133120/133120 bytes at offset 66060288 +130 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +No errors were found on the image. + +*** done diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index 71f19d4ece..961c7cda33 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -120,5 +120,6 @@ 113 rw auto quick 114 rw auto quick 116 rw auto quick +121 rw auto 123 rw auto quick 128 rw auto quick From 4b4d7b072f0faf9008ca835af101338857b28394 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 5 Dec 2014 17:53:32 +0100 Subject: [PATCH 03/12] iotests: Test non-self-referential qcow2 refblocks It is easy to create only self-referential refblocks, but there are cases where that is impossible. This adds a test for two of those cases (combined in a single test case). Suggested-by: Eric Blake Signed-off-by: Max Reitz Message-id: 1417798412-15330-1-git-send-email-mreitz@redhat.com Reviewed-by: Eric Blake Signed-off-by: Max Reitz --- tests/qemu-iotests/115 | 95 ++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/115.out | 8 ++++ tests/qemu-iotests/group | 1 + 3 files changed, 104 insertions(+) create mode 100755 tests/qemu-iotests/115 create mode 100644 tests/qemu-iotests/115.out diff --git a/tests/qemu-iotests/115 b/tests/qemu-iotests/115 new file mode 100755 index 0000000000..a6be1876aa --- /dev/null +++ b/tests/qemu-iotests/115 @@ -0,0 +1,95 @@ +#!/bin/bash +# +# Test case for non-self-referential qcow2 refcount blocks +# +# Copyright (C) 2014 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +# creator +owner=mreitz@redhat.com + +seq="$(basename $0)" +echo "QA output created by $seq" + +here="$PWD" +tmp=/tmp/$$ +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter + +_supported_fmt qcow2 +_supported_proto file +_supported_os Linux +# This test relies on refcounts being 64 bits wide (which does not work with +# compat=0.10) +_unsupported_imgopts 'refcount_bits=\([^6]\|.\([^4]\|$\)\)' 'compat=0.10' + +echo +echo '=== Testing large refcount and L1 table ===' +echo + +# Create an image with an L1 table and a refcount table that each span twice the +# number of clusters which can be described by a single refblock; therefore, at +# least two refblocks cannot count their own refcounts because all the clusters +# they describe are part of the L1 table or refcount table. + +# One refblock can describe (with cluster_size=512 and refcount_bits=64) +# 512/8 = 64 clusters, therefore the L1 table should cover 128 clusters, which +# equals 128 * (512/8) = 8192 entries (actually, 8192 - 512/8 = 8129 would +# suffice, but it does not really matter). 8192 L2 tables can in turn describe +# 8192 * 512/8 = 524,288 clusters which cover a space of 256 MB. + +# Since with refcount_bits=64 every refcount block entry is 64 bits wide (just +# like the L2 table entries), the same calculation applies to the refcount table +# as well; the difference is that while for the L1 table the guest disk size is +# concerned, for the refcount table it is the image length that has to be at +# least 256 MB. We can achieve that by using preallocation=metadata for an image +# which has a guest disk size of 256 MB. + +IMGOPTS="$IMGOPTS,refcount_bits=64,cluster_size=512,preallocation=metadata" \ + _make_test_img 256M + +# We know for sure that the L1 and refcount tables do not overlap with any other +# structure because the metadata overlap checks would have caught that case. + +# Because qemu refuses to open qcow2 files whose L1 table does not cover the +# whole guest disk size, it is definitely large enough. On the other hand, to +# test whether the refcount table is large enough, we simply have to verify that +# indeed all the clusters are allocated, which is done by qemu-img check. + +# The final thing we need to test is whether the tables are actually covered by +# refcount blocks; since all clusters of the tables are referenced, we can use +# qemu-img check for that purpose, too. + +$QEMU_IMG check "$TEST_IMG" | \ + sed -e 's/^.* = \([0-9]\+\.[0-9]\+% allocated\).*\(clusters\)$/\1 \2/' \ + -e '/^Image end offset/d' + +# (Note that we cannot use _check_test_img because that function filters out the +# allocation status) + +# success, all done +echo '*** done' +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/115.out b/tests/qemu-iotests/115.out new file mode 100644 index 0000000000..7b2c5e02f5 --- /dev/null +++ b/tests/qemu-iotests/115.out @@ -0,0 +1,8 @@ +QA output created by 115 + +=== Testing large refcount and L1 table === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=268435456 preallocation='metadata' +No errors were found on the image. +100.00% allocated clusters +*** done diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index 961c7cda33..62621278e2 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -119,6 +119,7 @@ 112 rw auto 113 rw auto quick 114 rw auto quick +115 rw auto 116 rw auto quick 121 rw auto 123 rw auto quick From 5560625badb9e710577986de65ff4e4b413729a0 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Mon, 2 Mar 2015 19:36:46 +0800 Subject: [PATCH 04/12] monitor: Convert bdrv_find to blk_by_name Signed-off-by: Fam Zheng Message-id: 1425296209-1476-2-git-send-email-famz@redhat.com Reviewed-by: Max Reitz Signed-off-by: Max Reitz --- monitor.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/monitor.c b/monitor.c index c86a89e9b0..8b703f97be 100644 --- a/monitor.c +++ b/monitor.c @@ -73,6 +73,7 @@ #include "block/qapi.h" #include "qapi/qmp-event.h" #include "qapi-event.h" +#include "sysemu/block-backend.h" /* for hmp_info_irq/pic */ #if defined(TARGET_SPARC) @@ -5414,15 +5415,15 @@ int monitor_read_block_device_key(Monitor *mon, const char *device, BlockCompletionFunc *completion_cb, void *opaque) { - BlockDriverState *bs; + BlockBackend *blk; - bs = bdrv_find(device); - if (!bs) { + blk = blk_by_name(device); + if (!blk) { monitor_printf(mon, "Device not found %s\n", device); return -1; } - return monitor_read_bdrv_key_start(mon, bs, completion_cb, opaque); + return monitor_read_bdrv_key_start(mon, blk_bs(blk), completion_cb, opaque); } QemuOptsList qemu_mon_opts = { From c9ebaf744ea7785776c358400eb13c256a3c9db6 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Mon, 2 Mar 2015 19:36:47 +0800 Subject: [PATCH 05/12] migration: Convert bdrv_find to blk_by_name Signed-off-by: Fam Zheng Message-id: 1425296209-1476-3-git-send-email-famz@redhat.com Reviewed-by: Max Reitz Signed-off-by: Max Reitz --- migration/block.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/migration/block.c b/migration/block.c index 0c7610600b..085c0fae05 100644 --- a/migration/block.c +++ b/migration/block.c @@ -23,6 +23,7 @@ #include "migration/block.h" #include "migration/migration.h" #include "sysemu/blockdev.h" +#include "sysemu/block-backend.h" #include #define BLOCK_SIZE (1 << 20) @@ -783,6 +784,7 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) char device_name[256]; int64_t addr; BlockDriverState *bs, *bs_prev = NULL; + BlockBackend *blk; uint8_t *buf; int64_t total_sectors = 0; int nr_sectors; @@ -800,12 +802,13 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) qemu_get_buffer(f, (uint8_t *)device_name, len); device_name[len] = '\0'; - bs = bdrv_find(device_name); - if (!bs) { + blk = blk_by_name(device_name); + if (!blk) { fprintf(stderr, "Error unknown block device %s\n", device_name); return -EINVAL; } + bs = blk_bs(blk); if (bs != bs_prev) { bs_prev = bs; From a0e8544cf84cf193e455a9a3b7d9997dbc268b4b Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Mon, 2 Mar 2015 19:36:48 +0800 Subject: [PATCH 06/12] blockdev: Convert bdrv_find to blk_by_name Signed-off-by: Fam Zheng Message-id: 1425296209-1476-4-git-send-email-famz@redhat.com Reviewed-by: Max Reitz Signed-off-by: Max Reitz --- blockdev.c | 92 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 33 deletions(-) diff --git a/blockdev.c b/blockdev.c index b9c1c0cc1a..0b509854b0 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1016,18 +1016,18 @@ fail: void hmp_commit(Monitor *mon, const QDict *qdict) { const char *device = qdict_get_str(qdict, "device"); - BlockDriverState *bs; + BlockBackend *blk; int ret; if (!strcmp(device, "all")) { ret = bdrv_commit_all(); } else { - bs = bdrv_find(device); - if (!bs) { + blk = blk_by_name(device); + if (!blk) { monitor_printf(mon, "Device '%s' not found\n", device); return; } - ret = bdrv_commit(bs); + ret = bdrv_commit(blk_bs(blk)); } if (ret < 0) { monitor_printf(mon, "'commit' error for '%s': %s\n", device, @@ -1092,17 +1092,20 @@ SnapshotInfo *qmp_blockdev_snapshot_delete_internal_sync(const char *device, const char *name, Error **errp) { - BlockDriverState *bs = bdrv_find(device); + BlockDriverState *bs; + BlockBackend *blk; AioContext *aio_context; QEMUSnapshotInfo sn; Error *local_err = NULL; SnapshotInfo *info = NULL; int ret; - if (!bs) { + blk = blk_by_name(device); + if (!blk) { error_set(errp, QERR_DEVICE_NOT_FOUND, device); return NULL; } + bs = blk_bs(blk); if (!has_id) { id = NULL; @@ -1205,6 +1208,7 @@ static void internal_snapshot_prepare(BlkTransactionState *common, Error *local_err = NULL; const char *device; const char *name; + BlockBackend *blk; BlockDriverState *bs; QEMUSnapshotInfo old_sn, *sn; bool ret; @@ -1223,11 +1227,12 @@ static void internal_snapshot_prepare(BlkTransactionState *common, name = internal->name; /* 2. check for validation */ - bs = bdrv_find(device); - if (!bs) { + blk = blk_by_name(device); + if (!blk) { error_set(errp, QERR_DEVICE_NOT_FOUND, device); return; } + bs = blk_bs(blk); /* AioContext is released in .clean() */ state->aio_context = bdrv_get_aio_context(bs); @@ -1494,17 +1499,19 @@ static void drive_backup_prepare(BlkTransactionState *common, Error **errp) { DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common); BlockDriverState *bs; + BlockBackend *blk; DriveBackup *backup; Error *local_err = NULL; assert(common->action->kind == TRANSACTION_ACTION_KIND_DRIVE_BACKUP); backup = common->action->drive_backup; - bs = bdrv_find(backup->device); - if (!bs) { + blk = blk_by_name(backup->device); + if (!blk) { error_set(errp, QERR_DEVICE_NOT_FOUND, backup->device); return; } + bs = blk_bs(blk); /* AioContext is released in .clean() */ state->aio_context = bdrv_get_aio_context(bs); @@ -1559,22 +1566,25 @@ static void blockdev_backup_prepare(BlkTransactionState *common, Error **errp) BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, common); BlockdevBackup *backup; BlockDriverState *bs, *target; + BlockBackend *blk; Error *local_err = NULL; assert(common->action->kind == TRANSACTION_ACTION_KIND_BLOCKDEV_BACKUP); backup = common->action->blockdev_backup; - bs = bdrv_find(backup->device); - if (!bs) { + blk = blk_by_name(backup->device); + if (!blk) { error_set(errp, QERR_DEVICE_NOT_FOUND, backup->device); return; } + bs = blk_bs(blk); - target = bdrv_find(backup->target); - if (!target) { + blk = blk_by_name(backup->target); + if (!blk) { error_set(errp, QERR_DEVICE_NOT_FOUND, backup->target); return; } + target = blk_bs(blk); /* AioContext is released in .clean() */ state->aio_context = bdrv_get_aio_context(bs); @@ -1881,13 +1891,15 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd, { ThrottleConfig cfg; BlockDriverState *bs; + BlockBackend *blk; AioContext *aio_context; - bs = bdrv_find(device); - if (!bs) { + blk = blk_by_name(device); + if (!blk) { error_set(errp, QERR_DEVICE_NOT_FOUND, device); return; } + bs = blk_bs(blk); memset(&cfg, 0, sizeof(cfg)); cfg.buckets[THROTTLE_BPS_TOTAL].avg = bps; @@ -2091,6 +2103,7 @@ void qmp_block_stream(const char *device, bool has_on_error, BlockdevOnError on_error, Error **errp) { + BlockBackend *blk; BlockDriverState *bs; BlockDriverState *base_bs = NULL; AioContext *aio_context; @@ -2101,11 +2114,12 @@ void qmp_block_stream(const char *device, on_error = BLOCKDEV_ON_ERROR_REPORT; } - bs = bdrv_find(device); - if (!bs) { + blk = blk_by_name(device); + if (!blk) { error_set(errp, QERR_DEVICE_NOT_FOUND, device); return; } + bs = blk_bs(blk); aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); @@ -2155,6 +2169,7 @@ void qmp_block_commit(const char *device, bool has_speed, int64_t speed, Error **errp) { + BlockBackend *blk; BlockDriverState *bs; BlockDriverState *base_bs, *top_bs; AioContext *aio_context; @@ -2173,11 +2188,12 @@ void qmp_block_commit(const char *device, * live commit feature versions; for this to work, we must make sure to * perform the device lookup before any generic errors that may occur in a * scenario in which all optional arguments are omitted. */ - bs = bdrv_find(device); - if (!bs) { + blk = blk_by_name(device); + if (!blk) { error_set(errp, QERR_DEVICE_NOT_FOUND, device); return; } + bs = blk_bs(blk); aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); @@ -2258,6 +2274,7 @@ void qmp_drive_backup(const char *device, const char *target, bool has_on_target_error, BlockdevOnError on_target_error, Error **errp) { + BlockBackend *blk; BlockDriverState *bs; BlockDriverState *target_bs; BlockDriverState *source = NULL; @@ -2281,11 +2298,12 @@ void qmp_drive_backup(const char *device, const char *target, mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS; } - bs = bdrv_find(device); - if (!bs) { + blk = blk_by_name(device); + if (!blk) { error_set(errp, QERR_DEVICE_NOT_FOUND, device); return; } + bs = blk_bs(blk); aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); @@ -2385,6 +2403,7 @@ void qmp_blockdev_backup(const char *device, const char *target, BlockdevOnError on_target_error, Error **errp) { + BlockBackend *blk; BlockDriverState *bs; BlockDriverState *target_bs; Error *local_err = NULL; @@ -2400,20 +2419,22 @@ void qmp_blockdev_backup(const char *device, const char *target, on_target_error = BLOCKDEV_ON_ERROR_REPORT; } - bs = bdrv_find(device); - if (!bs) { + blk = blk_by_name(device); + if (!blk) { error_set(errp, QERR_DEVICE_NOT_FOUND, device); return; } + bs = blk_bs(blk); aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); - target_bs = bdrv_find(target); - if (!target_bs) { + blk = blk_by_name(target); + if (!blk) { error_set(errp, QERR_DEVICE_NOT_FOUND, target); goto out; } + target_bs = blk_bs(blk); bdrv_ref(target_bs); bdrv_set_aio_context(target_bs, aio_context); @@ -2442,6 +2463,7 @@ void qmp_drive_mirror(const char *device, const char *target, bool has_on_target_error, BlockdevOnError on_target_error, Error **errp) { + BlockBackend *blk; BlockDriverState *bs; BlockDriverState *source, *target_bs; AioContext *aio_context; @@ -2481,11 +2503,12 @@ void qmp_drive_mirror(const char *device, const char *target, return; } - bs = bdrv_find(device); - if (!bs) { + blk = blk_by_name(device); + if (!blk) { error_set(errp, QERR_DEVICE_NOT_FOUND, device); return; } + bs = blk_bs(blk); aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); @@ -2623,12 +2646,14 @@ out: static BlockJob *find_block_job(const char *device, AioContext **aio_context, Error **errp) { + BlockBackend *blk; BlockDriverState *bs; - bs = bdrv_find(device); - if (!bs) { + blk = blk_by_name(device); + if (!blk) { goto notfound; } + bs = blk_bs(blk); *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(*aio_context); @@ -2733,6 +2758,7 @@ void qmp_change_backing_file(const char *device, const char *backing_file, Error **errp) { + BlockBackend *blk; BlockDriverState *bs = NULL; AioContext *aio_context; BlockDriverState *image_bs = NULL; @@ -2741,12 +2767,12 @@ void qmp_change_backing_file(const char *device, int open_flags; int ret; - /* find the top layer BDS of the chain */ - bs = bdrv_find(device); - if (!bs) { + blk = blk_by_name(device); + if (!blk) { error_set(errp, QERR_DEVICE_NOT_FOUND, device); return; } + bs = blk_bs(blk); aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); From d51a2427f68a312b676edd0e9efce881649d1767 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Mon, 2 Mar 2015 19:36:49 +0800 Subject: [PATCH 07/12] block: Drop bdrv_find All callers are converted, so drop it. Signed-off-by: Fam Zheng Message-id: 1425296209-1476-5-git-send-email-famz@redhat.com Reviewed-by: Max Reitz Signed-off-by: Max Reitz --- block.c | 9 --------- include/block/block.h | 1 - 2 files changed, 10 deletions(-) diff --git a/block.c b/block.c index 10c32eb327..0fe97de568 100644 --- a/block.c +++ b/block.c @@ -3821,15 +3821,6 @@ void bdrv_iterate_format(void (*it)(void *opaque, const char *name), g_free(formats); } -/* This function is to find block backend bs */ -/* TODO convert callers to blk_by_name(), then remove */ -BlockDriverState *bdrv_find(const char *name) -{ - BlockBackend *blk = blk_by_name(name); - - return blk ? blk_bs(blk) : NULL; -} - /* This function is to find a node in the bs graph */ BlockDriverState *bdrv_find_node(const char *node_name) { diff --git a/include/block/block.h b/include/block/block.h index 473c746342..4c57d63fe2 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -381,7 +381,6 @@ int bdrv_media_changed(BlockDriverState *bs); void bdrv_lock_medium(BlockDriverState *bs, bool locked); void bdrv_eject(BlockDriverState *bs, bool eject_flag); const char *bdrv_get_format_name(BlockDriverState *bs); -BlockDriverState *bdrv_find(const char *name); BlockDriverState *bdrv_find_node(const char *node_name); BlockDeviceInfoList *bdrv_named_nodes_list(void); BlockDriverState *bdrv_lookup_bs(const char *device, From 2ec711dcd45effc8d583dee6ff92d94573aad75b Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Tue, 3 Mar 2015 11:41:52 +0100 Subject: [PATCH 08/12] block/vpc: optimize vpc_co_get_block_status *pnum can't be greater than s->block_size / BDRV_SECTOR_SIZE for allocated sectors since there is always a bitmap in between. Signed-off-by: Peter Lieven Reviewed-by: Max Reitz Message-id: 1425379316-19639-2-git-send-email-pl@kamp.de Signed-off-by: Max Reitz --- block/vpc.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/block/vpc.c b/block/vpc.c index 1533b6a64d..c8e17cb058 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -602,7 +602,7 @@ static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs, { BDRVVPCState *s = bs->opaque; VHDFooter *footer = (VHDFooter*) s->footer_buf; - int64_t start, offset, next; + int64_t start, offset; bool allocated; int n; @@ -626,20 +626,18 @@ static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs, *pnum += n; sector_num += n; nb_sectors -= n; - next = start + (*pnum * BDRV_SECTOR_SIZE); - + /* *pnum can't be greater than one block for allocated + * sectors since there is always a bitmap in between. */ + if (allocated) { + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; + } if (nb_sectors == 0) { break; } - offset = get_sector_offset(bs, sector_num, 0); - } while ((allocated && offset == next) || (!allocated && offset == -1)); + } while (offset == -1); - if (allocated) { - return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; - } else { - return 0; - } + return 0; } /* From 0444dceee48fed54e8334428fa57f9ff997736e8 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 3 Mar 2015 11:41:53 +0100 Subject: [PATCH 09/12] vpc: Ignore geometry for large images The CHS calculation as done per the VHD spec imposes a maximum image size of ~127 GB. Real VHD images exist that are larger than that. Apparently there are two separate non-standard ways to achieve this: You could use more heads than the spec does - this is the option that qemu-img create chooses. However, other images exist where the geometry is set to the maximum (65535/16/255), but the actual image size is larger. Until now, such images are truncated at 127 GB when opening them with qemu. This patch changes the vpc driver to ignore geometry in this case and only trust the size field in the header. Signed-off-by: Kevin Wolf [PL: Fixed maximum geometry in the commit msg] Signed-off-by: Peter Lieven Message-id: 1425379316-19639-3-git-send-email-pl@kamp.de Reviewed-by: Max Reitz Signed-off-by: Max Reitz --- block/vpc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/block/vpc.c b/block/vpc.c index c8e17cb058..1c9592ccf4 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -215,12 +215,10 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, bs->total_sectors = (int64_t) be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl; - /* images created with disk2vhd report a far higher virtual size - * than expected with the cyls * heads * sectors_per_cyl formula. - * use the footer->size instead if the image was created with - * disk2vhd. - */ - if (!strncmp(footer->creator_app, "d2v", 4)) { + /* Images that have exactly the maximum geometry are probably bigger and + * would be truncated if we adhered to the geometry for them. Rely on + * footer->size for them. */ + if (bs->total_sectors == 65535ULL * 16 * 255) { bs->total_sectors = be64_to_cpu(footer->size) / BDRV_SECTOR_SIZE; } From 690cbb095a17c429513890d991bc57c98dd83912 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Tue, 3 Mar 2015 11:41:54 +0100 Subject: [PATCH 10/12] block/vpc: make calculate_geometry spec conform The VHD spec [1] allows for total_sectors of 65535 x 16 x 255 (~127GB) represented by a CHS geometry. If total_sectors is greater than 65535 x 16 x 255 this geometry is set as a maximum. Qemu, Hyper-V and disk2vhd use this special geometry as an indicator to use the image current size from the footer as disk size. This patch changes vpc_create to effectively calculate a CxHxS geometry for the given image size if possible while rounding up if necessary. If the image size is too big to be represented in CHS we set the maximum and write the exact requested image size into the footer. This partly reverts commit 258d2edb, but leaves support for >127G disks intact. [1] http://download.microsoft.com/download/f/f/e/ffef50a5-07dd-4cf8-aaa3-442c0673a029/Virtual%20Hard%20Disk%20Format%20Spec_10_18_06.doc Signed-off-by: Peter Lieven Message-id: 1425379316-19639-4-git-send-email-pl@kamp.de Reviewed-by: Max Reitz Signed-off-by: Max Reitz --- block/vpc.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/block/vpc.c b/block/vpc.c index 1c9592ccf4..7e99a69137 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -46,6 +46,7 @@ enum vhd_type { #define VHD_TIMESTAMP_BASE 946684800 #define VHD_MAX_SECTORS (65535LL * 255 * 255) +#define VHD_MAX_GEOMETRY (65535LL * 16 * 255) // always big-endian typedef struct vhd_footer { @@ -218,7 +219,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, /* Images that have exactly the maximum geometry are probably bigger and * would be truncated if we adhered to the geometry for them. Rely on * footer->size for them. */ - if (bs->total_sectors == 65535ULL * 16 * 255) { + if (bs->total_sectors == VHD_MAX_GEOMETRY) { bs->total_sectors = be64_to_cpu(footer->size) / BDRV_SECTOR_SIZE; } @@ -655,26 +656,20 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, { uint32_t cyls_times_heads; - /* Allow a maximum disk size of approximately 2 TB */ - if (total_sectors > 65535LL * 255 * 255) { - return -EFBIG; - } + total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY); - if (total_sectors > 65535 * 16 * 63) { + if (total_sectors >= 65535LL * 16 * 63) { *secs_per_cyl = 255; - if (total_sectors > 65535 * 16 * 255) { - *heads = 255; - } else { - *heads = 16; - } + *heads = 16; cyls_times_heads = total_sectors / *secs_per_cyl; } else { *secs_per_cyl = 17; cyls_times_heads = total_sectors / *secs_per_cyl; *heads = (cyls_times_heads + 1023) / 1024; - if (*heads < 4) + if (*heads < 4) { *heads = 4; + } if (cyls_times_heads >= (*heads * 1024) || *heads > 16) { *secs_per_cyl = 31; @@ -830,20 +825,28 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) * Calculate matching total_size and geometry. Increase the number of * sectors requested until we get enough (or fail). This ensures that * qemu-img convert doesn't truncate images, but rather rounds up. + * + * If the image size can't be represented by a spec conform CHS geometry, + * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use + * the image size from the VHD footer to calculate total_sectors. */ - total_sectors = total_size / BDRV_SECTOR_SIZE; + total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE); for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) { - if (calculate_geometry(total_sectors + i, &cyls, &heads, - &secs_per_cyl)) - { + calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl); + } + + if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) { + total_sectors = total_size / BDRV_SECTOR_SIZE; + /* Allow a maximum disk size of approximately 2 TB */ + if (total_sectors > VHD_MAX_SECTORS) { ret = -EFBIG; goto out; } + } else { + total_sectors = (int64_t)cyls * heads * secs_per_cyl; + total_size = total_sectors * BDRV_SECTOR_SIZE; } - total_sectors = (int64_t) cyls * heads * secs_per_cyl; - total_size = total_sectors * BDRV_SECTOR_SIZE; - /* Prepare the Hard Disk Footer */ memset(buf, 0, 1024); From 03671ded3078ebad1f0a701042622fd5e8918bc9 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Tue, 3 Mar 2015 11:41:55 +0100 Subject: [PATCH 11/12] block/vpc: rename footer->size -> footer->current_size the field is named current size in the spec. Name it accordingly. Signed-off-by: Peter Lieven Reviewed-by: Max Reitz Message-id: 1425379316-19639-5-git-send-email-pl@kamp.de Signed-off-by: Max Reitz --- block/vpc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/vpc.c b/block/vpc.c index 7e99a69137..226be02e26 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -66,7 +66,7 @@ typedef struct vhd_footer { char creator_os[4]; // "Wi2k" uint64_t orig_size; - uint64_t size; + uint64_t current_size; uint16_t cyls; uint8_t heads; @@ -218,9 +218,10 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, /* Images that have exactly the maximum geometry are probably bigger and * would be truncated if we adhered to the geometry for them. Rely on - * footer->size for them. */ + * footer->current_size for them. */ if (bs->total_sectors == VHD_MAX_GEOMETRY) { - bs->total_sectors = be64_to_cpu(footer->size) / BDRV_SECTOR_SIZE; + bs->total_sectors = be64_to_cpu(footer->current_size) / + BDRV_SECTOR_SIZE; } /* Allow a maximum disk size of approximately 2 TB */ @@ -868,7 +869,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) footer->major = cpu_to_be16(0x0005); footer->minor = cpu_to_be16(0x0003); footer->orig_size = cpu_to_be64(total_size); - footer->size = cpu_to_be64(total_size); + footer->current_size = cpu_to_be64(total_size); footer->cyls = cpu_to_be16(cyls); footer->heads = heads; footer->secs_per_cyl = secs_per_cyl; From 304ee9174f4761d3f4da611352a815ab27baba06 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Tue, 3 Mar 2015 11:41:56 +0100 Subject: [PATCH 12/12] block/vpc: remove disabled code from get_sector_offset The code to check the bitmap for the allocation status of each sector has been "disabled by reason" ever since the vpc driver existed. The reason might be that we might end up reading sector by sector in vpc_read if we really used it. This would be a performance desaster. The current code would furthermore not work if the disabled parts get reactivated since vpc_read and vpc_write only use get_sector_offset to check the allocation status of the first sector of a read/write operation. This might lead to sectors incorrectly treated as zero in vpc_read and to sectors getting allocated twice in vpc_write. Signed-off-by: Peter Lieven Message-id: 1425379316-19639-6-git-send-email-pl@kamp.de Reviewed-by: Max Reitz Signed-off-by: Max Reitz --- block/vpc.c | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/block/vpc.c b/block/vpc.c index 226be02e26..43e768ee76 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -376,38 +376,6 @@ static inline int64_t get_sector_offset(BlockDriverState *bs, bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size); } -// printf("sector: %" PRIx64 ", index: %x, offset: %x, bioff: %" PRIx64 ", bloff: %" PRIx64 "\n", -// sector_num, pagetable_index, pageentry_index, -// bitmap_offset, block_offset); - -// disabled by reason -#if 0 -#ifdef CACHE - if (bitmap_offset != s->last_bitmap) - { - lseek(s->fd, bitmap_offset, SEEK_SET); - - s->last_bitmap = bitmap_offset; - - // Scary! Bitmap is stored as big endian 32bit entries, - // while we used to look it up byte by byte - read(s->fd, s->pageentry_u8, 512); - for (i = 0; i < 128; i++) - be32_to_cpus(&s->pageentry_u32[i]); - } - - if ((s->pageentry_u8[pageentry_index / 8] >> (pageentry_index % 8)) & 1) - return -1; -#else - lseek(s->fd, bitmap_offset + (pageentry_index / 8), SEEK_SET); - - read(s->fd, &bitmap_entry, 1); - - if ((bitmap_entry >> (pageentry_index % 8)) & 1) - return -1; // not allocated -#endif -#endif - return block_offset; }