vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves, cluster allocation code path. Before, when we need a new cluster, get_cluster_offset truncates the image to bdrv_getlength() + cluster_size, and returns the offset of added area, i.e. the image length before truncating. This is not efficient, so it's now rewritten as: - Save the extent file length when opening. - When allocating cluster, use the saved length as cluster offset. - Don't truncate image, because we'll anyway write data there: just write any data at the EOF position, in descending priority: * New user data (cluster allocation happens in a write request). * Filling data in the beginning and/or ending of the new cluster, if not covered by user data: either backing file content (COW), or zero for standalone images. One major benifit of this change is, on host mounted NFS images, even over a fast network, ftruncate is slow (see the example below). This change significantly speeds up cluster allocation. Comparing by converting a cirros image (296M) to VMDK on an NFS mount point, over 1Gbe LAN: $ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk Before: real 0m21.796s user 0m0.130s sys 0m0.483s After: real 0m2.017s user 0m0.047s sys 0m0.190s We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and get a little more documentation in function comments. Tested that this passes qemu-iotests for all VMDK subformats. Signed-off-by: Fam Zheng <famz@redhat.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
This commit is contained in:
parent
a8d8a1a06c
commit
c6ac36e145
228
block/vmdk.c
228
block/vmdk.c
@ -106,6 +106,7 @@ typedef struct VmdkExtent {
|
||||
uint32_t l2_cache_counts[L2_CACHE_SIZE];
|
||||
|
||||
int64_t cluster_sectors;
|
||||
int64_t next_cluster_sector;
|
||||
char *type;
|
||||
} VmdkExtent;
|
||||
|
||||
@ -124,7 +125,6 @@ typedef struct BDRVVmdkState {
|
||||
} BDRVVmdkState;
|
||||
|
||||
typedef struct VmdkMetaData {
|
||||
uint32_t offset;
|
||||
unsigned int l1_index;
|
||||
unsigned int l2_index;
|
||||
unsigned int l2_offset;
|
||||
@ -397,6 +397,7 @@ static int vmdk_add_extent(BlockDriverState *bs,
|
||||
{
|
||||
VmdkExtent *extent;
|
||||
BDRVVmdkState *s = bs->opaque;
|
||||
int64_t length;
|
||||
|
||||
if (cluster_sectors > 0x200000) {
|
||||
/* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
|
||||
@ -412,6 +413,11 @@ static int vmdk_add_extent(BlockDriverState *bs,
|
||||
return -EFBIG;
|
||||
}
|
||||
|
||||
length = bdrv_getlength(file);
|
||||
if (length < 0) {
|
||||
return length;
|
||||
}
|
||||
|
||||
s->extents = g_realloc(s->extents,
|
||||
(s->num_extents + 1) * sizeof(VmdkExtent));
|
||||
extent = &s->extents[s->num_extents];
|
||||
@ -427,6 +433,8 @@ static int vmdk_add_extent(BlockDriverState *bs,
|
||||
extent->l1_entry_sectors = l2_size * cluster_sectors;
|
||||
extent->l2_size = l2_size;
|
||||
extent->cluster_sectors = flat ? sectors : cluster_sectors;
|
||||
extent->next_cluster_sector =
|
||||
ROUND_UP(DIV_ROUND_UP(length, BDRV_SECTOR_SIZE), cluster_sectors);
|
||||
|
||||
if (s->num_extents > 1) {
|
||||
extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
|
||||
@ -951,57 +959,97 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* get_whole_cluster
|
||||
*
|
||||
* Copy backing file's cluster that covers @sector_num, otherwise write zero,
|
||||
* to the cluster at @cluster_sector_num.
|
||||
*
|
||||
* If @skip_start_sector < @skip_end_sector, the relative range
|
||||
* [@skip_start_sector, @skip_end_sector) is not copied or written, and leave
|
||||
* it for call to write user data in the request.
|
||||
*/
|
||||
static int get_whole_cluster(BlockDriverState *bs,
|
||||
VmdkExtent *extent,
|
||||
uint64_t cluster_offset,
|
||||
uint64_t offset,
|
||||
bool allocate)
|
||||
VmdkExtent *extent,
|
||||
uint64_t cluster_sector_num,
|
||||
uint64_t sector_num,
|
||||
uint64_t skip_start_sector,
|
||||
uint64_t skip_end_sector)
|
||||
{
|
||||
int ret = VMDK_OK;
|
||||
uint8_t *whole_grain = NULL;
|
||||
int64_t cluster_bytes;
|
||||
uint8_t *whole_grain;
|
||||
|
||||
/* For COW, align request sector_num to cluster start */
|
||||
sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors);
|
||||
cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
|
||||
whole_grain = qemu_blockalign(bs, cluster_bytes);
|
||||
|
||||
if (!bs->backing_hd) {
|
||||
memset(whole_grain, 0, skip_start_sector << BDRV_SECTOR_BITS);
|
||||
memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0,
|
||||
cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS));
|
||||
}
|
||||
|
||||
assert(skip_end_sector <= extent->cluster_sectors);
|
||||
/* we will be here if it's first write on non-exist grain(cluster).
|
||||
* try to read from parent image, if exist */
|
||||
if (bs->backing_hd) {
|
||||
whole_grain =
|
||||
qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS);
|
||||
if (!vmdk_is_cid_valid(bs)) {
|
||||
ret = VMDK_ERROR;
|
||||
goto exit;
|
||||
}
|
||||
if (bs->backing_hd && !vmdk_is_cid_valid(bs)) {
|
||||
ret = VMDK_ERROR;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
/* floor offset to cluster */
|
||||
offset -= offset % (extent->cluster_sectors * 512);
|
||||
ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
|
||||
extent->cluster_sectors);
|
||||
if (ret < 0) {
|
||||
ret = VMDK_ERROR;
|
||||
goto exit;
|
||||
/* Read backing data before skip range */
|
||||
if (skip_start_sector > 0) {
|
||||
if (bs->backing_hd) {
|
||||
ret = bdrv_read(bs->backing_hd, sector_num,
|
||||
whole_grain, skip_start_sector);
|
||||
if (ret < 0) {
|
||||
ret = VMDK_ERROR;
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
/* Write grain only into the active image */
|
||||
ret = bdrv_write(extent->file, cluster_offset, whole_grain,
|
||||
extent->cluster_sectors);
|
||||
ret = bdrv_write(extent->file, cluster_sector_num, whole_grain,
|
||||
skip_start_sector);
|
||||
if (ret < 0) {
|
||||
ret = VMDK_ERROR;
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
/* Read backing data after skip range */
|
||||
if (skip_end_sector < extent->cluster_sectors) {
|
||||
if (bs->backing_hd) {
|
||||
ret = bdrv_read(bs->backing_hd, sector_num + skip_end_sector,
|
||||
whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
|
||||
extent->cluster_sectors - skip_end_sector);
|
||||
if (ret < 0) {
|
||||
ret = VMDK_ERROR;
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
ret = bdrv_write(extent->file, cluster_sector_num + skip_end_sector,
|
||||
whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
|
||||
extent->cluster_sectors - skip_end_sector);
|
||||
if (ret < 0) {
|
||||
ret = VMDK_ERROR;
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
exit:
|
||||
qemu_vfree(whole_grain);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
|
||||
static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
|
||||
uint32_t offset)
|
||||
{
|
||||
uint32_t offset;
|
||||
QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
|
||||
offset = cpu_to_le32(m_data->offset);
|
||||
offset = cpu_to_le32(offset);
|
||||
/* update L2 table */
|
||||
if (bdrv_pwrite_sync(
|
||||
extent->file,
|
||||
((int64_t)m_data->l2_offset * 512)
|
||||
+ (m_data->l2_index * sizeof(m_data->offset)),
|
||||
+ (m_data->l2_index * sizeof(offset)),
|
||||
&offset, sizeof(offset)) < 0) {
|
||||
return VMDK_ERROR;
|
||||
}
|
||||
@ -1011,7 +1059,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
|
||||
if (bdrv_pwrite_sync(
|
||||
extent->file,
|
||||
((int64_t)m_data->l2_offset * 512)
|
||||
+ (m_data->l2_index * sizeof(m_data->offset)),
|
||||
+ (m_data->l2_index * sizeof(offset)),
|
||||
&offset, sizeof(offset)) < 0) {
|
||||
return VMDK_ERROR;
|
||||
}
|
||||
@ -1023,17 +1071,41 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
|
||||
return VMDK_OK;
|
||||
}
|
||||
|
||||
/**
|
||||
* get_cluster_offset
|
||||
*
|
||||
* Look up cluster offset in extent file by sector number, and store in
|
||||
* @cluster_offset.
|
||||
*
|
||||
* For flat extents, the start offset as parsed from the description file is
|
||||
* returned.
|
||||
*
|
||||
* For sparse extents, look up in L1, L2 table. If allocate is true, return an
|
||||
* offset for a new cluster and update L2 cache. If there is a backing file,
|
||||
* COW is done before returning; otherwise, zeroes are written to the allocated
|
||||
* cluster. Both COW and zero writing skips the sector range
|
||||
* [@skip_start_sector, @skip_end_sector) passed in by caller, because caller
|
||||
* has new data to write there.
|
||||
*
|
||||
* Returns: VMDK_OK if cluster exists and mapped in the image.
|
||||
* VMDK_UNALLOC if cluster is not mapped and @allocate is false.
|
||||
* VMDK_ERROR if failed.
|
||||
*/
|
||||
static int get_cluster_offset(BlockDriverState *bs,
|
||||
VmdkExtent *extent,
|
||||
VmdkMetaData *m_data,
|
||||
uint64_t offset,
|
||||
int allocate,
|
||||
uint64_t *cluster_offset)
|
||||
VmdkExtent *extent,
|
||||
VmdkMetaData *m_data,
|
||||
uint64_t offset,
|
||||
bool allocate,
|
||||
uint64_t *cluster_offset,
|
||||
uint64_t skip_start_sector,
|
||||
uint64_t skip_end_sector)
|
||||
{
|
||||
unsigned int l1_index, l2_offset, l2_index;
|
||||
int min_index, i, j;
|
||||
uint32_t min_count, *l2_table;
|
||||
bool zeroed = false;
|
||||
int64_t ret;
|
||||
int32_t cluster_sector;
|
||||
|
||||
if (m_data) {
|
||||
m_data->valid = 0;
|
||||
@ -1087,52 +1159,41 @@ static int get_cluster_offset(BlockDriverState *bs,
|
||||
extent->l2_cache_counts[min_index] = 1;
|
||||
found:
|
||||
l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
|
||||
*cluster_offset = le32_to_cpu(l2_table[l2_index]);
|
||||
cluster_sector = le32_to_cpu(l2_table[l2_index]);
|
||||
|
||||
if (m_data) {
|
||||
m_data->valid = 1;
|
||||
m_data->l1_index = l1_index;
|
||||
m_data->l2_index = l2_index;
|
||||
m_data->offset = *cluster_offset;
|
||||
m_data->l2_offset = l2_offset;
|
||||
m_data->l2_cache_entry = &l2_table[l2_index];
|
||||
}
|
||||
if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) {
|
||||
if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
|
||||
zeroed = true;
|
||||
}
|
||||
|
||||
if (!*cluster_offset || zeroed) {
|
||||
if (!cluster_sector || zeroed) {
|
||||
if (!allocate) {
|
||||
return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
|
||||
}
|
||||
|
||||
/* Avoid the L2 tables update for the images that have snapshots. */
|
||||
*cluster_offset = bdrv_getlength(extent->file);
|
||||
if (!extent->compressed) {
|
||||
bdrv_truncate(
|
||||
extent->file,
|
||||
*cluster_offset + (extent->cluster_sectors << 9)
|
||||
);
|
||||
}
|
||||
|
||||
*cluster_offset >>= 9;
|
||||
l2_table[l2_index] = cpu_to_le32(*cluster_offset);
|
||||
cluster_sector = extent->next_cluster_sector;
|
||||
extent->next_cluster_sector += extent->cluster_sectors;
|
||||
|
||||
/* First of all we write grain itself, to avoid race condition
|
||||
* that may to corrupt the image.
|
||||
* This problem may occur because of insufficient space on host disk
|
||||
* or inappropriate VM shutdown.
|
||||
*/
|
||||
if (get_whole_cluster(
|
||||
bs, extent, *cluster_offset, offset, allocate) == -1) {
|
||||
return VMDK_ERROR;
|
||||
}
|
||||
|
||||
if (m_data) {
|
||||
m_data->offset = *cluster_offset;
|
||||
ret = get_whole_cluster(bs, extent,
|
||||
cluster_sector,
|
||||
offset >> BDRV_SECTOR_BITS,
|
||||
skip_start_sector, skip_end_sector);
|
||||
if (ret) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
*cluster_offset <<= 9;
|
||||
*cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
|
||||
return VMDK_OK;
|
||||
}
|
||||
|
||||
@ -1167,7 +1228,8 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
|
||||
}
|
||||
qemu_co_mutex_lock(&s->lock);
|
||||
ret = get_cluster_offset(bs, extent, NULL,
|
||||
sector_num * 512, 0, &offset);
|
||||
sector_num * 512, false, &offset,
|
||||
0, 0);
|
||||
qemu_co_mutex_unlock(&s->lock);
|
||||
|
||||
switch (ret) {
|
||||
@ -1320,9 +1382,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
|
||||
if (!extent) {
|
||||
return -EIO;
|
||||
}
|
||||
ret = get_cluster_offset(
|
||||
bs, extent, NULL,
|
||||
sector_num << 9, 0, &cluster_offset);
|
||||
ret = get_cluster_offset(bs, extent, NULL,
|
||||
sector_num << 9, false, &cluster_offset,
|
||||
0, 0);
|
||||
extent_begin_sector = extent->end_sector - extent->sectors;
|
||||
extent_relative_sector_num = sector_num - extent_begin_sector;
|
||||
index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
|
||||
@ -1403,12 +1465,17 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
|
||||
if (!extent) {
|
||||
return -EIO;
|
||||
}
|
||||
ret = get_cluster_offset(
|
||||
bs,
|
||||
extent,
|
||||
&m_data,
|
||||
sector_num << 9, !extent->compressed,
|
||||
&cluster_offset);
|
||||
extent_begin_sector = extent->end_sector - extent->sectors;
|
||||
extent_relative_sector_num = sector_num - extent_begin_sector;
|
||||
index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
|
||||
n = extent->cluster_sectors - index_in_cluster;
|
||||
if (n > nb_sectors) {
|
||||
n = nb_sectors;
|
||||
}
|
||||
ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
|
||||
!(extent->compressed || zeroed),
|
||||
&cluster_offset,
|
||||
index_in_cluster, index_in_cluster + n);
|
||||
if (extent->compressed) {
|
||||
if (ret == VMDK_OK) {
|
||||
/* Refuse write to allocated cluster for streamOptimized */
|
||||
@ -1417,24 +1484,13 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
|
||||
return -EIO;
|
||||
} else {
|
||||
/* allocate */
|
||||
ret = get_cluster_offset(
|
||||
bs,
|
||||
extent,
|
||||
&m_data,
|
||||
sector_num << 9, 1,
|
||||
&cluster_offset);
|
||||
ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
|
||||
true, &cluster_offset, 0, 0);
|
||||
}
|
||||
}
|
||||
if (ret == VMDK_ERROR) {
|
||||
return -EINVAL;
|
||||
}
|
||||
extent_begin_sector = extent->end_sector - extent->sectors;
|
||||
extent_relative_sector_num = sector_num - extent_begin_sector;
|
||||
index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
|
||||
n = extent->cluster_sectors - index_in_cluster;
|
||||
if (n > nb_sectors) {
|
||||
n = nb_sectors;
|
||||
}
|
||||
if (zeroed) {
|
||||
/* Do zeroed write, buf is ignored */
|
||||
if (extent->has_zero_grain &&
|
||||
@ -1442,9 +1498,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
|
||||
n >= extent->cluster_sectors) {
|
||||
n = extent->cluster_sectors;
|
||||
if (!zero_dry_run) {
|
||||
m_data.offset = VMDK_GTE_ZEROED;
|
||||
/* update L2 tables */
|
||||
if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
|
||||
if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
|
||||
!= VMDK_OK) {
|
||||
return -EIO;
|
||||
}
|
||||
}
|
||||
@ -1460,7 +1516,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
|
||||
}
|
||||
if (m_data.valid) {
|
||||
/* update L2 tables */
|
||||
if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
|
||||
if (vmdk_L2update(extent, &m_data,
|
||||
cluster_offset >> BDRV_SECTOR_BITS)
|
||||
!= VMDK_OK) {
|
||||
return -EIO;
|
||||
}
|
||||
}
|
||||
@ -2019,7 +2077,7 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
|
||||
}
|
||||
ret = get_cluster_offset(bs, extent, NULL,
|
||||
sector_num << BDRV_SECTOR_BITS,
|
||||
0, &cluster_offset);
|
||||
false, &cluster_offset, 0, 0);
|
||||
if (ret == VMDK_ERROR) {
|
||||
fprintf(stderr,
|
||||
"ERROR: could not get cluster_offset for sector %"
|
||||
|
Loading…
Reference in New Issue
Block a user