vmdk: Optimize cluster allocation

This drops the unnecessary bdrv_truncate() from the cluster allocation
code path, and also improves that path.

Before, when we needed a new cluster, get_cluster_offset truncated the
image to bdrv_getlength() + cluster_size, and returned the offset of the
added area, i.e. the image length before truncating.

This is not efficient, so it's now rewritten as:

  - Save the extent file length when opening.

  - When allocating cluster, use the saved length as cluster offset.

  - Don't truncate the image, because we will write data there anyway:
    just write any data at the EOF position, in descending priority:

    * New user data (cluster allocation happens in a write request).

    * Filling data in the beginning and/or ending of the new cluster, if
      not covered by user data: either backing file content (COW), or
      zero for standalone images.

One major benefit of this change is that, on host-mounted NFS images,
ftruncate is slow even over a fast network (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:

    $ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk

    Before:
        real    0m21.796s
        user    0m0.130s
        sys     0m0.483s

    After:
        real    0m2.017s
        user    0m0.047s
        sys     0m0.190s

We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.

Tested that this passes qemu-iotests for all VMDK subformats.

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
This commit is contained in:
Fam Zheng 2014-07-30 14:39:10 +08:00 committed by Kevin Wolf
parent a8d8a1a06c
commit c6ac36e145

View File

@ -106,6 +106,7 @@ typedef struct VmdkExtent {
uint32_t l2_cache_counts[L2_CACHE_SIZE]; uint32_t l2_cache_counts[L2_CACHE_SIZE];
int64_t cluster_sectors; int64_t cluster_sectors;
int64_t next_cluster_sector;
char *type; char *type;
} VmdkExtent; } VmdkExtent;
@ -124,7 +125,6 @@ typedef struct BDRVVmdkState {
} BDRVVmdkState; } BDRVVmdkState;
typedef struct VmdkMetaData { typedef struct VmdkMetaData {
uint32_t offset;
unsigned int l1_index; unsigned int l1_index;
unsigned int l2_index; unsigned int l2_index;
unsigned int l2_offset; unsigned int l2_offset;
@ -397,6 +397,7 @@ static int vmdk_add_extent(BlockDriverState *bs,
{ {
VmdkExtent *extent; VmdkExtent *extent;
BDRVVmdkState *s = bs->opaque; BDRVVmdkState *s = bs->opaque;
int64_t length;
if (cluster_sectors > 0x200000) { if (cluster_sectors > 0x200000) {
/* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */ /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
@ -412,6 +413,11 @@ static int vmdk_add_extent(BlockDriverState *bs,
return -EFBIG; return -EFBIG;
} }
length = bdrv_getlength(file);
if (length < 0) {
return length;
}
s->extents = g_realloc(s->extents, s->extents = g_realloc(s->extents,
(s->num_extents + 1) * sizeof(VmdkExtent)); (s->num_extents + 1) * sizeof(VmdkExtent));
extent = &s->extents[s->num_extents]; extent = &s->extents[s->num_extents];
@ -427,6 +433,8 @@ static int vmdk_add_extent(BlockDriverState *bs,
extent->l1_entry_sectors = l2_size * cluster_sectors; extent->l1_entry_sectors = l2_size * cluster_sectors;
extent->l2_size = l2_size; extent->l2_size = l2_size;
extent->cluster_sectors = flat ? sectors : cluster_sectors; extent->cluster_sectors = flat ? sectors : cluster_sectors;
extent->next_cluster_sector =
ROUND_UP(DIV_ROUND_UP(length, BDRV_SECTOR_SIZE), cluster_sectors);
if (s->num_extents > 1) { if (s->num_extents > 1) {
extent->end_sector = (*(extent - 1)).end_sector + extent->sectors; extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
@ -951,57 +959,97 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
} }
} }
/**
* get_whole_cluster
*
* Copy backing file's cluster that covers @sector_num, otherwise write zero,
* to the cluster at @cluster_sector_num.
*
* If @skip_start_sector < @skip_end_sector, the relative range
* [@skip_start_sector, @skip_end_sector) is not copied or written, and leave
* it for call to write user data in the request.
*/
static int get_whole_cluster(BlockDriverState *bs, static int get_whole_cluster(BlockDriverState *bs,
VmdkExtent *extent, VmdkExtent *extent,
uint64_t cluster_offset, uint64_t cluster_sector_num,
uint64_t offset, uint64_t sector_num,
bool allocate) uint64_t skip_start_sector,
uint64_t skip_end_sector)
{ {
int ret = VMDK_OK; int ret = VMDK_OK;
uint8_t *whole_grain = NULL; int64_t cluster_bytes;
uint8_t *whole_grain;
/* For COW, align request sector_num to cluster start */
sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors);
cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
whole_grain = qemu_blockalign(bs, cluster_bytes);
if (!bs->backing_hd) {
memset(whole_grain, 0, skip_start_sector << BDRV_SECTOR_BITS);
memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0,
cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS));
}
assert(skip_end_sector <= extent->cluster_sectors);
/* we will be here if it's first write on non-exist grain(cluster). /* we will be here if it's first write on non-exist grain(cluster).
* try to read from parent image, if exist */ * try to read from parent image, if exist */
if (bs->backing_hd) { if (bs->backing_hd && !vmdk_is_cid_valid(bs)) {
whole_grain = ret = VMDK_ERROR;
qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS); goto exit;
if (!vmdk_is_cid_valid(bs)) { }
ret = VMDK_ERROR;
goto exit;
}
/* floor offset to cluster */ /* Read backing data before skip range */
offset -= offset % (extent->cluster_sectors * 512); if (skip_start_sector > 0) {
ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain, if (bs->backing_hd) {
extent->cluster_sectors); ret = bdrv_read(bs->backing_hd, sector_num,
if (ret < 0) { whole_grain, skip_start_sector);
ret = VMDK_ERROR; if (ret < 0) {
goto exit; ret = VMDK_ERROR;
goto exit;
}
} }
ret = bdrv_write(extent->file, cluster_sector_num, whole_grain,
/* Write grain only into the active image */ skip_start_sector);
ret = bdrv_write(extent->file, cluster_offset, whole_grain,
extent->cluster_sectors);
if (ret < 0) { if (ret < 0) {
ret = VMDK_ERROR; ret = VMDK_ERROR;
goto exit; goto exit;
} }
} }
/* Read backing data after skip range */
if (skip_end_sector < extent->cluster_sectors) {
if (bs->backing_hd) {
ret = bdrv_read(bs->backing_hd, sector_num + skip_end_sector,
whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
extent->cluster_sectors - skip_end_sector);
if (ret < 0) {
ret = VMDK_ERROR;
goto exit;
}
}
ret = bdrv_write(extent->file, cluster_sector_num + skip_end_sector,
whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
extent->cluster_sectors - skip_end_sector);
if (ret < 0) {
ret = VMDK_ERROR;
goto exit;
}
}
exit: exit:
qemu_vfree(whole_grain); qemu_vfree(whole_grain);
return ret; return ret;
} }
static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data) static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
uint32_t offset)
{ {
uint32_t offset; offset = cpu_to_le32(offset);
QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
offset = cpu_to_le32(m_data->offset);
/* update L2 table */ /* update L2 table */
if (bdrv_pwrite_sync( if (bdrv_pwrite_sync(
extent->file, extent->file,
((int64_t)m_data->l2_offset * 512) ((int64_t)m_data->l2_offset * 512)
+ (m_data->l2_index * sizeof(m_data->offset)), + (m_data->l2_index * sizeof(offset)),
&offset, sizeof(offset)) < 0) { &offset, sizeof(offset)) < 0) {
return VMDK_ERROR; return VMDK_ERROR;
} }
@ -1011,7 +1059,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
if (bdrv_pwrite_sync( if (bdrv_pwrite_sync(
extent->file, extent->file,
((int64_t)m_data->l2_offset * 512) ((int64_t)m_data->l2_offset * 512)
+ (m_data->l2_index * sizeof(m_data->offset)), + (m_data->l2_index * sizeof(offset)),
&offset, sizeof(offset)) < 0) { &offset, sizeof(offset)) < 0) {
return VMDK_ERROR; return VMDK_ERROR;
} }
@ -1023,17 +1071,41 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
return VMDK_OK; return VMDK_OK;
} }
/**
* get_cluster_offset
*
* Look up cluster offset in extent file by sector number, and store in
* @cluster_offset.
*
* For flat extents, the start offset as parsed from the description file is
* returned.
*
* For sparse extents, look up in L1, L2 table. If allocate is true, return an
* offset for a new cluster and update L2 cache. If there is a backing file,
* COW is done before returning; otherwise, zeroes are written to the allocated
* cluster. Both COW and zero writing skips the sector range
* [@skip_start_sector, @skip_end_sector) passed in by caller, because caller
* has new data to write there.
*
* Returns: VMDK_OK if cluster exists and mapped in the image.
* VMDK_UNALLOC if cluster is not mapped and @allocate is false.
* VMDK_ERROR if failed.
*/
static int get_cluster_offset(BlockDriverState *bs, static int get_cluster_offset(BlockDriverState *bs,
VmdkExtent *extent, VmdkExtent *extent,
VmdkMetaData *m_data, VmdkMetaData *m_data,
uint64_t offset, uint64_t offset,
int allocate, bool allocate,
uint64_t *cluster_offset) uint64_t *cluster_offset,
uint64_t skip_start_sector,
uint64_t skip_end_sector)
{ {
unsigned int l1_index, l2_offset, l2_index; unsigned int l1_index, l2_offset, l2_index;
int min_index, i, j; int min_index, i, j;
uint32_t min_count, *l2_table; uint32_t min_count, *l2_table;
bool zeroed = false; bool zeroed = false;
int64_t ret;
int32_t cluster_sector;
if (m_data) { if (m_data) {
m_data->valid = 0; m_data->valid = 0;
@ -1087,52 +1159,41 @@ static int get_cluster_offset(BlockDriverState *bs,
extent->l2_cache_counts[min_index] = 1; extent->l2_cache_counts[min_index] = 1;
found: found:
l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
*cluster_offset = le32_to_cpu(l2_table[l2_index]); cluster_sector = le32_to_cpu(l2_table[l2_index]);
if (m_data) { if (m_data) {
m_data->valid = 1; m_data->valid = 1;
m_data->l1_index = l1_index; m_data->l1_index = l1_index;
m_data->l2_index = l2_index; m_data->l2_index = l2_index;
m_data->offset = *cluster_offset;
m_data->l2_offset = l2_offset; m_data->l2_offset = l2_offset;
m_data->l2_cache_entry = &l2_table[l2_index]; m_data->l2_cache_entry = &l2_table[l2_index];
} }
if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) { if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
zeroed = true; zeroed = true;
} }
if (!*cluster_offset || zeroed) { if (!cluster_sector || zeroed) {
if (!allocate) { if (!allocate) {
return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
} }
/* Avoid the L2 tables update for the images that have snapshots. */ cluster_sector = extent->next_cluster_sector;
*cluster_offset = bdrv_getlength(extent->file); extent->next_cluster_sector += extent->cluster_sectors;
if (!extent->compressed) {
bdrv_truncate(
extent->file,
*cluster_offset + (extent->cluster_sectors << 9)
);
}
*cluster_offset >>= 9;
l2_table[l2_index] = cpu_to_le32(*cluster_offset);
/* First of all we write grain itself, to avoid race condition /* First of all we write grain itself, to avoid race condition
* that may to corrupt the image. * that may to corrupt the image.
* This problem may occur because of insufficient space on host disk * This problem may occur because of insufficient space on host disk
* or inappropriate VM shutdown. * or inappropriate VM shutdown.
*/ */
if (get_whole_cluster( ret = get_whole_cluster(bs, extent,
bs, extent, *cluster_offset, offset, allocate) == -1) { cluster_sector,
return VMDK_ERROR; offset >> BDRV_SECTOR_BITS,
} skip_start_sector, skip_end_sector);
if (ret) {
if (m_data) { return ret;
m_data->offset = *cluster_offset;
} }
} }
*cluster_offset <<= 9; *cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
return VMDK_OK; return VMDK_OK;
} }
@ -1167,7 +1228,8 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
} }
qemu_co_mutex_lock(&s->lock); qemu_co_mutex_lock(&s->lock);
ret = get_cluster_offset(bs, extent, NULL, ret = get_cluster_offset(bs, extent, NULL,
sector_num * 512, 0, &offset); sector_num * 512, false, &offset,
0, 0);
qemu_co_mutex_unlock(&s->lock); qemu_co_mutex_unlock(&s->lock);
switch (ret) { switch (ret) {
@ -1320,9 +1382,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
if (!extent) { if (!extent) {
return -EIO; return -EIO;
} }
ret = get_cluster_offset( ret = get_cluster_offset(bs, extent, NULL,
bs, extent, NULL, sector_num << 9, false, &cluster_offset,
sector_num << 9, 0, &cluster_offset); 0, 0);
extent_begin_sector = extent->end_sector - extent->sectors; extent_begin_sector = extent->end_sector - extent->sectors;
extent_relative_sector_num = sector_num - extent_begin_sector; extent_relative_sector_num = sector_num - extent_begin_sector;
index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
@ -1403,12 +1465,17 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
if (!extent) { if (!extent) {
return -EIO; return -EIO;
} }
ret = get_cluster_offset( extent_begin_sector = extent->end_sector - extent->sectors;
bs, extent_relative_sector_num = sector_num - extent_begin_sector;
extent, index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
&m_data, n = extent->cluster_sectors - index_in_cluster;
sector_num << 9, !extent->compressed, if (n > nb_sectors) {
&cluster_offset); n = nb_sectors;
}
ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
!(extent->compressed || zeroed),
&cluster_offset,
index_in_cluster, index_in_cluster + n);
if (extent->compressed) { if (extent->compressed) {
if (ret == VMDK_OK) { if (ret == VMDK_OK) {
/* Refuse write to allocated cluster for streamOptimized */ /* Refuse write to allocated cluster for streamOptimized */
@ -1417,24 +1484,13 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
return -EIO; return -EIO;
} else { } else {
/* allocate */ /* allocate */
ret = get_cluster_offset( ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
bs, true, &cluster_offset, 0, 0);
extent,
&m_data,
sector_num << 9, 1,
&cluster_offset);
} }
} }
if (ret == VMDK_ERROR) { if (ret == VMDK_ERROR) {
return -EINVAL; return -EINVAL;
} }
extent_begin_sector = extent->end_sector - extent->sectors;
extent_relative_sector_num = sector_num - extent_begin_sector;
index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
n = extent->cluster_sectors - index_in_cluster;
if (n > nb_sectors) {
n = nb_sectors;
}
if (zeroed) { if (zeroed) {
/* Do zeroed write, buf is ignored */ /* Do zeroed write, buf is ignored */
if (extent->has_zero_grain && if (extent->has_zero_grain &&
@ -1442,9 +1498,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
n >= extent->cluster_sectors) { n >= extent->cluster_sectors) {
n = extent->cluster_sectors; n = extent->cluster_sectors;
if (!zero_dry_run) { if (!zero_dry_run) {
m_data.offset = VMDK_GTE_ZEROED;
/* update L2 tables */ /* update L2 tables */
if (vmdk_L2update(extent, &m_data) != VMDK_OK) { if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
!= VMDK_OK) {
return -EIO; return -EIO;
} }
} }
@ -1460,7 +1516,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
} }
if (m_data.valid) { if (m_data.valid) {
/* update L2 tables */ /* update L2 tables */
if (vmdk_L2update(extent, &m_data) != VMDK_OK) { if (vmdk_L2update(extent, &m_data,
cluster_offset >> BDRV_SECTOR_BITS)
!= VMDK_OK) {
return -EIO; return -EIO;
} }
} }
@ -2019,7 +2077,7 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
} }
ret = get_cluster_offset(bs, extent, NULL, ret = get_cluster_offset(bs, extent, NULL,
sector_num << BDRV_SECTOR_BITS, sector_num << BDRV_SECTOR_BITS,
0, &cluster_offset); false, &cluster_offset, 0, 0);
if (ret == VMDK_ERROR) { if (ret == VMDK_ERROR) {
fprintf(stderr, fprintf(stderr,
"ERROR: could not get cluster_offset for sector %" "ERROR: could not get cluster_offset for sector %"