2004-08-01 23:59:26 +02:00
|
|
|
/*
|
|
|
|
* Block driver for the VMDK format
|
2007-09-16 23:08:06 +02:00
|
|
|
*
|
2004-08-01 23:59:26 +02:00
|
|
|
* Copyright (c) 2004 Fabrice Bellard
|
2005-04-26 23:08:00 +02:00
|
|
|
* Copyright (c) 2005 Filip Navara
|
2007-09-16 23:08:06 +02:00
|
|
|
*
|
2004-08-01 23:59:26 +02:00
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
|
|
* in the Software without restriction, including without limitation the rights
|
|
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
|
|
* furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice shall be included in
|
|
|
|
* all copies or substantial portions of the Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
* THE SOFTWARE.
|
|
|
|
*/
|
2007-01-24 22:05:24 +01:00
|
|
|
|
2016-01-18 19:01:42 +01:00
|
|
|
#include "qemu/osdep.h"
|
include/qemu/osdep.h: Don't include qapi/error.h
Commit 57cb38b included qapi/error.h into qemu/osdep.h to get the
Error typedef. Since then, we've moved to include qemu/osdep.h
everywhere. Its file comment explains: "To avoid getting into
possible circular include dependencies, this file should not include
any other QEMU headers, with the exceptions of config-host.h,
compiler.h, os-posix.h and os-win32.h, all of which are doing a
similar job to this file and are under similar constraints."
qapi/error.h doesn't do a similar job, and it doesn't adhere to
similar constraints: it includes qapi-types.h. That's in excess of
100KiB of crap most .c files don't actually need.
Add the typedef to qemu/typedefs.h, and include that instead of
qapi/error.h. Include qapi/error.h in .c files that need it and don't
get it now. Include qapi-types.h in qom/object.h for uint16List.
Update scripts/clean-includes accordingly. Update it further to match
reality: replace config.h by config-target.h, add sysemu/os-posix.h,
sysemu/os-win32.h. Update the list of includes in the qemu/osdep.h
comment quoted above similarly.
This reduces the number of objects depending on qapi/error.h from "all
of them" to less than a third. Unfortunately, the number depending on
qapi-types.h shrinks only a little. More work is needed for that one.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
[Fix compilation without the spice devel packages. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-03-14 09:01:28 +01:00
|
|
|
#include "qapi/error.h"
|
2012-12-17 18:19:44 +01:00
|
|
|
#include "block/block_int.h"
|
2016-03-08 15:57:05 +01:00
|
|
|
#include "sysemu/block-backend.h"
|
2019-02-01 20:29:26 +01:00
|
|
|
#include "qapi/qmp/qdict.h"
|
2015-03-17 17:22:46 +01:00
|
|
|
#include "qapi/qmp/qerror.h"
|
2015-03-17 18:29:20 +01:00
|
|
|
#include "qemu/error-report.h"
|
2012-12-17 18:20:00 +01:00
|
|
|
#include "qemu/module.h"
|
2018-02-01 12:18:46 +01:00
|
|
|
#include "qemu/option.h"
|
2016-03-15 17:22:36 +01:00
|
|
|
#include "qemu/bswap.h"
|
2017-04-06 12:00:28 +02:00
|
|
|
#include "migration/blocker.h"
|
2016-03-20 18:16:19 +01:00
|
|
|
#include "qemu/cutils.h"
|
2011-11-20 12:34:30 +01:00
|
|
|
#include <zlib.h>
|
2004-08-01 23:59:26 +02:00
|
|
|
|
|
|
|
/*
 * Image magic numbers.  vmdk_probe() loads the first four bytes of the image
 * as a big-endian 32-bit value and compares it with these constants, so a
 * matching file starts with the ASCII bytes "COWD" or "KDMV" respectively.
 */
#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')

/*
 * NOTE(review): presumably the value stored in VMDK4Header.compressAlgorithm
 * for deflate-compressed extents - confirm at the use site.
 */
#define VMDK4_COMPRESSION_DEFLATE 1

/*
 * VMDK4 flag bits.
 * NOTE(review): presumably tested against VMDK4Header.flags - confirm.
 */
#define VMDK4_FLAG_NL_DETECT (1 << 0)
#define VMDK4_FLAG_RGD (1 << 1)
/* Zeroed-grain enable bit */
#define VMDK4_FLAG_ZERO_GRAIN (1 << 2)
#define VMDK4_FLAG_COMPRESS (1 << 16)
#define VMDK4_FLAG_MARKER (1 << 17)

/*
 * All-ones 64-bit sentinel.
 * NOTE(review): the name suggests it marks a grain directory located at the
 * end of the file - confirm where gd_offset is parsed.
 */
#define VMDK4_GD_AT_END 0xffffffffffffffffULL

/* 2^32 sectors */
#define VMDK_EXTENT_MAX_SECTORS (1ULL << 32)

/* NOTE(review): presumably a grain table entry value meaning "zeroed" */
#define VMDK_GTE_ZEROED 0x1

/* VMDK internal error codes */
#define VMDK_OK 0
#define VMDK_ERROR (-1)
/* Cluster not allocated */
#define VMDK_UNALLOC (-2)
#define VMDK_ZEROED (-3)

/* Name of the "zeroed_grain" block option */
#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain"
|
|
|
|
|
2004-08-01 23:59:26 +02:00
|
|
|
/*
 * Header structure for the VMDK3 flavour of the format: ten consecutive
 * 32-bit fields, declared QEMU_PACKED.
 *
 * NOTE(review): presumably this mirrors the bytes that follow VMDK3_MAGIC in
 * the image file; field meanings are inferred from their names only -
 * confirm against the code that reads the header.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint32_t disk_sectors;
    uint32_t granularity;
    uint32_t l1dir_offset;
    uint32_t l1dir_size;
    uint32_t file_sectors;
    /* Disk geometry */
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} QEMU_PACKED VMDK3Header;
|
2004-08-01 23:59:26 +02:00
|
|
|
|
|
|
|
/*
 * Header structure for the VMDK4 flavour of the format, declared
 * QEMU_PACKED.
 *
 * The packing matters for the layout: rgd_offset (64-bit) directly follows
 * the lone 32-bit num_gtes_per_gt, and compressAlgorithm (16-bit) follows
 * five single bytes, so natural alignment would otherwise insert padding.
 *
 * NOTE(review): presumably this mirrors the bytes that follow VMDK4_MAGIC in
 * the image file - confirm against the code that reads the header.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint64_t capacity;
    uint64_t granularity;
    uint64_t desc_offset;
    uint64_t desc_size;
    /* Number of GrainTableEntries per GrainTable */
    uint32_t num_gtes_per_gt;
    uint64_t rgd_offset;
    uint64_t gd_offset;
    uint64_t grain_offset;
    char filler[1];
    char check_bytes[4];
    uint16_t compressAlgorithm;
} QEMU_PACKED VMDK4Header;
|
2004-08-01 23:59:26 +02:00
|
|
|
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit provides only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain is not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
/*
 * Constant header of an seSparse (Space Efficient sparse) snapshot extent.
 *
 * The structure is 26 64-bit fields (208 bytes) followed by 304 bytes of
 * padding, i.e. 512 bytes in total, and is declared QEMU_PACKED.
 *
 * The format is not documented by VMware; according to the commit that
 * introduced seSparse support, the layout was derived from ESXi log output
 * in which:
 *   - the constant magic is 0xcafebabe and the version is 2.1;
 *   - sizes are expressed in sectors;
 *   - every region of the file is described as an <offset : size> pair,
 *     which is what the *_offset / *_size field pairs below hold.
 */
typedef struct VMDKSESparseConstHeader {
    uint64_t magic;
    uint64_t version;
    uint64_t capacity;
    uint64_t grain_size;
    uint64_t grain_table_size;
    uint64_t flags;
    uint64_t reserved1;
    uint64_t reserved2;
    uint64_t reserved3;
    uint64_t reserved4;

    /* <offset : size> descriptors of the regions of the file */
    uint64_t volatile_header_offset;
    uint64_t volatile_header_size;
    uint64_t journal_header_offset;
    uint64_t journal_header_size;
    uint64_t journal_offset;
    uint64_t journal_size;
    uint64_t grain_dir_offset;
    uint64_t grain_dir_size;
    uint64_t grain_tables_offset;
    uint64_t grain_tables_size;
    uint64_t free_bitmap_offset;
    uint64_t free_bitmap_size;
    uint64_t backmap_offset;
    uint64_t backmap_size;
    uint64_t grains_offset;
    uint64_t grains_size;

    /* Pads the 208 bytes above up to 512 bytes */
    uint8_t pad[304];
} QEMU_PACKED VMDKSESparseConstHeader;
|
|
|
|
|
|
|
|
/*
 * Volatile header of an seSparse snapshot extent: four 64-bit fields
 * (32 bytes) followed by 480 bytes of padding, i.e. 512 bytes in total,
 * declared QEMU_PACKED.
 *
 * In the ESXi log output quoted by the commit that introduced seSparse
 * support, the volatile magic is 0xcafecafe and the remaining three fields
 * appear as FreeGTNumber, nextTxnSeqNumber and replayJournal.
 */
typedef struct VMDKSESparseVolatileHeader {
    uint64_t magic;
    uint64_t free_gt_number;
    uint64_t next_txn_seq_number;
    uint64_t replay_journal;
    /* Pads the 32 bytes above up to 512 bytes */
    uint8_t pad[480];
} QEMU_PACKED VMDKSESparseVolatileHeader;
|
|
|
|
|
2004-08-01 23:59:26 +02:00
|
|
|
/* Number of slots in the l2_cache_offsets[] and l2_cache_counts[] arrays of VmdkExtent */
#define L2_CACHE_SIZE 16
|
|
|
|
|
2011-07-12 13:56:28 +02:00
|
|
|
typedef struct VmdkExtent {
|
2015-06-15 13:50:20 +02:00
|
|
|
BdrvChild *file;
|
2011-07-12 13:56:28 +02:00
|
|
|
bool flat;
|
2011-08-12 17:19:30 +02:00
|
|
|
bool compressed;
|
|
|
|
bool has_marker;
|
2013-05-02 04:25:23 +02:00
|
|
|
bool has_zero_grain;
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit proivdes only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain in not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
bool sesparse;
|
|
|
|
uint64_t sesparse_l2_tables_offset;
|
|
|
|
uint64_t sesparse_clusters_offset;
|
|
|
|
int32_t entry_size;
|
2013-05-02 04:25:23 +02:00
|
|
|
int version;
|
2011-07-12 13:56:28 +02:00
|
|
|
int64_t sectors;
|
|
|
|
int64_t end_sector;
|
2011-07-19 02:38:22 +02:00
|
|
|
int64_t flat_start_offset;
|
2004-08-01 23:59:26 +02:00
|
|
|
int64_t l1_table_offset;
|
2005-04-26 23:08:00 +02:00
|
|
|
int64_t l1_backup_table_offset;
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit proivdes only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain in not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
void *l1_table;
|
2005-04-26 23:08:00 +02:00
|
|
|
uint32_t *l1_backup_table;
|
2004-08-01 23:59:26 +02:00
|
|
|
unsigned int l1_size;
|
|
|
|
uint32_t l1_entry_sectors;
|
|
|
|
|
|
|
|
unsigned int l2_size;
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit proivdes only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain in not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
void *l2_cache;
|
2004-08-01 23:59:26 +02:00
|
|
|
uint32_t l2_cache_offsets[L2_CACHE_SIZE];
|
|
|
|
uint32_t l2_cache_counts[L2_CACHE_SIZE];
|
|
|
|
|
2013-09-23 11:18:29 +02:00
|
|
|
int64_t cluster_sectors;
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
int64_t next_cluster_sector;
|
2013-10-31 03:06:23 +01:00
|
|
|
char *type;
|
2011-07-12 13:56:28 +02:00
|
|
|
} VmdkExtent;
|
|
|
|
|
|
|
|
typedef struct BDRVVmdkState {
|
2011-10-20 13:16:21 +02:00
|
|
|
CoMutex lock;
|
2013-08-06 09:44:48 +02:00
|
|
|
uint64_t desc_offset;
|
2011-07-12 13:56:34 +02:00
|
|
|
bool cid_updated;
|
2013-10-18 07:17:19 +02:00
|
|
|
bool cid_checked;
|
2013-10-31 03:06:23 +01:00
|
|
|
uint32_t cid;
|
2007-01-24 22:05:24 +01:00
|
|
|
uint32_t parent_cid;
|
2011-07-12 13:56:28 +02:00
|
|
|
int num_extents;
|
|
|
|
/* Extent array with num_extents entries, ascend ordered by address */
|
|
|
|
VmdkExtent *extents;
|
2011-11-22 16:50:27 +01:00
|
|
|
Error *migration_blocker;
|
2013-10-31 03:06:23 +01:00
|
|
|
char *create_type;
|
2004-08-01 23:59:26 +02:00
|
|
|
} BDRVVmdkState;
|
|
|
|
|
2007-06-18 17:01:30 +02:00
|
|
|
/*
 * Bookkeeping for one cluster lookup/allocation.
 * NOTE(review): the users of this struct are outside this section; the
 * field meanings below are inferred from the names - confirm.
 */
typedef struct VmdkMetaData {
    unsigned int l1_index;      /* presumably index into the L1 table */
    unsigned int l2_index;      /* presumably index into the L2 table */
    unsigned int l2_offset;     /* presumably location of the L2 table */
    int valid;                  /* non-zero when the fields above are usable */
    uint32_t *l2_cache_entry;   /* points at a cached L2 entry */
} VmdkMetaData;
|
|
|
|
|
2011-08-12 17:19:30 +02:00
|
|
|
/*
 * Packed header that precedes variable-length data.
 *
 * @lba:  64-bit LBA field.
 * @size: 32-bit size field.
 * @data: trailing bytes; declared as a C99 flexible array member rather
 *        than the GNU zero-length array extension.  sizeof() and the field
 *        offsets are unchanged.
 */
typedef struct VmdkGrainMarker {
    uint64_t lba;
    uint32_t size;
    uint8_t  data[];
} QEMU_PACKED VmdkGrainMarker;
|
2011-08-12 17:19:30 +02:00
|
|
|
|
2012-08-16 10:39:33 +02:00
|
|
|
/* Marker type codes; the numeric values are spelled out explicitly. */
enum {
    MARKER_END_OF_STREAM    = 0,
    MARKER_GRAIN_TABLE      = 1,
    MARKER_GRAIN_DIRECTORY  = 2,
    MARKER_FOOTER           = 3,
};
|
|
|
|
|
2004-08-01 23:59:26 +02:00
|
|
|
static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
|
|
|
|
{
|
|
|
|
uint32_t magic;
|
|
|
|
|
2011-07-12 13:56:38 +02:00
|
|
|
if (buf_size < 4) {
|
2004-08-01 23:59:26 +02:00
|
|
|
return 0;
|
2011-07-12 13:56:38 +02:00
|
|
|
}
|
2004-08-01 23:59:26 +02:00
|
|
|
magic = be32_to_cpu(*(uint32_t *)buf);
|
|
|
|
if (magic == VMDK3_MAGIC ||
|
2011-07-12 13:56:30 +02:00
|
|
|
magic == VMDK4_MAGIC) {
|
2004-08-01 23:59:26 +02:00
|
|
|
return 100;
|
2011-07-12 13:56:30 +02:00
|
|
|
} else {
|
|
|
|
const char *p = (const char *)buf;
|
|
|
|
const char *end = p + buf_size;
|
|
|
|
while (p < end) {
|
|
|
|
if (*p == '#') {
|
|
|
|
/* skip comment line */
|
|
|
|
while (p < end && *p != '\n') {
|
|
|
|
p++;
|
|
|
|
}
|
|
|
|
p++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (*p == ' ') {
|
|
|
|
while (p < end && *p == ' ') {
|
|
|
|
p++;
|
|
|
|
}
|
|
|
|
/* skip '\r' if windows line endings used. */
|
|
|
|
if (p < end && *p == '\r') {
|
|
|
|
p++;
|
|
|
|
}
|
|
|
|
/* only accept blank lines before 'version=' line */
|
|
|
|
if (p == end || *p != '\n') {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
p++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (end - p >= strlen("version=X\n")) {
|
|
|
|
if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 ||
|
2019-03-14 15:14:37 +01:00
|
|
|
strncmp("version=2\n", p, strlen("version=2\n")) == 0 ||
|
|
|
|
strncmp("version=3\n", p, strlen("version=3\n")) == 0) {
|
2011-07-12 13:56:30 +02:00
|
|
|
return 100;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (end - p >= strlen("version=X\r\n")) {
|
|
|
|
if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 ||
|
2019-03-14 15:14:37 +01:00
|
|
|
strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0 ||
|
|
|
|
strncmp("version=3\r\n", p, strlen("version=3\r\n")) == 0) {
|
2011-07-12 13:56:30 +02:00
|
|
|
return 100;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2004-08-01 23:59:26 +02:00
|
|
|
return 0;
|
2011-07-12 13:56:30 +02:00
|
|
|
}
|
2004-08-01 23:59:26 +02:00
|
|
|
}
|
|
|
|
|
2007-09-17 10:09:54 +02:00
|
|
|
/* Size constants, all in bytes */
#define SECTOR_SIZE 512
/* Descriptor area: 20 sectors of 512 bytes each */
#define DESC_SIZE (20 * SECTOR_SIZE)
#define BUF_SIZE 4096
/* Header: the first sector of 512 bytes */
#define HEADER_SIZE 512
|
2007-01-24 22:05:24 +01:00
|
|
|
|
2011-07-12 13:56:28 +02:00
|
|
|
/*
 * Release everything owned by the extents of @bs: the per-extent tables,
 * cache and type string, the child reference of every extent whose file is
 * not bs->file, and finally the extent array itself.
 */
static void vmdk_free_extents(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;
    int idx;

    for (idx = 0; idx < s->num_extents; idx++) {
        VmdkExtent *extent = &s->extents[idx];

        g_free(extent->l1_table);
        g_free(extent->l2_cache);
        g_free(extent->l1_backup_table);
        g_free(extent->type);

        /* An extent that shares bs->file is not unreferenced here */
        if (extent->file != bs->file) {
            bdrv_unref_child(bs, extent->file);
        }
    }

    g_free(s->extents);
}
|
|
|
|
|
2011-08-12 17:19:28 +02:00
|
|
|
/*
 * Drop the last entry of the extent array of @bs by shrinking the array by
 * one element.  Does nothing when there are no extents.  Only the array is
 * resized; nothing the removed entry points to is freed here.
 */
static void vmdk_free_last_extent(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;

    if (s->num_extents != 0) {
        s->num_extents -= 1;
        s->extents = g_renew(VmdkExtent, s->extents, s->num_extents);
    }
}
|
|
|
|
|
2017-07-09 19:06:14 +02:00
|
|
|
/* Return -ve errno, or 0 on success and write CID into *pcid. */
|
|
|
|
static int vmdk_read_cid(BlockDriverState *bs, int parent, uint32_t *pcid)
|
2004-08-01 23:59:26 +02:00
|
|
|
{
|
2016-03-08 09:24:35 +01:00
|
|
|
char *desc;
|
2017-07-09 19:06:14 +02:00
|
|
|
uint32_t cid;
|
2008-09-14 08:45:34 +02:00
|
|
|
const char *p_name, *cid_str;
|
2007-01-24 22:05:24 +01:00
|
|
|
size_t cid_str_size;
|
2011-07-12 13:56:32 +02:00
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2011-10-26 12:25:25 +02:00
|
|
|
int ret;
|
2007-01-24 22:05:24 +01:00
|
|
|
|
2016-03-08 09:24:35 +01:00
|
|
|
desc = g_malloc0(DESC_SIZE);
|
2016-06-20 18:24:02 +02:00
|
|
|
ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
|
2011-10-26 12:25:25 +02:00
|
|
|
if (ret < 0) {
|
2017-07-09 19:06:14 +02:00
|
|
|
goto out;
|
2011-07-12 13:56:32 +02:00
|
|
|
}
|
2007-01-24 22:05:24 +01:00
|
|
|
|
|
|
|
if (parent) {
|
|
|
|
cid_str = "parentCID";
|
|
|
|
cid_str_size = sizeof("parentCID");
|
|
|
|
} else {
|
|
|
|
cid_str = "CID";
|
|
|
|
cid_str_size = sizeof("CID");
|
|
|
|
}
|
|
|
|
|
2011-10-26 12:25:52 +02:00
|
|
|
desc[DESC_SIZE - 1] = '\0';
|
2011-07-12 13:56:38 +02:00
|
|
|
p_name = strstr(desc, cid_str);
|
2017-07-09 19:06:14 +02:00
|
|
|
if (p_name == NULL) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
2007-01-24 22:05:24 +01:00
|
|
|
}
|
2017-07-09 19:06:14 +02:00
|
|
|
p_name += cid_str_size;
|
|
|
|
if (sscanf(p_name, "%" SCNx32, &cid) != 1) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
*pcid = cid;
|
|
|
|
ret = 0;
|
2007-01-24 22:05:24 +01:00
|
|
|
|
2017-07-09 19:06:14 +02:00
|
|
|
out:
|
2016-03-08 09:24:35 +01:00
|
|
|
g_free(desc);
|
2017-07-09 19:06:14 +02:00
|
|
|
return ret;
|
2007-01-24 22:05:24 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Rewrite the value following the first "CID" key in the descriptor of @bs
 * with @cid (lower-case hexadecimal, followed by a newline) and write the
 * descriptor back synchronously.
 *
 * The descriptor text from "parentCID" onwards is saved first and appended
 * again right after the new CID line.
 *
 * Returns the negative errno of a failed read, -EINVAL when the descriptor
 * has no "parentCID" entry, otherwise the result of bdrv_pwrite_sync().
 */
static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
{
    BDRVVmdkState *s = bs->opaque;
    char *desc = g_malloc0(DESC_SIZE);
    char *parent_tail = g_malloc0(DESC_SIZE);
    char *parent_pos, *cid_pos;
    int ret;

    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        goto out;
    }

    desc[DESC_SIZE - 1] = '\0';
    parent_pos = strstr(desc, "parentCID");
    if (parent_pos == NULL) {
        ret = -EINVAL;
        goto out;
    }

    /* Keep a copy of everything starting at "parentCID" */
    pstrcpy(parent_tail, DESC_SIZE, parent_pos);

    cid_pos = strstr(desc, "CID");
    if (cid_pos != NULL) {
        /* sizeof counts the NUL, so this also steps over the separator */
        cid_pos += sizeof("CID");
        snprintf(cid_pos, DESC_SIZE - (cid_pos - desc), "%" PRIx32 "\n", cid);
        pstrcat(desc, DESC_SIZE, parent_tail);
    }

    ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE);

out:
    g_free(desc);
    g_free(parent_tail);
    return ret;
}
|
|
|
|
|
|
|
|
/*
 * Check that the parent CID recorded for @bs matches the CID of its
 * backing file.
 *
 * Returns 1 when the CID is valid, which includes the cases where the check
 * already passed earlier or where there is no backing file; the result is
 * then remembered in s->cid_checked.  Returns 0 when the backing node is not
 * a vmdk node, when its CID cannot be read, or when the CIDs differ.
 */
static int vmdk_is_cid_valid(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;
    BlockDriverState *backing_bs;
    uint32_t backing_cid;

    if (s->cid_checked || !bs->backing) {
        /* Nothing to compare against (any more) */
        s->cid_checked = true;
        return 1;
    }

    backing_bs = bs->backing->bs;

    /*
     * Backing file is not in vmdk format, so it does not have a CID,
     * which makes the overlay's parent CID invalid
     */
    if (strcmp(backing_bs->drv->format_name, "vmdk") != 0) {
        return 0;
    }

    /* read failure: report as not valid */
    if (vmdk_read_cid(backing_bs, 0, &backing_cid) != 0) {
        return 0;
    }

    /* CID not valid */
    if (s->parent_cid != backing_cid) {
        return 0;
    }

    /* CID valid */
    s->cid_checked = true;
    return 1;
}
|
|
|
|
|
2015-04-09 18:54:04 +02:00
|
|
|
/* We have nothing to do for VMDK reopen, stubs just return success */
|
2012-09-20 21:13:30 +02:00
|
|
|
static int vmdk_reopen_prepare(BDRVReopenState *state,
|
|
|
|
BlockReopenQueue *queue, Error **errp)
|
|
|
|
{
|
|
|
|
assert(state != NULL);
|
|
|
|
assert(state->bs != NULL);
|
2015-04-09 18:54:04 +02:00
|
|
|
return 0;
|
2012-09-20 21:13:30 +02:00
|
|
|
}
|
|
|
|
|
2010-04-16 21:07:19 +02:00
|
|
|
/*
 * Look for a parentFileNameHint="..." entry in the descriptor of @bs and,
 * if present, record the quoted name as the backing file.
 *
 * On a match the name is copied into bs->auto_backing_file and
 * bs->backing_file, and bs->backing_format is set to "vmdk".
 *
 * Returns 0 on success, also when the descriptor has no such entry.
 * Returns the negative errno of a failed descriptor read, or -EINVAL when
 * the entry is truncated, has no closing quote, or the name does not fit
 * into bs->auto_backing_file.
 */
static int vmdk_parent_open(BlockDriverState *bs)
{
    char *p_name;
    char *desc;
    BDRVVmdkState *s = bs->opaque;
    int ret;

    /* One extra zeroed byte keeps the descriptor NUL-terminated */
    desc = g_malloc0(DESC_SIZE + 1);
    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        goto out;
    }
    ret = 0;

    p_name = strstr(desc, "parentFileNameHint");
    if (p_name != NULL) {
        char *end_name;

        p_name += strlen("parentFileNameHint");
        /*
         * Two characters (the =" of a well-formed entry) are skipped after
         * the key.  Make sure both exist, so that the pointer cannot move
         * past the terminator and out of the buffer.
         */
        if (p_name[0] == '\0' || p_name[1] == '\0') {
            ret = -EINVAL;
            goto out;
        }
        p_name += 2;

        end_name = strchr(p_name, '\"');
        if (end_name == NULL) {
            ret = -EINVAL;
            goto out;
        }
        if ((end_name - p_name) > sizeof(bs->auto_backing_file) - 1) {
            ret = -EINVAL;
            goto out;
        }

        /* Size argument is name length + 1, so the closing quote is cut */
        pstrcpy(bs->auto_backing_file, end_name - p_name + 1, p_name);
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->auto_backing_file);
        pstrcpy(bs->backing_format, sizeof(bs->backing_format),
                "vmdk");
    }

out:
    g_free(desc);
    return ret;
}
|
|
|
|
|
2011-07-12 13:56:28 +02:00
|
|
|
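/* Create and append an extent to the extent array.  The int return value is
 * a status code, not the extent address: a negative errno value is returned
 * on failure (with *errp set when a parameter check fails).  The added
 * VmdkExtent is presumably handed back through *new_extent - confirm. */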
|
2013-08-06 09:44:51 +02:00
|
|
|
static int vmdk_add_extent(BlockDriverState *bs,
|
2015-06-15 13:50:20 +02:00
|
|
|
BdrvChild *file, bool flat, int64_t sectors,
|
2011-07-12 13:56:28 +02:00
|
|
|
int64_t l1_offset, int64_t l1_backup_offset,
|
|
|
|
uint32_t l1_size,
|
2013-08-06 09:44:51 +02:00
|
|
|
int l2_size, uint64_t cluster_sectors,
|
2013-10-11 09:43:22 +02:00
|
|
|
VmdkExtent **new_extent,
|
|
|
|
Error **errp)
|
2011-07-12 13:56:28 +02:00
|
|
|
{
|
|
|
|
VmdkExtent *extent;
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2014-08-21 14:36:19 +02:00
|
|
|
int64_t nb_sectors;
|
2011-07-12 13:56:28 +02:00
|
|
|
|
2013-08-06 09:44:51 +02:00
|
|
|
if (cluster_sectors > 0x200000) {
|
|
|
|
/* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
|
2013-10-11 09:43:22 +02:00
|
|
|
error_setg(errp, "Invalid granularity, image may be corrupt");
|
|
|
|
return -EFBIG;
|
2013-08-06 09:44:51 +02:00
|
|
|
}
|
2019-06-20 11:10:56 +02:00
|
|
|
if (l1_size > 32 * 1024 * 1024) {
|
2019-06-20 11:10:55 +02:00
|
|
|
/*
|
|
|
|
* Although with big capacity and small l1_entry_sectors, we can get a
|
2013-08-19 12:54:25 +02:00
|
|
|
* big l1_size, we don't want unbounded value to allocate the table.
|
2019-06-20 11:10:56 +02:00
|
|
|
* Limit it to 32M, which is enough to store:
|
|
|
|
* 8TB - for both VMDK3 & VMDK4 with
|
|
|
|
* minimal cluster size: 512B
|
|
|
|
* minimal L2 table size: 512 entries
|
|
|
|
* 8 TB is still more than the maximal value supported for
|
|
|
|
* VMDK3 & VMDK4 which is 2TB.
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit provides only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain is not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
* 64TB - for "ESXi seSparse Extent"
|
|
|
|
* minimal cluster size: 512B (default is 4KB)
|
|
|
|
* L2 table size: 4096 entries (const).
|
|
|
|
* 64TB is more than the maximal value supported for
|
|
|
|
* seSparse VMDKs (which is slightly less than 64TB)
|
2019-06-20 11:10:55 +02:00
|
|
|
*/
|
2013-10-11 09:43:22 +02:00
|
|
|
error_setg(errp, "L1 size too big");
|
2013-08-19 12:54:25 +02:00
|
|
|
return -EFBIG;
|
|
|
|
}
|
2013-08-06 09:44:51 +02:00
|
|
|
|
2015-06-15 13:50:20 +02:00
|
|
|
nb_sectors = bdrv_nb_sectors(file->bs);
|
2014-08-21 14:36:19 +02:00
|
|
|
if (nb_sectors < 0) {
|
|
|
|
return nb_sectors;
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benefit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
}
|
|
|
|
|
block: Use g_new() & friends where that makes obvious sense
g_new(T, n) is neater than g_malloc(sizeof(T) * n). It's also safer,
for two reasons. One, it catches multiplication overflowing size_t.
Two, it returns T * rather than void *, which lets the compiler catch
more type errors.
Patch created with Coccinelle, with two manual changes on top:
* Add const to bdrv_iterate_format() to keep the types straight
* Convert the allocation in bdrv_drop_intermediate(), which Coccinelle
inexplicably misses
Coccinelle semantic patch:
@@
type T;
@@
-g_malloc(sizeof(T))
+g_new(T, 1)
@@
type T;
@@
-g_try_malloc(sizeof(T))
+g_try_new(T, 1)
@@
type T;
@@
-g_malloc0(sizeof(T))
+g_new0(T, 1)
@@
type T;
@@
-g_try_malloc0(sizeof(T))
+g_try_new0(T, 1)
@@
type T;
expression n;
@@
-g_malloc(sizeof(T) * (n))
+g_new(T, n)
@@
type T;
expression n;
@@
-g_try_malloc(sizeof(T) * (n))
+g_try_new(T, n)
@@
type T;
expression n;
@@
-g_malloc0(sizeof(T) * (n))
+g_new0(T, n)
@@
type T;
expression n;
@@
-g_try_malloc0(sizeof(T) * (n))
+g_try_new0(T, n)
@@
type T;
expression p, n;
@@
-g_realloc(p, sizeof(T) * (n))
+g_renew(T, p, n)
@@
type T;
expression p, n;
@@
-g_try_realloc(p, sizeof(T) * (n))
+g_try_renew(T, p, n)
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2014-08-19 10:31:08 +02:00
|
|
|
s->extents = g_renew(VmdkExtent, s->extents, s->num_extents + 1);
|
2011-07-12 13:56:28 +02:00
|
|
|
extent = &s->extents[s->num_extents];
|
|
|
|
s->num_extents++;
|
|
|
|
|
|
|
|
memset(extent, 0, sizeof(VmdkExtent));
|
|
|
|
extent->file = file;
|
|
|
|
extent->flat = flat;
|
|
|
|
extent->sectors = sectors;
|
|
|
|
extent->l1_table_offset = l1_offset;
|
|
|
|
extent->l1_backup_table_offset = l1_backup_offset;
|
|
|
|
extent->l1_size = l1_size;
|
|
|
|
extent->l1_entry_sectors = l2_size * cluster_sectors;
|
|
|
|
extent->l2_size = l2_size;
|
2013-09-23 11:18:29 +02:00
|
|
|
extent->cluster_sectors = flat ? sectors : cluster_sectors;
|
2014-08-21 14:36:19 +02:00
|
|
|
extent->next_cluster_sector = ROUND_UP(nb_sectors, cluster_sectors);
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit provides only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain is not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
extent->entry_size = sizeof(uint32_t);
|
2011-07-12 13:56:28 +02:00
|
|
|
|
|
|
|
if (s->num_extents > 1) {
|
|
|
|
extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
|
|
|
|
} else {
|
|
|
|
extent->end_sector = extent->sectors;
|
|
|
|
}
|
|
|
|
bs->total_sectors = extent->end_sector;
|
2013-08-06 09:44:51 +02:00
|
|
|
if (new_extent) {
|
|
|
|
*new_extent = extent;
|
|
|
|
}
|
|
|
|
return 0;
|
2011-07-12 13:56:28 +02:00
|
|
|
}
|
|
|
|
|
2013-10-11 09:43:22 +02:00
|
|
|
static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
|
|
|
|
Error **errp)
|
2007-01-24 22:05:24 +01:00
|
|
|
{
|
2011-07-12 13:56:31 +02:00
|
|
|
int ret;
|
2015-05-05 11:28:13 +02:00
|
|
|
size_t l1_size;
|
|
|
|
int i;
|
2007-01-24 22:05:24 +01:00
|
|
|
|
2004-08-01 23:59:26 +02:00
|
|
|
/* read the L1 table */
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit proivdes only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain in not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
l1_size = extent->l1_size * extent->entry_size;
|
2014-05-20 13:56:27 +02:00
|
|
|
extent->l1_table = g_try_malloc(l1_size);
|
|
|
|
if (l1_size && extent->l1_table == NULL) {
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2016-06-20 18:24:02 +02:00
|
|
|
ret = bdrv_pread(extent->file,
|
2013-10-11 09:43:22 +02:00
|
|
|
extent->l1_table_offset,
|
|
|
|
extent->l1_table,
|
|
|
|
l1_size);
|
2011-07-12 13:56:31 +02:00
|
|
|
if (ret < 0) {
|
block: Use bdrv_refresh_filename() to pull
Before this patch, bdrv_refresh_filename() is used in a pushing manner:
Whenever the BDS graph is modified, the parents of the modified edges
are supposed to be updated (recursively upwards). However, that is
nonviable, considering that we want child changes not to concern
parents.
Also, in the long run we want a pull model anyway: Here, we would have a
bdrv_filename() function which returns a BDS's filename, freshly
constructed.
This patch is an intermediate step. It adds bdrv_refresh_filename()
calls before every place a BDS.filename value is used. The only
exceptions are protocol drivers that use their own filename, which
clearly would not profit from refreshing that filename before.
Also, bdrv_get_encrypted_filename() is removed along the way (as a user
of BDS.filename), since it is completely unused.
In turn, all of the calls to bdrv_refresh_filename() before this patch
are removed, because we no longer have to call this function on graph
changes.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190201192935.18394-2-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-02-01 20:29:05 +01:00
|
|
|
bdrv_refresh_filename(extent->file->bs);
|
2013-10-11 09:43:22 +02:00
|
|
|
error_setg_errno(errp, -ret,
|
|
|
|
"Could not read l1 table from extent '%s'",
|
2015-06-15 13:50:20 +02:00
|
|
|
extent->file->bs->filename);
|
2011-07-12 13:56:31 +02:00
|
|
|
goto fail_l1;
|
2011-07-12 13:56:28 +02:00
|
|
|
}
|
|
|
|
for (i = 0; i < extent->l1_size; i++) {
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit proivdes only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain in not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
if (extent->entry_size == sizeof(uint64_t)) {
|
|
|
|
le64_to_cpus((uint64_t *)extent->l1_table + i);
|
|
|
|
} else {
|
|
|
|
assert(extent->entry_size == sizeof(uint32_t));
|
|
|
|
le32_to_cpus((uint32_t *)extent->l1_table + i);
|
|
|
|
}
|
2004-08-01 23:59:26 +02:00
|
|
|
}
|
|
|
|
|
2011-07-12 13:56:28 +02:00
|
|
|
if (extent->l1_backup_table_offset) {
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit proivdes only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain in not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
assert(!extent->sesparse);
|
2014-05-20 13:56:27 +02:00
|
|
|
extent->l1_backup_table = g_try_malloc(l1_size);
|
|
|
|
if (l1_size && extent->l1_backup_table == NULL) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto fail_l1;
|
|
|
|
}
|
2016-06-20 18:24:02 +02:00
|
|
|
ret = bdrv_pread(extent->file,
|
2013-10-11 09:43:22 +02:00
|
|
|
extent->l1_backup_table_offset,
|
|
|
|
extent->l1_backup_table,
|
|
|
|
l1_size);
|
2011-07-12 13:56:31 +02:00
|
|
|
if (ret < 0) {
|
block: Use bdrv_refresh_filename() to pull
Before this patch, bdrv_refresh_filename() is used in a pushing manner:
Whenever the BDS graph is modified, the parents of the modified edges
are supposed to be updated (recursively upwards). However, that is
nonviable, considering that we want child changes not to concern
parents.
Also, in the long run we want a pull model anyway: Here, we would have a
bdrv_filename() function which returns a BDS's filename, freshly
constructed.
This patch is an intermediate step. It adds bdrv_refresh_filename()
calls before every place a BDS.filename value is used. The only
exceptions are protocol drivers that use their own filename, which
clearly would not profit from refreshing that filename before.
Also, bdrv_get_encrypted_filename() is removed along the way (as a user
of BDS.filename), since it is completely unused.
In turn, all of the calls to bdrv_refresh_filename() before this patch
are removed, because we no longer have to call this function on graph
changes.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190201192935.18394-2-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-02-01 20:29:05 +01:00
|
|
|
bdrv_refresh_filename(extent->file->bs);
|
2013-10-11 09:43:22 +02:00
|
|
|
error_setg_errno(errp, -ret,
|
|
|
|
"Could not read l1 backup table from extent '%s'",
|
2015-06-15 13:50:20 +02:00
|
|
|
extent->file->bs->filename);
|
2011-07-12 13:56:31 +02:00
|
|
|
goto fail_l1b;
|
2011-07-12 13:56:28 +02:00
|
|
|
}
|
|
|
|
for (i = 0; i < extent->l1_size; i++) {
|
|
|
|
le32_to_cpus(&extent->l1_backup_table[i]);
|
2005-04-26 23:08:00 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-07-12 13:56:28 +02:00
|
|
|
extent->l2_cache =
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit proivdes only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain in not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
g_malloc(extent->entry_size * extent->l2_size * L2_CACHE_SIZE);
|
2004-08-01 23:59:26 +02:00
|
|
|
return 0;
|
2011-07-12 13:56:31 +02:00
|
|
|
fail_l1b:
|
2011-08-21 05:09:37 +02:00
|
|
|
g_free(extent->l1_backup_table);
|
2011-07-12 13:56:31 +02:00
|
|
|
fail_l1:
|
2011-08-21 05:09:37 +02:00
|
|
|
g_free(extent->l1_table);
|
2011-07-12 13:56:31 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-08-19 12:54:27 +02:00
|
|
|
static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
|
2015-06-15 13:50:20 +02:00
|
|
|
BdrvChild *file,
|
2013-10-11 09:43:22 +02:00
|
|
|
int flags, Error **errp)
|
2011-07-12 13:56:31 +02:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
uint32_t magic;
|
|
|
|
VMDK3Header header;
|
|
|
|
VmdkExtent *extent;
|
|
|
|
|
2016-06-20 18:24:02 +02:00
|
|
|
ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
|
2011-07-12 13:56:31 +02:00
|
|
|
if (ret < 0) {
|
block: Use bdrv_refresh_filename() to pull
Before this patch, bdrv_refresh_filename() is used in a pushing manner:
Whenever the BDS graph is modified, the parents of the modified edges
are supposed to be updated (recursively upwards). However, that is
nonviable, considering that we want child changes not to concern
parents.
Also, in the long run we want a pull model anyway: Here, we would have a
bdrv_filename() function which returns a BDS's filename, freshly
constructed.
This patch is an intermediate step. It adds bdrv_refresh_filename()
calls before every place a BDS.filename value is used. The only
exceptions are protocol drivers that use their own filename, which
clearly would not profit from refreshing that filename before.
Also, bdrv_get_encrypted_filename() is removed along the way (as a user
of BDS.filename), since it is completely unused.
In turn, all of the calls to bdrv_refresh_filename() before this patch
are removed, because we no longer have to call this function on graph
changes.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190201192935.18394-2-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-02-01 20:29:05 +01:00
|
|
|
bdrv_refresh_filename(file->bs);
|
2013-10-11 09:43:22 +02:00
|
|
|
error_setg_errno(errp, -ret,
|
|
|
|
"Could not read header from file '%s'",
|
2015-06-15 13:50:20 +02:00
|
|
|
file->bs->filename);
|
2011-08-12 17:19:28 +02:00
|
|
|
return ret;
|
2011-07-12 13:56:31 +02:00
|
|
|
}
|
2013-08-19 12:54:26 +02:00
|
|
|
ret = vmdk_add_extent(bs, file, false,
|
|
|
|
le32_to_cpu(header.disk_sectors),
|
2015-04-27 16:23:01 +02:00
|
|
|
(int64_t)le32_to_cpu(header.l1dir_offset) << 9,
|
2013-08-19 12:54:26 +02:00
|
|
|
0,
|
|
|
|
le32_to_cpu(header.l1dir_size),
|
|
|
|
4096,
|
|
|
|
le32_to_cpu(header.granularity),
|
2013-10-11 09:43:22 +02:00
|
|
|
&extent,
|
|
|
|
errp);
|
2013-08-06 09:44:51 +02:00
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
2013-10-11 09:43:22 +02:00
|
|
|
ret = vmdk_init_tables(bs, extent, errp);
|
2011-07-12 13:56:31 +02:00
|
|
|
if (ret) {
|
2011-08-12 17:19:28 +02:00
|
|
|
/* free extent allocated by vmdk_add_extent */
|
|
|
|
vmdk_free_last_extent(bs);
|
2011-07-12 13:56:31 +02:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit provides only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain is not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
#define SESPARSE_CONST_HEADER_MAGIC UINT64_C(0x00000000cafebabe)
|
|
|
|
#define SESPARSE_VOLATILE_HEADER_MAGIC UINT64_C(0x00000000cafecafe)
|
|
|
|
|
|
|
|
/* Strict checks - format not officially documented */
|
|
|
|
static int check_se_sparse_const_header(VMDKSESparseConstHeader *header,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
header->magic = le64_to_cpu(header->magic);
|
|
|
|
header->version = le64_to_cpu(header->version);
|
|
|
|
header->grain_size = le64_to_cpu(header->grain_size);
|
|
|
|
header->grain_table_size = le64_to_cpu(header->grain_table_size);
|
|
|
|
header->flags = le64_to_cpu(header->flags);
|
|
|
|
header->reserved1 = le64_to_cpu(header->reserved1);
|
|
|
|
header->reserved2 = le64_to_cpu(header->reserved2);
|
|
|
|
header->reserved3 = le64_to_cpu(header->reserved3);
|
|
|
|
header->reserved4 = le64_to_cpu(header->reserved4);
|
|
|
|
|
|
|
|
header->volatile_header_offset =
|
|
|
|
le64_to_cpu(header->volatile_header_offset);
|
|
|
|
header->volatile_header_size = le64_to_cpu(header->volatile_header_size);
|
|
|
|
|
|
|
|
header->journal_header_offset = le64_to_cpu(header->journal_header_offset);
|
|
|
|
header->journal_header_size = le64_to_cpu(header->journal_header_size);
|
|
|
|
|
|
|
|
header->journal_offset = le64_to_cpu(header->journal_offset);
|
|
|
|
header->journal_size = le64_to_cpu(header->journal_size);
|
|
|
|
|
|
|
|
header->grain_dir_offset = le64_to_cpu(header->grain_dir_offset);
|
|
|
|
header->grain_dir_size = le64_to_cpu(header->grain_dir_size);
|
|
|
|
|
|
|
|
header->grain_tables_offset = le64_to_cpu(header->grain_tables_offset);
|
|
|
|
header->grain_tables_size = le64_to_cpu(header->grain_tables_size);
|
|
|
|
|
|
|
|
header->free_bitmap_offset = le64_to_cpu(header->free_bitmap_offset);
|
|
|
|
header->free_bitmap_size = le64_to_cpu(header->free_bitmap_size);
|
|
|
|
|
|
|
|
header->backmap_offset = le64_to_cpu(header->backmap_offset);
|
|
|
|
header->backmap_size = le64_to_cpu(header->backmap_size);
|
|
|
|
|
|
|
|
header->grains_offset = le64_to_cpu(header->grains_offset);
|
|
|
|
header->grains_size = le64_to_cpu(header->grains_size);
|
|
|
|
|
|
|
|
if (header->magic != SESPARSE_CONST_HEADER_MAGIC) {
|
|
|
|
error_setg(errp, "Bad const header magic: 0x%016" PRIx64,
|
|
|
|
header->magic);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (header->version != 0x0000000200000001) {
|
|
|
|
error_setg(errp, "Unsupported version: 0x%016" PRIx64,
|
|
|
|
header->version);
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (header->grain_size != 8) {
|
|
|
|
error_setg(errp, "Unsupported grain size: %" PRIu64,
|
|
|
|
header->grain_size);
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (header->grain_table_size != 64) {
|
|
|
|
error_setg(errp, "Unsupported grain table size: %" PRIu64,
|
|
|
|
header->grain_table_size);
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (header->flags != 0) {
|
|
|
|
error_setg(errp, "Unsupported flags: 0x%016" PRIx64,
|
|
|
|
header->flags);
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (header->reserved1 != 0 || header->reserved2 != 0 ||
|
|
|
|
header->reserved3 != 0 || header->reserved4 != 0) {
|
|
|
|
error_setg(errp, "Unsupported reserved bits:"
|
|
|
|
" 0x%016" PRIx64 " 0x%016" PRIx64
|
|
|
|
" 0x%016" PRIx64 " 0x%016" PRIx64,
|
|
|
|
header->reserved1, header->reserved2,
|
|
|
|
header->reserved3, header->reserved4);
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check that padding is 0 */
|
|
|
|
if (!buffer_is_zero(header->pad, sizeof(header->pad))) {
|
|
|
|
error_setg(errp, "Unsupported non-zero const header padding");
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int check_se_sparse_volatile_header(VMDKSESparseVolatileHeader *header,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
header->magic = le64_to_cpu(header->magic);
|
|
|
|
header->free_gt_number = le64_to_cpu(header->free_gt_number);
|
|
|
|
header->next_txn_seq_number = le64_to_cpu(header->next_txn_seq_number);
|
|
|
|
header->replay_journal = le64_to_cpu(header->replay_journal);
|
|
|
|
|
|
|
|
if (header->magic != SESPARSE_VOLATILE_HEADER_MAGIC) {
|
|
|
|
error_setg(errp, "Bad volatile header magic: 0x%016" PRIx64,
|
|
|
|
header->magic);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (header->replay_journal) {
|
|
|
|
error_setg(errp, "Image is dirty, Replaying journal not supported");
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check that padding is 0 */
|
|
|
|
if (!buffer_is_zero(header->pad, sizeof(header->pad))) {
|
|
|
|
error_setg(errp, "Unsupported non-zero volatile header padding");
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int vmdk_open_se_sparse(BlockDriverState *bs,
|
|
|
|
BdrvChild *file,
|
|
|
|
int flags, Error **errp)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
VMDKSESparseConstHeader const_header;
|
|
|
|
VMDKSESparseVolatileHeader volatile_header;
|
|
|
|
VmdkExtent *extent;
|
|
|
|
|
|
|
|
ret = bdrv_apply_auto_read_only(bs,
|
|
|
|
"No write support for seSparse images available", errp);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(sizeof(const_header) == SECTOR_SIZE);
|
|
|
|
|
|
|
|
ret = bdrv_pread(file, 0, &const_header, sizeof(const_header));
|
|
|
|
if (ret < 0) {
|
|
|
|
bdrv_refresh_filename(file->bs);
|
|
|
|
error_setg_errno(errp, -ret,
|
|
|
|
"Could not read const header from file '%s'",
|
|
|
|
file->bs->filename);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check const header */
|
|
|
|
ret = check_se_sparse_const_header(&const_header, errp);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(sizeof(volatile_header) == SECTOR_SIZE);
|
|
|
|
|
|
|
|
ret = bdrv_pread(file,
|
|
|
|
const_header.volatile_header_offset * SECTOR_SIZE,
|
|
|
|
&volatile_header, sizeof(volatile_header));
|
|
|
|
if (ret < 0) {
|
|
|
|
bdrv_refresh_filename(file->bs);
|
|
|
|
error_setg_errno(errp, -ret,
|
|
|
|
"Could not read volatile header from file '%s'",
|
|
|
|
file->bs->filename);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check volatile header */
|
|
|
|
ret = check_se_sparse_volatile_header(&volatile_header, errp);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = vmdk_add_extent(bs, file, false,
|
|
|
|
const_header.capacity,
|
|
|
|
const_header.grain_dir_offset * SECTOR_SIZE,
|
|
|
|
0,
|
|
|
|
const_header.grain_dir_size *
|
|
|
|
SECTOR_SIZE / sizeof(uint64_t),
|
|
|
|
const_header.grain_table_size *
|
|
|
|
SECTOR_SIZE / sizeof(uint64_t),
|
|
|
|
const_header.grain_size,
|
|
|
|
&extent,
|
|
|
|
errp);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
extent->sesparse = true;
|
|
|
|
extent->sesparse_l2_tables_offset = const_header.grain_tables_offset;
|
|
|
|
extent->sesparse_clusters_offset = const_header.grains_offset;
|
|
|
|
extent->entry_size = sizeof(uint64_t);
|
|
|
|
|
|
|
|
ret = vmdk_init_tables(bs, extent, errp);
|
|
|
|
if (ret) {
|
|
|
|
/* free extent allocated by vmdk_add_extent */
|
|
|
|
vmdk_free_last_extent(bs);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-02-17 14:44:03 +01:00
|
|
|
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
|
2015-04-07 15:35:59 +02:00
|
|
|
QDict *options, Error **errp);
|
2011-08-12 17:19:33 +02:00
|
|
|
|
2016-06-20 18:24:02 +02:00
|
|
|
/*
 * Read descriptor text from @file, starting at byte @desc_offset.
 *
 * At most (1 MiB - 1) bytes are read; the result is NUL-terminated.
 *
 * Returns a buffer allocated with g_malloc() that the caller must release
 * with g_free(), or NULL with @errp set if the file length cannot be
 * determined, the file is shorter than 4 bytes, or the read fails.
 */
static char *vmdk_read_desc(BdrvChild *file, uint64_t desc_offset, Error **errp)
{
    const int64_t max_len = (1 << 20) - 1;
    int64_t len;
    char *text;
    int nread;

    len = bdrv_getlength(file->bs);
    if (len < 0) {
        error_setg_errno(errp, -len, "Could not access file");
        return NULL;
    }

    if (len < 4) {
        /*
         * Neither a descriptor file nor a sparse image can be this small,
         * and callers compare the first 4 bytes against VMDK4_MAGIC, so
         * refuse anything shorter.
         */
        error_setg(errp, "File is too small, not a valid image");
        return NULL;
    }

    /* Cap the read length so the allocation below stays bounded. */
    if (len > max_len) {
        len = max_len;
    }
    text = g_malloc(len + 1);

    nread = bdrv_pread(file, desc_offset, text, len);
    if (nread < 0) {
        error_setg_errno(errp, -nread, "Could not read from file");
        g_free(text);
        return NULL;
    }

    /* Terminate the text at the index returned by bdrv_pread(). */
    text[nread] = 0;

    return text;
}
|
|
|
|
|
2011-08-12 17:19:28 +02:00
|
|
|
static int vmdk_open_vmdk4(BlockDriverState *bs,
|
2015-06-15 13:50:20 +02:00
|
|
|
BdrvChild *file,
|
2015-04-07 15:35:59 +02:00
|
|
|
int flags, QDict *options, Error **errp)
|
2011-07-12 13:56:31 +02:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
uint32_t magic;
|
|
|
|
uint32_t l1_size, l1_entry_sectors;
|
|
|
|
VMDK4Header header;
|
|
|
|
VmdkExtent *extent;
|
2013-10-31 03:06:23 +01:00
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2011-08-12 17:19:34 +02:00
|
|
|
int64_t l1_backup_offset = 0;
|
2016-01-25 03:26:23 +01:00
|
|
|
bool compressed;
|
2011-07-12 13:56:31 +02:00
|
|
|
|
2016-06-20 18:24:02 +02:00
|
|
|
ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
|
2011-07-12 13:56:31 +02:00
|
|
|
if (ret < 0) {
|
block: Use bdrv_refresh_filename() to pull
Before this patch, bdrv_refresh_filename() is used in a pushing manner:
Whenever the BDS graph is modified, the parents of the modified edges
are supposed to be updated (recursively upwards). However, that is
nonviable, considering that we want child changes not to concern
parents.
Also, in the long run we want a pull model anyway: Here, we would have a
bdrv_filename() function which returns a BDS's filename, freshly
constructed.
This patch is an intermediate step. It adds bdrv_refresh_filename()
calls before every place a BDS.filename value is used. The only
exceptions are protocol drivers that use their own filename, which
clearly would not profit from refreshing that filename before.
Also, bdrv_get_encrypted_filename() is removed along the way (as a user
of BDS.filename), since it is completely unused.
In turn, all of the calls to bdrv_refresh_filename() before this patch
are removed, because we no longer have to call this function on graph
changes.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190201192935.18394-2-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-02-01 20:29:05 +01:00
|
|
|
bdrv_refresh_filename(file->bs);
|
2013-10-11 09:43:22 +02:00
|
|
|
error_setg_errno(errp, -ret,
|
|
|
|
"Could not read header from file '%s'",
|
2015-06-15 13:50:20 +02:00
|
|
|
file->bs->filename);
|
2014-02-17 14:44:05 +01:00
|
|
|
return -EINVAL;
|
2011-07-12 13:56:31 +02:00
|
|
|
}
|
2013-06-10 11:07:33 +02:00
|
|
|
if (header.capacity == 0) {
|
2013-08-06 09:44:48 +02:00
|
|
|
uint64_t desc_offset = le64_to_cpu(header.desc_offset);
|
2013-06-10 11:07:33 +02:00
|
|
|
if (desc_offset) {
|
2016-06-20 18:24:02 +02:00
|
|
|
char *buf = vmdk_read_desc(file, desc_offset << 9, errp);
|
2014-02-17 14:44:03 +01:00
|
|
|
if (!buf) {
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2015-04-07 15:35:59 +02:00
|
|
|
ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
|
2014-02-17 14:44:03 +01:00
|
|
|
g_free(buf);
|
|
|
|
return ret;
|
2013-06-10 11:07:33 +02:00
|
|
|
}
|
2011-08-12 17:19:33 +02:00
|
|
|
}
|
2012-08-16 10:39:33 +02:00
|
|
|
|
2013-10-31 03:06:23 +01:00
|
|
|
if (!s->create_type) {
|
|
|
|
s->create_type = g_strdup("monolithicSparse");
|
|
|
|
}
|
|
|
|
|
2012-08-16 10:39:33 +02:00
|
|
|
if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
|
|
|
|
/*
|
|
|
|
* The footer takes precedence over the header, so read it in. The
|
|
|
|
* footer starts at offset -1024 from the end: One sector for the
|
|
|
|
* footer, and another one for the end-of-stream marker.
|
|
|
|
*/
|
|
|
|
struct {
|
|
|
|
struct {
|
|
|
|
uint64_t val;
|
|
|
|
uint32_t size;
|
|
|
|
uint32_t type;
|
|
|
|
uint8_t pad[512 - 16];
|
|
|
|
} QEMU_PACKED footer_marker;
|
|
|
|
|
|
|
|
uint32_t magic;
|
|
|
|
VMDK4Header header;
|
|
|
|
uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
|
|
|
|
|
|
|
|
struct {
|
|
|
|
uint64_t val;
|
|
|
|
uint32_t size;
|
|
|
|
uint32_t type;
|
|
|
|
uint8_t pad[512 - 16];
|
|
|
|
} QEMU_PACKED eos_marker;
|
|
|
|
} QEMU_PACKED footer;
|
|
|
|
|
2016-06-20 18:24:02 +02:00
|
|
|
ret = bdrv_pread(file,
|
2015-06-16 14:19:22 +02:00
|
|
|
bs->file->bs->total_sectors * 512 - 1536,
|
2012-08-16 10:39:33 +02:00
|
|
|
&footer, sizeof(footer));
|
|
|
|
if (ret < 0) {
|
2014-12-04 00:28:34 +01:00
|
|
|
error_setg_errno(errp, -ret, "Failed to read footer");
|
2012-08-16 10:39:33 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Some sanity checks for the footer */
|
|
|
|
if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
|
|
|
|
le32_to_cpu(footer.footer_marker.size) != 0 ||
|
|
|
|
le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
|
|
|
|
le64_to_cpu(footer.eos_marker.val) != 0 ||
|
|
|
|
le32_to_cpu(footer.eos_marker.size) != 0 ||
|
|
|
|
le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
|
|
|
|
{
|
2014-12-04 00:28:34 +01:00
|
|
|
error_setg(errp, "Invalid footer");
|
2012-08-16 10:39:33 +02:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
header = footer.header;
|
|
|
|
}
|
|
|
|
|
2016-01-25 03:26:23 +01:00
|
|
|
compressed =
|
|
|
|
le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
|
2013-11-28 02:48:03 +01:00
|
|
|
if (le32_to_cpu(header.version) > 3) {
|
2016-03-16 19:54:33 +01:00
|
|
|
error_setg(errp, "Unsupported VMDK version %" PRIu32,
|
|
|
|
le32_to_cpu(header.version));
|
2013-06-13 05:21:29 +02:00
|
|
|
return -ENOTSUP;
|
2016-01-25 03:26:23 +01:00
|
|
|
} else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR) &&
|
|
|
|
!compressed) {
|
2013-11-28 02:48:03 +01:00
|
|
|
/* VMware KB 2064959 explains that version 3 added support for
|
|
|
|
* persistent changed block tracking (CBT), and backup software can
|
|
|
|
* read it as version=1 if it doesn't care about the changed area
|
|
|
|
* information. So we are safe to enable read only. */
|
|
|
|
error_setg(errp, "VMDK version 3 must be read only");
|
|
|
|
return -EINVAL;
|
2013-06-13 05:21:29 +02:00
|
|
|
}
|
|
|
|
|
2013-08-06 09:44:55 +02:00
|
|
|
if (le32_to_cpu(header.num_gtes_per_gt) > 512) {
|
2014-02-17 14:44:05 +01:00
|
|
|
error_setg(errp, "L2 table size too big");
|
2013-08-06 09:44:52 +02:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2013-08-06 09:44:55 +02:00
|
|
|
l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt)
|
2011-07-12 13:56:31 +02:00
|
|
|
* le64_to_cpu(header.granularity);
|
2012-02-25 14:01:42 +01:00
|
|
|
if (l1_entry_sectors == 0) {
|
2014-12-04 00:28:34 +01:00
|
|
|
error_setg(errp, "L1 entry size is invalid");
|
2011-08-12 17:19:28 +02:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
2011-07-12 13:56:31 +02:00
|
|
|
l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
|
|
|
|
/ l1_entry_sectors;
|
2011-08-12 17:19:34 +02:00
|
|
|
if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
|
|
|
|
l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
|
|
|
|
}
|
2015-06-15 13:50:20 +02:00
|
|
|
if (bdrv_nb_sectors(file->bs) < le64_to_cpu(header.grain_offset)) {
|
2014-04-17 05:34:37 +02:00
|
|
|
error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes",
|
|
|
|
(int64_t)(le64_to_cpu(header.grain_offset)
|
|
|
|
* BDRV_SECTOR_SIZE));
|
2014-01-21 08:07:43 +01:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2013-08-06 09:44:51 +02:00
|
|
|
ret = vmdk_add_extent(bs, file, false,
|
2011-07-12 13:56:31 +02:00
|
|
|
le64_to_cpu(header.capacity),
|
|
|
|
le64_to_cpu(header.gd_offset) << 9,
|
2011-08-12 17:19:34 +02:00
|
|
|
l1_backup_offset,
|
2011-07-12 13:56:31 +02:00
|
|
|
l1_size,
|
2013-08-06 09:44:55 +02:00
|
|
|
le32_to_cpu(header.num_gtes_per_gt),
|
2013-08-06 09:44:51 +02:00
|
|
|
le64_to_cpu(header.granularity),
|
2013-10-11 09:43:22 +02:00
|
|
|
&extent,
|
|
|
|
errp);
|
2013-08-06 09:44:51 +02:00
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
2011-08-12 17:19:30 +02:00
|
|
|
extent->compressed =
|
|
|
|
le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
|
2014-01-23 08:10:52 +01:00
|
|
|
if (extent->compressed) {
|
|
|
|
g_free(s->create_type);
|
|
|
|
s->create_type = g_strdup("streamOptimized");
|
|
|
|
}
|
2011-08-12 17:19:30 +02:00
|
|
|
extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
|
2013-05-02 04:25:23 +02:00
|
|
|
extent->version = le32_to_cpu(header.version);
|
|
|
|
extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN;
|
2013-10-11 09:43:22 +02:00
|
|
|
ret = vmdk_init_tables(bs, extent, errp);
|
2011-07-12 13:56:31 +02:00
|
|
|
if (ret) {
|
2011-08-12 17:19:28 +02:00
|
|
|
/* free extent allocated by vmdk_add_extent */
|
|
|
|
vmdk_free_last_extent(bs);
|
2011-07-12 13:56:31 +02:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-07-19 02:38:22 +02:00
|
|
|
/* find an option value out of descriptor file */
|
|
|
|
static int vmdk_parse_description(const char *desc, const char *opt_name,
|
|
|
|
char *buf, int buf_size)
|
|
|
|
{
|
|
|
|
char *opt_pos, *opt_end;
|
|
|
|
const char *end = desc + strlen(desc);
|
|
|
|
|
|
|
|
opt_pos = strstr(desc, opt_name);
|
|
|
|
if (!opt_pos) {
|
2013-05-02 04:25:22 +02:00
|
|
|
return VMDK_ERROR;
|
2011-07-19 02:38:22 +02:00
|
|
|
}
|
|
|
|
/* Skip "=\"" following opt_name */
|
|
|
|
opt_pos += strlen(opt_name) + 2;
|
|
|
|
if (opt_pos >= end) {
|
2013-05-02 04:25:22 +02:00
|
|
|
return VMDK_ERROR;
|
2011-07-19 02:38:22 +02:00
|
|
|
}
|
|
|
|
opt_end = opt_pos;
|
|
|
|
while (opt_end < end && *opt_end != '"') {
|
|
|
|
opt_end++;
|
|
|
|
}
|
|
|
|
if (opt_end == end || buf_size < opt_end - opt_pos + 1) {
|
2013-05-02 04:25:22 +02:00
|
|
|
return VMDK_ERROR;
|
2011-07-19 02:38:22 +02:00
|
|
|
}
|
|
|
|
pstrcpy(buf, opt_end - opt_pos + 1, opt_pos);
|
2013-05-02 04:25:22 +02:00
|
|
|
return VMDK_OK;
|
2011-07-19 02:38:22 +02:00
|
|
|
}
|
|
|
|
|
2011-08-12 17:19:28 +02:00
|
|
|
/* Open an extent file and append to bs array */
|
2015-06-15 13:50:20 +02:00
|
|
|
static int vmdk_open_sparse(BlockDriverState *bs, BdrvChild *file, int flags,
|
2015-04-07 15:35:59 +02:00
|
|
|
char *buf, QDict *options, Error **errp)
|
2011-08-12 17:19:28 +02:00
|
|
|
{
|
|
|
|
uint32_t magic;
|
|
|
|
|
2014-02-17 14:44:03 +01:00
|
|
|
magic = ldl_be_p(buf);
|
2011-08-12 17:19:28 +02:00
|
|
|
switch (magic) {
|
|
|
|
case VMDK3_MAGIC:
|
2013-10-11 09:43:22 +02:00
|
|
|
return vmdk_open_vmfs_sparse(bs, file, flags, errp);
|
2011-08-12 17:19:28 +02:00
|
|
|
break;
|
|
|
|
case VMDK4_MAGIC:
|
2015-04-07 15:35:59 +02:00
|
|
|
return vmdk_open_vmdk4(bs, file, flags, options, errp);
|
2011-08-12 17:19:28 +02:00
|
|
|
break;
|
|
|
|
default:
|
2014-02-17 14:44:06 +01:00
|
|
|
error_setg(errp, "Image not in VMDK format");
|
|
|
|
return -EINVAL;
|
2011-08-12 17:19:28 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-12-18 16:35:20 +01:00
|
|
|
/*
 * Return a pointer to the character following the first '\n' in @s, or a
 * pointer to the terminating NUL of @s if it contains no newline.
 */
static const char *next_line(const char *s)
{
    const char *newline = strchr(s, '\n');

    if (newline != NULL) {
        return newline + 1;
    }
    return s + strlen(s);
}
|
|
|
|
|
2011-07-19 02:38:22 +02:00
|
|
|
static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
|
2019-08-15 17:36:33 +02:00
|
|
|
QDict *options, Error **errp)
|
2011-07-19 02:38:22 +02:00
|
|
|
{
|
|
|
|
int ret;
|
2015-01-22 14:03:25 +01:00
|
|
|
int matches;
|
2011-07-19 02:38:22 +02:00
|
|
|
char access[11];
|
|
|
|
char type[11];
|
|
|
|
char fname[512];
|
2015-12-18 16:35:21 +01:00
|
|
|
const char *p, *np;
|
2011-07-19 02:38:22 +02:00
|
|
|
int64_t sectors = 0;
|
|
|
|
int64_t flat_offset;
|
2019-08-15 17:36:33 +02:00
|
|
|
char *desc_file_dir = NULL;
|
2015-01-22 14:03:26 +01:00
|
|
|
char *extent_path;
|
2015-06-15 13:50:20 +02:00
|
|
|
BdrvChild *extent_file;
|
2013-10-31 03:06:23 +01:00
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
VmdkExtent *extent;
|
2015-04-07 15:35:59 +02:00
|
|
|
char extent_opt_prefix[32];
|
2015-06-15 13:50:20 +02:00
|
|
|
Error *local_err = NULL;
|
2011-07-19 02:38:22 +02:00
|
|
|
|
2015-12-18 16:35:20 +01:00
|
|
|
for (p = desc; *p; p = next_line(p)) {
|
2014-12-04 00:28:30 +01:00
|
|
|
/* parse extent line in one of below formats:
|
|
|
|
*
|
2011-07-19 02:38:22 +02:00
|
|
|
* RW [size in sectors] FLAT "file-name.vmdk" OFFSET
|
|
|
|
* RW [size in sectors] SPARSE "file-name.vmdk"
|
2014-12-04 00:28:30 +01:00
|
|
|
* RW [size in sectors] VMFS "file-name.vmdk"
|
|
|
|
* RW [size in sectors] VMFSSPARSE "file-name.vmdk"
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit proivdes only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain in not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
* RW [size in sectors] SESPARSE "file-name.vmdk"
|
2011-07-19 02:38:22 +02:00
|
|
|
*/
|
|
|
|
flat_offset = -1;
|
2015-01-22 14:03:25 +01:00
|
|
|
matches = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64,
|
|
|
|
access, §ors, type, fname, &flat_offset);
|
|
|
|
if (matches < 4 || strcmp(access, "RW")) {
|
2015-12-18 16:35:20 +01:00
|
|
|
continue;
|
2011-07-19 02:38:22 +02:00
|
|
|
} else if (!strcmp(type, "FLAT")) {
|
2015-01-22 14:03:25 +01:00
|
|
|
if (matches != 5 || flat_offset < 0) {
|
2015-12-18 16:35:21 +01:00
|
|
|
goto invalid;
|
2011-07-19 02:38:22 +02:00
|
|
|
}
|
2013-10-18 09:07:33 +02:00
|
|
|
} else if (!strcmp(type, "VMFS")) {
|
2015-01-22 14:03:25 +01:00
|
|
|
if (matches == 4) {
|
2013-12-09 06:24:36 +01:00
|
|
|
flat_offset = 0;
|
|
|
|
} else {
|
2015-12-18 16:35:21 +01:00
|
|
|
goto invalid;
|
2013-12-09 06:24:36 +01:00
|
|
|
}
|
2015-01-22 14:03:25 +01:00
|
|
|
} else if (matches != 4) {
|
2015-12-18 16:35:21 +01:00
|
|
|
goto invalid;
|
2011-07-19 02:38:22 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if (sectors <= 0 ||
|
2013-08-19 12:54:27 +02:00
|
|
|
(strcmp(type, "FLAT") && strcmp(type, "SPARSE") &&
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit proivdes only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain in not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE") &&
|
|
|
|
strcmp(type, "SESPARSE")) ||
|
2011-07-19 02:38:22 +02:00
|
|
|
(strcmp(access, "RW"))) {
|
2015-12-18 16:35:20 +01:00
|
|
|
continue;
|
2011-07-19 02:38:22 +02:00
|
|
|
}
|
|
|
|
|
2019-08-15 17:36:33 +02:00
|
|
|
if (path_is_absolute(fname)) {
|
|
|
|
extent_path = g_strdup(fname);
|
|
|
|
} else {
|
|
|
|
if (!desc_file_dir) {
|
|
|
|
desc_file_dir = bdrv_dirname(bs->file->bs, errp);
|
|
|
|
if (!desc_file_dir) {
|
|
|
|
bdrv_refresh_filename(bs->file->bs);
|
|
|
|
error_prepend(errp, "Cannot use relative paths with VMDK "
|
|
|
|
"descriptor file '%s': ",
|
|
|
|
bs->file->bs->filename);
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
2014-12-03 14:57:22 +01:00
|
|
|
|
2019-08-15 17:36:33 +02:00
|
|
|
extent_path = g_strconcat(desc_file_dir, fname, NULL);
|
|
|
|
}
|
2015-04-07 15:35:59 +02:00
|
|
|
|
|
|
|
ret = snprintf(extent_opt_prefix, 32, "extents.%d", s->num_extents);
|
|
|
|
assert(ret < 32);
|
|
|
|
|
2015-06-15 13:50:20 +02:00
|
|
|
extent_file = bdrv_open_child(extent_path, options, extent_opt_prefix,
|
|
|
|
bs, &child_file, false, &local_err);
|
2015-01-22 14:03:26 +01:00
|
|
|
g_free(extent_path);
|
2015-06-15 13:50:20 +02:00
|
|
|
if (local_err) {
|
|
|
|
error_propagate(errp, local_err);
|
2019-08-15 17:36:33 +02:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
2011-08-12 17:19:28 +02:00
|
|
|
}
|
|
|
|
|
2011-07-19 02:38:22 +02:00
|
|
|
/* save to extents array */
|
2013-08-19 12:54:28 +02:00
|
|
|
if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) {
|
2011-07-19 02:38:22 +02:00
|
|
|
/* FLAT extent */
|
|
|
|
|
2013-08-06 09:44:51 +02:00
|
|
|
ret = vmdk_add_extent(bs, extent_file, true, sectors,
|
2013-10-11 09:43:22 +02:00
|
|
|
0, 0, 0, 0, 0, &extent, errp);
|
2013-08-06 09:44:51 +02:00
|
|
|
if (ret < 0) {
|
2015-06-15 13:50:20 +02:00
|
|
|
bdrv_unref_child(bs, extent_file);
|
2019-08-15 17:36:33 +02:00
|
|
|
goto out;
|
2013-08-06 09:44:51 +02:00
|
|
|
}
|
2011-08-12 17:19:33 +02:00
|
|
|
extent->flat_start_offset = flat_offset << 9;
|
2013-08-19 12:54:27 +02:00
|
|
|
} else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) {
|
|
|
|
/* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/
|
2016-06-20 18:24:02 +02:00
|
|
|
char *buf = vmdk_read_desc(extent_file, 0, errp);
|
2014-02-17 14:44:03 +01:00
|
|
|
if (!buf) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
} else {
|
2015-04-07 15:35:59 +02:00
|
|
|
ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf,
|
|
|
|
options, errp);
|
2014-02-17 14:44:03 +01:00
|
|
|
}
|
2014-09-04 22:04:43 +02:00
|
|
|
g_free(buf);
|
2011-08-12 17:19:28 +02:00
|
|
|
if (ret) {
|
2015-06-15 13:50:20 +02:00
|
|
|
bdrv_unref_child(bs, extent_file);
|
2019-08-15 17:36:33 +02:00
|
|
|
goto out;
|
2011-08-12 17:19:28 +02:00
|
|
|
}
|
2013-10-31 03:06:23 +01:00
|
|
|
extent = &s->extents[s->num_extents - 1];
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit proivdes only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain in not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
} else if (!strcmp(type, "SESPARSE")) {
|
|
|
|
ret = vmdk_open_se_sparse(bs, extent_file, bs->open_flags, errp);
|
|
|
|
if (ret) {
|
|
|
|
bdrv_unref_child(bs, extent_file);
|
2019-08-15 17:36:33 +02:00
|
|
|
goto out;
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit proivdes only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain in not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
}
|
|
|
|
extent = &s->extents[s->num_extents - 1];
|
2011-07-19 02:38:22 +02:00
|
|
|
} else {
|
2013-10-11 09:43:22 +02:00
|
|
|
error_setg(errp, "Unsupported extent type '%s'", type);
|
2015-06-15 13:50:20 +02:00
|
|
|
bdrv_unref_child(bs, extent_file);
|
2019-08-15 17:36:33 +02:00
|
|
|
ret = -ENOTSUP;
|
|
|
|
goto out;
|
2011-07-19 02:38:22 +02:00
|
|
|
}
|
2013-10-31 03:06:23 +01:00
|
|
|
extent->type = g_strdup(type);
|
2011-07-19 02:38:22 +02:00
|
|
|
}
|
2019-08-15 17:36:33 +02:00
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
2015-12-18 16:35:21 +01:00
|
|
|
|
|
|
|
invalid:
|
|
|
|
np = next_line(p);
|
|
|
|
assert(np != p);
|
|
|
|
if (np[-1] == '\n') {
|
|
|
|
np--;
|
|
|
|
}
|
|
|
|
error_setg(errp, "Invalid extent line: %.*s", (int)(np - p), p);
|
2019-08-15 17:36:33 +02:00
|
|
|
ret = -EINVAL;
|
|
|
|
|
|
|
|
out:
|
|
|
|
g_free(desc_file_dir);
|
|
|
|
return ret;
|
2011-07-19 02:38:22 +02:00
|
|
|
}
|
|
|
|
|
2014-02-17 14:44:03 +01:00
|
|
|
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
|
2015-04-07 15:35:59 +02:00
|
|
|
QDict *options, Error **errp)
|
2011-07-19 02:38:22 +02:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
char ct[128];
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
|
|
|
|
if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
|
2014-02-17 14:44:06 +01:00
|
|
|
error_setg(errp, "invalid VMDK image descriptor");
|
|
|
|
ret = -EINVAL;
|
2013-06-12 13:06:30 +02:00
|
|
|
goto exit;
|
2011-07-19 02:38:22 +02:00
|
|
|
}
|
2011-08-12 17:19:27 +02:00
|
|
|
if (strcmp(ct, "monolithicFlat") &&
|
2013-08-19 12:54:28 +02:00
|
|
|
strcmp(ct, "vmfs") &&
|
2013-08-19 12:54:27 +02:00
|
|
|
strcmp(ct, "vmfsSparse") &&
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit proivdes only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain in not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
strcmp(ct, "seSparse") &&
|
2011-08-12 17:19:28 +02:00
|
|
|
strcmp(ct, "twoGbMaxExtentSparse") &&
|
2011-08-12 17:19:27 +02:00
|
|
|
strcmp(ct, "twoGbMaxExtentFlat")) {
|
2013-10-11 09:43:22 +02:00
|
|
|
error_setg(errp, "Unsupported image type '%s'", ct);
|
2013-06-12 13:06:30 +02:00
|
|
|
ret = -ENOTSUP;
|
|
|
|
goto exit;
|
2011-07-19 02:38:22 +02:00
|
|
|
}
|
2013-10-31 03:06:23 +01:00
|
|
|
s->create_type = g_strdup(ct);
|
2011-07-19 02:38:22 +02:00
|
|
|
s->desc_offset = 0;
|
2019-08-15 17:36:33 +02:00
|
|
|
ret = vmdk_parse_extents(buf, bs, options, errp);
|
2013-06-12 13:06:30 +02:00
|
|
|
exit:
|
|
|
|
return ret;
|
2011-07-19 02:38:22 +02:00
|
|
|
}
|
|
|
|
|
2013-09-05 14:22:29 +02:00
|
|
|
static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
|
|
|
|
Error **errp)
|
2011-07-12 13:56:31 +02:00
|
|
|
{
|
2014-12-04 00:28:33 +01:00
|
|
|
char *buf;
|
2011-08-12 17:19:28 +02:00
|
|
|
int ret;
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2014-02-17 14:44:04 +01:00
|
|
|
uint32_t magic;
|
2017-01-16 12:31:53 +01:00
|
|
|
Error *local_err = NULL;
|
2011-07-12 13:56:31 +02:00
|
|
|
|
2016-12-16 18:52:37 +01:00
|
|
|
bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
|
|
|
|
false, errp);
|
|
|
|
if (!bs->file) {
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2016-06-20 18:24:02 +02:00
|
|
|
buf = vmdk_read_desc(bs->file, 0, errp);
|
2014-02-17 14:44:03 +01:00
|
|
|
if (!buf) {
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2014-02-17 14:44:04 +01:00
|
|
|
magic = ldl_be_p(buf);
|
|
|
|
switch (magic) {
|
|
|
|
case VMDK3_MAGIC:
|
|
|
|
case VMDK4_MAGIC:
|
2015-06-16 14:19:22 +02:00
|
|
|
ret = vmdk_open_sparse(bs, bs->file, flags, buf, options,
|
2015-06-15 13:50:20 +02:00
|
|
|
errp);
|
2014-02-17 14:44:04 +01:00
|
|
|
s->desc_offset = 0x200;
|
|
|
|
break;
|
|
|
|
default:
|
2015-04-07 15:35:59 +02:00
|
|
|
ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
|
2014-02-17 14:44:04 +01:00
|
|
|
break;
|
2011-07-12 13:56:31 +02:00
|
|
|
}
|
2014-02-17 14:44:04 +01:00
|
|
|
if (ret) {
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
vmdk: clean up open
Move vmdk_parent_open to vmdk_open. There's another path how
vmdk_parent_open can be reached:
vmdk_parse_extents() -> vmdk_open_sparse() -> vmdk_open_vmdk4() ->
vmdk_open_desc_file().
If that can happen, however, the code is bogus. vmdk_parent_open
reads from bs->file:
if (bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE) != DESC_SIZE) {
but it is always called with s->desc_offset == 0 and with the same
bs->file. So the data that vmdk_parent_open reads comes always from the
same place, and anyway there is only one place where it can write it,
namely bs->backing_file.
So, if it cannot happen, the patched code is okay.
It is also possible that the recursive call can happen, but only once. In
that case there would still be a bug in vmdk_open_desc_file setting
s->desc_offset = 0, but the patched code is okay.
Finally, in the case where multiple recursive calls can happen the code
would need to be rewritten anyway. It is likely that this would anyway
involve adding several parameters to vmdk_parent_open, and calling it from
vmdk_open_vmdk4.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2011-10-20 13:16:20 +02:00
|
|
|
/* try to open parent images, if exist */
|
|
|
|
ret = vmdk_parent_open(bs);
|
|
|
|
if (ret) {
|
|
|
|
goto fail;
|
|
|
|
}
|
2017-07-09 19:06:14 +02:00
|
|
|
ret = vmdk_read_cid(bs, 0, &s->cid);
|
|
|
|
if (ret) {
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
ret = vmdk_read_cid(bs, 1, &s->parent_cid);
|
|
|
|
if (ret) {
|
|
|
|
goto fail;
|
|
|
|
}
|
2011-10-20 13:16:21 +02:00
|
|
|
qemu_co_mutex_init(&s->lock);
|
2011-11-22 16:50:27 +01:00
|
|
|
|
|
|
|
/* Disable migration when VMDK images are used */
|
2015-04-08 11:29:19 +02:00
|
|
|
error_setg(&s->migration_blocker, "The vmdk format used by node '%s' "
|
|
|
|
"does not support live migration",
|
|
|
|
bdrv_get_device_or_node_name(bs));
|
2017-01-16 12:31:53 +01:00
|
|
|
ret = migrate_add_blocker(s->migration_blocker, &local_err);
|
|
|
|
if (local_err) {
|
|
|
|
error_propagate(errp, local_err);
|
|
|
|
error_free(s->migration_blocker);
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
2014-02-17 14:44:03 +01:00
|
|
|
g_free(buf);
|
2011-11-22 16:50:27 +01:00
|
|
|
return 0;
|
vmdk: clean up open
Move vmdk_parent_open to vmdk_open. There's another path how
vmdk_parent_open can be reached:
vmdk_parse_extents() -> vmdk_open_sparse() -> vmdk_open_vmdk4() ->
vmdk_open_desc_file().
If that can happen, however, the code is bogus. vmdk_parent_open
reads from bs->file:
if (bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE) != DESC_SIZE) {
but it is always called with s->desc_offset == 0 and with the same
bs->file. So the data that vmdk_parent_open reads comes always from the
same place, and anyway there is only one place where it can write it,
namely bs->backing_file.
So, if it cannot happen, the patched code is okay.
It is also possible that the recursive call can happen, but only once. In
that case there would still be a bug in vmdk_open_desc_file setting
s->desc_offset = 0, but the patched code is okay.
Finally, in the case where multiple recursive calls can happen the code
would need to be rewritten anyway. It is likely that this would anyway
involve adding several parameters to vmdk_parent_open, and calling it from
vmdk_open_vmdk4.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2011-10-20 13:16:20 +02:00
|
|
|
|
|
|
|
fail:
|
2014-02-17 14:44:03 +01:00
|
|
|
g_free(buf);
|
2013-10-31 03:06:23 +01:00
|
|
|
g_free(s->create_type);
|
|
|
|
s->create_type = NULL;
|
vmdk: clean up open
Move vmdk_parent_open to vmdk_open. There's another path how
vmdk_parent_open can be reached:
vmdk_parse_extents() -> vmdk_open_sparse() -> vmdk_open_vmdk4() ->
vmdk_open_desc_file().
If that can happen, however, the code is bogus. vmdk_parent_open
reads from bs->file:
if (bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE) != DESC_SIZE) {
but it is always called with s->desc_offset == 0 and with the same
bs->file. So the data that vmdk_parent_open reads comes always from the
same place, and anyway there is only one place where it can write it,
namely bs->backing_file.
So, if it cannot happen, the patched code is okay.
It is also possible that the recursive call can happen, but only once. In
that case there would still be a bug in vmdk_open_desc_file setting
s->desc_offset = 0, but the patched code is okay.
Finally, in the case where multiple recursive calls can happen the code
would need to be rewritten anyway. It is likely that this would anyway
involve adding several parameters to vmdk_parent_open, and calling it from
vmdk_open_vmdk4.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2011-10-20 13:16:20 +02:00
|
|
|
vmdk_free_extents(bs);
|
|
|
|
return ret;
|
2004-08-01 23:59:26 +02:00
|
|
|
}
|
|
|
|
|
2013-12-11 19:26:16 +01:00
|
|
|
|
2014-07-16 17:48:16 +02:00
|
|
|
/*
 * Refresh the block limits of @bs.
 *
 * For every non-flat extent, raise bs->bl.pwrite_zeroes_alignment to at
 * least that extent's cluster size in bytes.  Flat extents do not affect
 * the limit.  @errp is not used by this implementation.
 */
static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVVmdkState *s = bs->opaque;
    int i;

    for (i = 0; i < s->num_extents; i++) {
        if (!s->extents[i].flat) {
            bs->bl.pwrite_zeroes_alignment =
                MAX(bs->bl.pwrite_zeroes_alignment,
                    s->extents[i].cluster_sectors << BDRV_SECTOR_BITS);
        }
    }
}
|
|
|
|
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benefit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
/**
|
|
|
|
* get_whole_cluster
|
|
|
|
*
|
|
|
|
* Copy backing file's cluster that covers @sector_num, otherwise write zero,
|
|
|
|
* to the cluster at @cluster_sector_num.
|
|
|
|
*
|
|
|
|
* If @skip_start_sector < @skip_end_sector, the relative range
|
|
|
|
* [@skip_start_sector, @skip_end_sector) is not copied or written, and leave
|
|
|
|
* it for call to write user data in the request.
|
|
|
|
*/
|
2011-07-12 13:56:28 +02:00
|
|
|
static int get_whole_cluster(BlockDriverState *bs,
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
VmdkExtent *extent,
|
2016-04-26 13:39:11 +02:00
|
|
|
uint64_t cluster_offset,
|
|
|
|
uint64_t offset,
|
|
|
|
uint64_t skip_start_bytes,
|
|
|
|
uint64_t skip_end_bytes)
|
2007-01-24 22:05:24 +01:00
|
|
|
{
|
2013-08-06 09:44:54 +02:00
|
|
|
int ret = VMDK_OK;
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
int64_t cluster_bytes;
|
|
|
|
uint8_t *whole_grain;
|
|
|
|
|
|
|
|
/* For COW, align request sector_num to cluster start */
|
|
|
|
cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
|
2016-04-26 13:39:11 +02:00
|
|
|
offset = QEMU_ALIGN_DOWN(offset, cluster_bytes);
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
whole_grain = qemu_blockalign(bs, cluster_bytes);
|
2007-01-24 22:05:24 +01:00
|
|
|
|
2015-06-17 14:55:21 +02:00
|
|
|
if (!bs->backing) {
|
2016-04-26 13:39:11 +02:00
|
|
|
memset(whole_grain, 0, skip_start_bytes);
|
|
|
|
memset(whole_grain + skip_end_bytes, 0, cluster_bytes - skip_end_bytes);
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
}
|
|
|
|
|
2016-04-26 13:39:11 +02:00
|
|
|
assert(skip_end_bytes <= cluster_bytes);
|
2011-07-12 13:56:29 +02:00
|
|
|
/* we will be here if it's first write on non-exist grain(cluster).
|
|
|
|
* try to read from parent image, if exist */
|
2015-06-17 14:55:21 +02:00
|
|
|
if (bs->backing && !vmdk_is_cid_valid(bs)) {
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
ret = VMDK_ERROR;
|
|
|
|
goto exit;
|
|
|
|
}
|
2007-01-24 22:05:24 +01:00
|
|
|
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
/* Read backing data before skip range */
|
2016-04-26 13:39:11 +02:00
|
|
|
if (skip_start_bytes > 0) {
|
2015-06-17 14:55:21 +02:00
|
|
|
if (bs->backing) {
|
2017-11-23 03:08:19 +01:00
|
|
|
/* qcow2 emits this on bs->file instead of bs->backing */
|
|
|
|
BLKDBG_EVENT(extent->file, BLKDBG_COW_READ);
|
2016-06-20 18:24:02 +02:00
|
|
|
ret = bdrv_pread(bs->backing, offset, whole_grain,
|
2016-04-26 13:39:11 +02:00
|
|
|
skip_start_bytes);
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
if (ret < 0) {
|
|
|
|
ret = VMDK_ERROR;
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
}
|
2017-11-23 03:08:19 +01:00
|
|
|
BLKDBG_EVENT(extent->file, BLKDBG_COW_WRITE);
|
2016-06-20 20:09:15 +02:00
|
|
|
ret = bdrv_pwrite(extent->file, cluster_offset, whole_grain,
|
2016-04-26 13:39:11 +02:00
|
|
|
skip_start_bytes);
|
2010-04-16 19:28:14 +02:00
|
|
|
if (ret < 0) {
|
2013-08-06 09:44:54 +02:00
|
|
|
ret = VMDK_ERROR;
|
|
|
|
goto exit;
|
2010-04-16 19:28:14 +02:00
|
|
|
}
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
}
|
|
|
|
/* Read backing data after skip range */
|
2016-04-26 13:39:11 +02:00
|
|
|
if (skip_end_bytes < cluster_bytes) {
|
2015-06-17 14:55:21 +02:00
|
|
|
if (bs->backing) {
|
2017-11-23 03:08:19 +01:00
|
|
|
/* qcow2 emits this on bs->file instead of bs->backing */
|
|
|
|
BLKDBG_EVENT(extent->file, BLKDBG_COW_READ);
|
2016-06-20 18:24:02 +02:00
|
|
|
ret = bdrv_pread(bs->backing, offset + skip_end_bytes,
|
2016-04-26 13:39:11 +02:00
|
|
|
whole_grain + skip_end_bytes,
|
|
|
|
cluster_bytes - skip_end_bytes);
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
if (ret < 0) {
|
|
|
|
ret = VMDK_ERROR;
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
}
|
2017-11-23 03:08:19 +01:00
|
|
|
BLKDBG_EVENT(extent->file, BLKDBG_COW_WRITE);
|
2016-06-20 20:09:15 +02:00
|
|
|
ret = bdrv_pwrite(extent->file, cluster_offset + skip_end_bytes,
|
2016-04-26 13:39:11 +02:00
|
|
|
whole_grain + skip_end_bytes,
|
|
|
|
cluster_bytes - skip_end_bytes);
|
2010-04-16 19:28:14 +02:00
|
|
|
if (ret < 0) {
|
2013-08-06 09:44:54 +02:00
|
|
|
ret = VMDK_ERROR;
|
|
|
|
goto exit;
|
2007-06-18 17:01:30 +02:00
|
|
|
}
|
|
|
|
}
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
|
2016-04-26 13:39:11 +02:00
|
|
|
ret = VMDK_OK;
|
2013-08-06 09:44:54 +02:00
|
|
|
exit:
|
|
|
|
qemu_vfree(whole_grain);
|
|
|
|
return ret;
|
2007-06-18 17:01:30 +02:00
|
|
|
}
|
|
|
|
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benefit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
/**
 * vmdk_L2update
 *
 * Store @offset, converted to little-endian, in the L2 table entry selected
 * by @m_data->l2_offset and @m_data->l2_index of @extent->file.
 *
 * If the extent has a backup L1 table (l1_backup_table_offset is non-zero),
 * the same entry is also written to the backup L2 table; as a side effect
 * @m_data->l2_offset is then overwritten with the backup table's offset.
 * If @m_data->l2_cache_entry is set, the cached entry is refreshed with the
 * little-endian value as well.
 *
 * NOTE(review): @offset looks like the sector number of the newly allocated
 * cluster inside the extent file - confirm against the callers.
 *
 * Returns: VMDK_OK if every required table write succeeded.
 *          VMDK_ERROR if writing the L2 table or its backup failed.
 */
static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
                         uint32_t offset)
{
    /* on-disk representation of the entry; also what the cache stores */
    uint32_t le_entry = cpu_to_le32(offset);
    int64_t entry_pos;

    /* primary L2 table: l2_offset is scaled by 512 to get a byte position */
    BLKDBG_EVENT(extent->file, BLKDBG_L2_UPDATE);
    entry_pos = ((int64_t)m_data->l2_offset * 512)
                + (m_data->l2_index * sizeof(le_entry));
    if (bdrv_pwrite_sync(extent->file, entry_pos,
                         &le_entry, sizeof(le_entry)) < 0) {
        return VMDK_ERROR;
    }

    /* backup L2 table, only present when a backup L1 table exists */
    if (extent->l1_backup_table_offset != 0) {
        /* redirect the metadata to the backup table before writing it */
        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
        entry_pos = ((int64_t)m_data->l2_offset * 512)
                    + (m_data->l2_index * sizeof(le_entry));
        if (bdrv_pwrite_sync(extent->file, entry_pos,
                             &le_entry, sizeof(le_entry)) < 0) {
            return VMDK_ERROR;
        }
    }

    /* keep the in-memory L2 cache in step with what was just written */
    if (m_data->l2_cache_entry) {
        *m_data->l2_cache_entry = le_entry;
    }

    return VMDK_OK;
}
|
|
|
|
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benefit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
/**
|
|
|
|
* get_cluster_offset
|
|
|
|
*
|
|
|
|
* Look up cluster offset in extent file by sector number, and store in
|
|
|
|
* @cluster_offset.
|
|
|
|
*
|
|
|
|
* For flat extents, the start offset as parsed from the description file is
|
|
|
|
* returned.
|
|
|
|
*
|
|
|
|
* For sparse extents, look up in L1, L2 table. If allocate is true, return an
|
|
|
|
* offset for a new cluster and update L2 cache. If there is a backing file,
|
|
|
|
* COW is done before returning; otherwise, zeroes are written to the allocated
|
|
|
|
* cluster. Both COW and zero writing skip the byte range
|
|
|
|
* [@skip_start_bytes, @skip_end_bytes) passed in by caller, because caller
|
|
|
|
* has new data to write there.
|
|
|
|
*
|
|
|
|
* Returns: VMDK_OK if cluster exists and mapped in the image.
|
|
|
|
* VMDK_UNALLOC if cluster is not mapped and @allocate is false.
|
|
|
|
* VMDK_ERROR if failed.
|
|
|
|
*/
|
2011-07-12 13:56:35 +02:00
|
|
|
static int get_cluster_offset(BlockDriverState *bs,
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benefit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
VmdkExtent *extent,
|
|
|
|
VmdkMetaData *m_data,
|
|
|
|
uint64_t offset,
|
|
|
|
bool allocate,
|
|
|
|
uint64_t *cluster_offset,
|
2016-04-26 13:39:11 +02:00
|
|
|
uint64_t skip_start_bytes,
|
|
|
|
uint64_t skip_end_bytes)
|
2004-08-01 23:59:26 +02:00
|
|
|
{
|
|
|
|
unsigned int l1_index, l2_offset, l2_index;
|
|
|
|
int min_index, i, j;
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit provides only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain is not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
uint32_t min_count;
|
|
|
|
void *l2_table;
|
2013-05-02 04:25:23 +02:00
|
|
|
bool zeroed = false;
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benefit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
int64_t ret;
|
2014-09-23 03:56:21 +02:00
|
|
|
int64_t cluster_sector;
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit provides only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain is not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
unsigned int l2_size_bytes = extent->l2_size * extent->entry_size;
|
2007-06-18 17:01:30 +02:00
|
|
|
|
2011-07-12 13:56:38 +02:00
|
|
|
if (m_data) {
|
2007-06-18 17:01:30 +02:00
|
|
|
m_data->valid = 0;
|
2011-07-12 13:56:38 +02:00
|
|
|
}
|
2011-07-12 13:56:35 +02:00
|
|
|
if (extent->flat) {
|
2011-07-19 02:38:22 +02:00
|
|
|
*cluster_offset = extent->flat_start_offset;
|
2013-05-02 04:25:22 +02:00
|
|
|
return VMDK_OK;
|
2011-07-12 13:56:35 +02:00
|
|
|
}
|
2007-06-18 17:01:30 +02:00
|
|
|
|
2011-08-12 17:19:27 +02:00
|
|
|
offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
|
2011-07-12 13:56:28 +02:00
|
|
|
l1_index = (offset >> 9) / extent->l1_entry_sectors;
|
|
|
|
if (l1_index >= extent->l1_size) {
|
2013-05-02 04:25:22 +02:00
|
|
|
return VMDK_ERROR;
|
2011-07-12 13:56:28 +02:00
|
|
|
}
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit provides only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain is not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
if (extent->sesparse) {
|
|
|
|
uint64_t l2_offset_u64;
|
|
|
|
|
|
|
|
assert(extent->entry_size == sizeof(uint64_t));
|
|
|
|
|
|
|
|
l2_offset_u64 = ((uint64_t *)extent->l1_table)[l1_index];
|
|
|
|
if (l2_offset_u64 == 0) {
|
|
|
|
l2_offset = 0;
|
|
|
|
} else if ((l2_offset_u64 & 0xffffffff00000000) != 0x1000000000000000) {
|
|
|
|
/*
|
|
|
|
* Top most nibble is 0x1 if grain table is allocated.
|
|
|
|
* strict check - top most 4 bytes must be 0x10000000 since max
|
|
|
|
* supported size is 64TB for disk - so no more than 64TB / 16MB
|
|
|
|
* grain directories which is smaller than uint32,
|
|
|
|
* where 16MB is the only supported default grain table coverage.
|
|
|
|
*/
|
|
|
|
return VMDK_ERROR;
|
|
|
|
} else {
|
|
|
|
l2_offset_u64 = l2_offset_u64 & 0x00000000ffffffff;
|
|
|
|
l2_offset_u64 = extent->sesparse_l2_tables_offset +
|
|
|
|
l2_offset_u64 * l2_size_bytes / SECTOR_SIZE;
|
|
|
|
if (l2_offset_u64 > 0x00000000ffffffff) {
|
|
|
|
return VMDK_ERROR;
|
|
|
|
}
|
|
|
|
l2_offset = (unsigned int)(l2_offset_u64);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
assert(extent->entry_size == sizeof(uint32_t));
|
|
|
|
l2_offset = ((uint32_t *)extent->l1_table)[l1_index];
|
|
|
|
}
|
2011-07-12 13:56:28 +02:00
|
|
|
if (!l2_offset) {
|
2013-05-02 04:25:22 +02:00
|
|
|
return VMDK_UNALLOC;
|
2011-07-12 13:56:28 +02:00
|
|
|
}
|
2011-07-12 13:56:31 +02:00
|
|
|
for (i = 0; i < L2_CACHE_SIZE; i++) {
|
2011-07-12 13:56:28 +02:00
|
|
|
if (l2_offset == extent->l2_cache_offsets[i]) {
|
2004-08-01 23:59:26 +02:00
|
|
|
/* increment the hit count */
|
2011-07-12 13:56:28 +02:00
|
|
|
if (++extent->l2_cache_counts[i] == 0xffffffff) {
|
2011-07-12 13:56:31 +02:00
|
|
|
for (j = 0; j < L2_CACHE_SIZE; j++) {
|
2011-07-12 13:56:28 +02:00
|
|
|
extent->l2_cache_counts[j] >>= 1;
|
2004-08-01 23:59:26 +02:00
|
|
|
}
|
|
|
|
}
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit provides only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain is not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
l2_table = (char *)extent->l2_cache + (i * l2_size_bytes);
|
2004-08-01 23:59:26 +02:00
|
|
|
goto found;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* not found: load a new entry in the least used one */
|
|
|
|
min_index = 0;
|
|
|
|
min_count = 0xffffffff;
|
2011-07-12 13:56:31 +02:00
|
|
|
for (i = 0; i < L2_CACHE_SIZE; i++) {
|
2011-07-12 13:56:28 +02:00
|
|
|
if (extent->l2_cache_counts[i] < min_count) {
|
|
|
|
min_count = extent->l2_cache_counts[i];
|
2004-08-01 23:59:26 +02:00
|
|
|
min_index = i;
|
|
|
|
}
|
|
|
|
}
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit provides only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain is not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
l2_table = (char *)extent->l2_cache + (min_index * l2_size_bytes);
|
2017-11-23 03:08:19 +01:00
|
|
|
BLKDBG_EVENT(extent->file, BLKDBG_L2_LOAD);
|
2016-06-20 18:24:02 +02:00
|
|
|
if (bdrv_pread(extent->file,
|
2011-07-12 13:56:28 +02:00
|
|
|
(int64_t)l2_offset * 512,
|
|
|
|
l2_table,
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit provides only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain is not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
l2_size_bytes
|
|
|
|
) != l2_size_bytes) {
|
2013-05-02 04:25:22 +02:00
|
|
|
return VMDK_ERROR;
|
2011-07-12 13:56:28 +02:00
|
|
|
}
|
2007-01-24 22:05:24 +01:00
|
|
|
|
2011-07-12 13:56:28 +02:00
|
|
|
extent->l2_cache_offsets[min_index] = l2_offset;
|
|
|
|
extent->l2_cache_counts[min_index] = 1;
|
2004-08-01 23:59:26 +02:00
|
|
|
found:
|
2011-07-12 13:56:28 +02:00
|
|
|
l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
|
2007-06-18 17:01:30 +02:00
|
|
|
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit provides only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain is not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
if (extent->sesparse) {
|
|
|
|
cluster_sector = le64_to_cpu(((uint64_t *)l2_table)[l2_index]);
|
|
|
|
switch (cluster_sector & 0xf000000000000000) {
|
|
|
|
case 0x0000000000000000:
|
|
|
|
/* unallocated grain */
|
|
|
|
if (cluster_sector != 0) {
|
|
|
|
return VMDK_ERROR;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case 0x1000000000000000:
|
|
|
|
/* scsi-unmapped grain - fallthrough */
|
|
|
|
case 0x2000000000000000:
|
|
|
|
/* zero grain */
|
|
|
|
zeroed = true;
|
|
|
|
break;
|
|
|
|
case 0x3000000000000000:
|
|
|
|
/* allocated grain */
|
|
|
|
cluster_sector = (((cluster_sector & 0x0fff000000000000) >> 48) |
|
|
|
|
((cluster_sector & 0x0000ffffffffffff) << 12));
|
|
|
|
cluster_sector = extent->sesparse_clusters_offset +
|
|
|
|
cluster_sector * extent->cluster_sectors;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return VMDK_ERROR;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
cluster_sector = le32_to_cpu(((uint32_t *)l2_table)[l2_index]);
|
|
|
|
|
|
|
|
if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
|
|
|
|
zeroed = true;
|
|
|
|
}
|
2013-05-02 04:25:23 +02:00
|
|
|
}
|
|
|
|
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benefit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
if (!cluster_sector || zeroed) {
|
2011-07-12 13:56:35 +02:00
|
|
|
if (!allocate) {
|
2013-05-02 04:25:23 +02:00
|
|
|
return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
|
2011-07-12 13:56:35 +02:00
|
|
|
}
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit provides only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain is not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
assert(!extent->sesparse);
|
2010-04-16 21:07:19 +02:00
|
|
|
|
2018-03-22 14:33:37 +01:00
|
|
|
if (extent->next_cluster_sector >= VMDK_EXTENT_MAX_SECTORS) {
|
|
|
|
return VMDK_ERROR;
|
|
|
|
}
|
|
|
|
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benefit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
cluster_sector = extent->next_cluster_sector;
|
|
|
|
extent->next_cluster_sector += extent->cluster_sectors;
|
2007-06-18 17:01:30 +02:00
|
|
|
|
|
|
|
/* First of all we write grain itself, to avoid race condition
|
|
|
|
* that may corrupt the image.
|
|
|
|
* This problem may occur because of insufficient space on host disk
|
|
|
|
* or inappropriate VM shutdown.
|
|
|
|
*/
|
2016-04-26 13:39:11 +02:00
|
|
|
ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE,
|
|
|
|
offset, skip_start_bytes, skip_end_bytes);
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benefit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
if (ret) {
|
|
|
|
return ret;
|
2007-06-18 17:01:30 +02:00
|
|
|
}
|
2016-07-07 10:42:49 +02:00
|
|
|
if (m_data) {
|
|
|
|
m_data->valid = 1;
|
|
|
|
m_data->l1_index = l1_index;
|
|
|
|
m_data->l2_index = l2_index;
|
|
|
|
m_data->l2_offset = l2_offset;
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit provides only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain is not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
m_data->l2_cache_entry = ((uint32_t *)l2_table) + l2_index;
|
2016-07-07 10:42:49 +02:00
|
|
|
}
|
2005-04-26 23:08:00 +02:00
|
|
|
}
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benefit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
*cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
|
2013-05-02 04:25:22 +02:00
|
|
|
return VMDK_OK;
|
2004-08-01 23:59:26 +02:00
|
|
|
}
|
|
|
|
|
2011-07-12 13:56:28 +02:00
|
|
|
/*
 * Locate the extent that covers @sector_num.
 *
 * The scan starts at @start_hint, or at the first extent of @s when
 * @start_hint is NULL, and walks forward through the extent table.  The first
 * extent whose end_sector is greater than @sector_num is returned; NULL is
 * returned when the scan reaches the end of the table without a match.
 */
static VmdkExtent *find_extent(BDRVVmdkState *s,
                               int64_t sector_num, VmdkExtent *start_hint)
{
    VmdkExtent *const table_end = s->extents + s->num_extents;
    VmdkExtent *cur = start_hint ? start_hint : s->extents;

    for (; cur < table_end; cur++) {
        if (sector_num < cur->end_sector) {
            return cur;
        }
    }
    return NULL;
}
|
|
|
|
|
2016-04-25 17:14:48 +02:00
|
|
|
/*
 * Compute where byte offset @offset falls inside its cluster.
 *
 * The byte position at which @extent begins is derived from
 * (end_sector - sectors); @offset is rebased onto that position and the
 * result is reduced modulo the cluster size in bytes.
 */
static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent,
                                                   int64_t offset)
{
    uint64_t cluster_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE;
    uint64_t extent_start =
        (extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE;

    return (offset - extent_start) % cluster_bytes;
}
|
|
|
|
|
2018-02-13 21:26:58 +01:00
|
|
|
/*
 * Report the allocation status of the cluster containing byte @offset.
 *
 * The cluster lookup is done under s->lock.  Its result is translated as:
 *   VMDK_ERROR   -> -EIO
 *   VMDK_UNALLOC -> 0
 *   VMDK_ZEROED  -> BDRV_BLOCK_ZERO
 *   VMDK_OK      -> BDRV_BLOCK_DATA, with *file set to the extent's node;
 *                   for uncompressed extents *map is filled in and
 *                   BDRV_BLOCK_OFFSET_VALID is added (plus BDRV_BLOCK_RECURSE
 *                   for flat extents).
 * Any other lookup result is returned unchanged.
 *
 * *pnum is set to the number of bytes from @offset to the end of its cluster,
 * capped at @bytes.  It is written on every path except the early -EIO
 * return taken when no extent covers @offset.
 */
static int coroutine_fn vmdk_co_block_status(BlockDriverState *bs,
                                             bool want_zero,
                                             int64_t offset, int64_t bytes,
                                             int64_t *pnum, int64_t *map,
                                             BlockDriverState **file)
{
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = find_extent(s, offset >> BDRV_SECTOR_BITS, NULL);
    uint64_t cluster_offset;
    int64_t in_cluster, left_in_cluster, ret;

    if (extent == NULL) {
        return -EIO;
    }

    qemu_co_mutex_lock(&s->lock);
    ret = get_cluster_offset(bs, extent, NULL, offset, false, &cluster_offset,
                             0, 0);
    qemu_co_mutex_unlock(&s->lock);

    in_cluster = vmdk_find_offset_in_cluster(extent, offset);

    if (ret == VMDK_ERROR) {
        ret = -EIO;
    } else if (ret == VMDK_UNALLOC) {
        ret = 0;
    } else if (ret == VMDK_ZEROED) {
        ret = BDRV_BLOCK_ZERO;
    } else if (ret == VMDK_OK) {
        ret = BDRV_BLOCK_DATA;
        if (!extent->compressed) {
            /* A host offset is only reported for uncompressed clusters */
            ret |= BDRV_BLOCK_OFFSET_VALID;
            *map = cluster_offset + in_cluster;
            if (extent->flat) {
                ret |= BDRV_BLOCK_RECURSE;
            }
        }
        *file = extent->file->bs;
    }

    left_in_cluster = extent->cluster_sectors * BDRV_SECTOR_SIZE - in_cluster;
    *pnum = MIN(left_in_cluster, bytes);
    return ret;
}
|
|
|
|
|
2011-08-12 17:19:29 +02:00
|
|
|
/*
 * Write @n_bytes bytes taken from @qiov (starting at @qiov_offset) to
 * @extent's file at @cluster_offset + @offset_in_cluster.
 *
 * Uncompressed extents: the requested slice of @qiov is written as is.
 *
 * Compressed extents: the write must start at the beginning of a cluster and
 * cover either a whole cluster or a shorter run that ends exactly at the end
 * of the extent, and the extent must use grain markers; otherwise -EINVAL is
 * returned.  The data is deflated with zlib into a grain marker whose lba is
 * @offset expressed in sectors, and marker plus payload are written instead.
 *
 * After the write is issued, extent->next_cluster_sector is moved to the
 * sector following the written range: unconditionally for compressed extents,
 * and only forward for uncompressed ones.  This happens whether or not the
 * write succeeded.
 *
 * Returns 0 on success or a negative value on failure.
 */
static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
                             int64_t offset_in_cluster, QEMUIOVector *qiov,
                             uint64_t qiov_offset, uint64_t n_bytes,
                             uint64_t offset)
{
    VmdkGrainMarker *grain = NULL;
    QEMUIOVector out_qiov;
    int64_t file_offset, end_sector;
    uLongf zlen;
    int ret;

    if (!extent->compressed) {
        qemu_iovec_init(&out_qiov, qiov->niov);
        qemu_iovec_concat(&out_qiov, qiov, qiov_offset, n_bytes);

        BLKDBG_EVENT(extent->file, BLKDBG_WRITE_AIO);
    } else {
        const uint64_t cluster_bytes = extent->cluster_sectors * SECTOR_SIZE;
        void *plain;

        /*
         * Whole clusters only (a short tail ending at the extent end is
         * tolerated), and only on extents that carry grain markers.
         */
        if (offset_in_cluster != 0 ||
            n_bytes > cluster_bytes ||
            (n_bytes < cluster_bytes &&
             offset + n_bytes != extent->end_sector * SECTOR_SIZE) ||
            !extent->has_marker) {
            ret = -EINVAL;
            goto out;
        }

        /* Output buffer: marker header followed by two clusters of room */
        zlen = (extent->cluster_sectors << 9) * 2;
        grain = g_malloc(zlen + sizeof(VmdkGrainMarker));

        /* compress() wants contiguous input, so linearize the iovec slice */
        plain = g_malloc(n_bytes);
        qemu_iovec_to_buf(qiov, qiov_offset, plain, n_bytes);
        ret = compress(grain->data, &zlen, plain, n_bytes);
        g_free(plain);

        if (ret != Z_OK || zlen == 0) {
            ret = -EINVAL;
            goto out;
        }

        grain->lba = cpu_to_le64(offset >> BDRV_SECTOR_BITS);
        grain->size = cpu_to_le32(zlen);

        /* From here on the request is the marker plus deflated payload */
        n_bytes = zlen + sizeof(VmdkGrainMarker);
        qemu_iovec_init_buf(&out_qiov, grain, n_bytes);

        BLKDBG_EVENT(extent->file, BLKDBG_WRITE_COMPRESSED);
    }

    file_offset = cluster_offset + offset_in_cluster;
    ret = bdrv_co_pwritev(extent->file, file_offset, n_bytes,
                          &out_qiov, 0);

    end_sector = DIV_ROUND_UP(file_offset + n_bytes, BDRV_SECTOR_SIZE);
    if (extent->compressed || end_sector > extent->next_cluster_sector) {
        extent->next_cluster_sector = end_sector;
    }

    if (ret >= 0) {
        ret = 0;
    }

out:
    g_free(grain);
    /* out_qiov owns an allocation only on the uncompressed path */
    if (!extent->compressed) {
        qemu_iovec_destroy(&out_qiov);
    }
    return ret;
}
|
|
|
|
|
|
|
|
/*
 * Read @bytes bytes, starting @offset_in_cluster bytes into the cluster that
 * is stored at @cluster_offset in @extent's file, into the start of @qiov.
 *
 * Uncompressed extents are read straight into @qiov.
 *
 * For compressed extents, two clusters' worth of bytes are read from
 * @cluster_offset (marker plus deflated data may exceed one cluster), the
 * payload is inflated with zlib into a one-cluster buffer, and the requested
 * window is copied into @qiov.  When the extent has grain markers, the
 * payload starts after the marker header and its length comes from the
 * marker's size field.
 *
 * Returns 0 on success, the negative value reported by the underlying read on
 * I/O failure, or -EINVAL when the compressed grain is malformed (zero or
 * oversized payload length, inflate failure) or the requested window lies
 * outside the inflated data.
 */
static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
                            int64_t offset_in_cluster, QEMUIOVector *qiov,
                            int bytes)
{
    int ret;
    int cluster_bytes, buf_bytes;
    uint8_t *cluster_buf, *compressed_data;
    uint8_t *uncomp_buf;
    uint32_t data_len;
    int64_t payload_max;
    VmdkGrainMarker *marker;
    uLongf buf_len;

    if (!extent->compressed) {
        BLKDBG_EVENT(extent->file, BLKDBG_READ_AIO);
        ret = bdrv_co_preadv(extent->file,
                             cluster_offset + offset_in_cluster, bytes,
                             qiov, 0);
        return ret < 0 ? ret : 0;
    }

    cluster_bytes = extent->cluster_sectors * 512;
    /* Read two clusters in case GrainMarker + compressed data > one cluster */
    buf_bytes = cluster_bytes * 2;
    cluster_buf = g_malloc(buf_bytes);
    uncomp_buf = g_malloc(cluster_bytes);
    BLKDBG_EVENT(extent->file, BLKDBG_READ_COMPRESSED);
    ret = bdrv_pread(extent->file,
                     cluster_offset,
                     cluster_buf, buf_bytes);
    if (ret < 0) {
        goto out;
    }

    compressed_data = cluster_buf;
    buf_len = cluster_bytes;
    data_len = cluster_bytes;
    if (extent->has_marker) {
        marker = (VmdkGrainMarker *)cluster_buf;
        compressed_data = marker->data;
        data_len = le32_to_cpu(marker->size);
    }

    /*
     * data_len comes from the image and must not be trusted.  The payload
     * begins at compressed_data, which sits past the marker header when one
     * is present, so only the bytes of cluster_buf from there on may be
     * handed to uncompress().  Bounding by the full buffer size would let a
     * crafted size field make zlib read past the end of cluster_buf.
     */
    payload_max = buf_bytes - (compressed_data - cluster_buf);
    if (!data_len || (int64_t)data_len > payload_max) {
        ret = -EINVAL;
        goto out;
    }

    ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len);
    if (ret != Z_OK) {
        ret = -EINVAL;
        goto out;
    }

    /* buf_len now holds the inflated size; the window must lie inside it */
    if (offset_in_cluster < 0 ||
            offset_in_cluster + bytes > buf_len) {
        ret = -EINVAL;
        goto out;
    }
    qemu_iovec_from_buf(qiov, 0, uncomp_buf + offset_in_cluster, bytes);
    ret = 0;

 out:
    g_free(uncomp_buf);
    g_free(cluster_buf);
    return ret;
}
|
|
|
|
|
2016-04-25 17:34:41 +02:00
|
|
|
static int coroutine_fn
|
|
|
|
vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
|
|
|
|
QEMUIOVector *qiov, int flags)
|
2004-08-01 23:59:26 +02:00
|
|
|
{
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2011-07-12 13:56:28 +02:00
|
|
|
int ret;
|
2016-04-25 17:34:41 +02:00
|
|
|
uint64_t n_bytes, offset_in_cluster;
|
2011-07-12 13:56:28 +02:00
|
|
|
VmdkExtent *extent = NULL;
|
2016-04-25 17:34:41 +02:00
|
|
|
QEMUIOVector local_qiov;
|
2004-08-01 23:59:26 +02:00
|
|
|
uint64_t cluster_offset;
|
2016-04-25 17:34:41 +02:00
|
|
|
uint64_t bytes_done = 0;
|
2007-01-24 22:05:24 +01:00
|
|
|
|
2016-04-25 17:34:41 +02:00
|
|
|
qemu_iovec_init(&local_qiov, qiov->niov);
|
|
|
|
qemu_co_mutex_lock(&s->lock);
|
|
|
|
|
|
|
|
while (bytes > 0) {
|
|
|
|
extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
|
2011-07-12 13:56:28 +02:00
|
|
|
if (!extent) {
|
2016-04-25 17:34:41 +02:00
|
|
|
ret = -EIO;
|
|
|
|
goto fail;
|
2011-07-12 13:56:28 +02:00
|
|
|
}
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
ret = get_cluster_offset(bs, extent, NULL,
|
2016-04-25 17:34:41 +02:00
|
|
|
offset, false, &cluster_offset, 0, 0);
|
|
|
|
offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
|
|
|
|
|
|
|
|
n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
|
|
|
|
- offset_in_cluster);
|
|
|
|
|
2013-05-02 04:25:23 +02:00
|
|
|
if (ret != VMDK_OK) {
|
2011-07-12 13:56:35 +02:00
|
|
|
/* if not allocated, try to read from parent image, if exist */
|
2015-06-17 14:55:21 +02:00
|
|
|
if (bs->backing && ret != VMDK_ZEROED) {
|
2011-07-12 13:56:38 +02:00
|
|
|
if (!vmdk_is_cid_valid(bs)) {
|
2016-04-25 17:34:41 +02:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto fail;
|
2011-07-12 13:56:38 +02:00
|
|
|
}
|
2016-04-25 17:34:41 +02:00
|
|
|
|
|
|
|
qemu_iovec_reset(&local_qiov);
|
|
|
|
qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
|
|
|
|
|
2017-11-23 03:08:19 +01:00
|
|
|
/* qcow2 emits this on bs->file instead of bs->backing */
|
|
|
|
BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
|
2016-06-20 21:31:46 +02:00
|
|
|
ret = bdrv_co_preadv(bs->backing, offset, n_bytes,
|
2016-04-25 17:34:41 +02:00
|
|
|
&local_qiov, 0);
|
2011-07-12 13:56:38 +02:00
|
|
|
if (ret < 0) {
|
2016-04-25 17:34:41 +02:00
|
|
|
goto fail;
|
2011-07-12 13:56:38 +02:00
|
|
|
}
|
2007-01-24 22:05:24 +01:00
|
|
|
} else {
|
2016-04-25 17:34:41 +02:00
|
|
|
qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
|
2007-01-24 22:05:24 +01:00
|
|
|
}
|
2004-08-01 23:59:26 +02:00
|
|
|
} else {
|
2016-04-25 17:34:41 +02:00
|
|
|
qemu_iovec_reset(&local_qiov);
|
|
|
|
qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
|
|
|
|
|
|
|
|
ret = vmdk_read_extent(extent, cluster_offset, offset_in_cluster,
|
|
|
|
&local_qiov, n_bytes);
|
2011-08-12 17:19:29 +02:00
|
|
|
if (ret) {
|
2016-04-25 17:34:41 +02:00
|
|
|
goto fail;
|
2011-07-19 02:38:22 +02:00
|
|
|
}
|
2004-08-01 23:59:26 +02:00
|
|
|
}
|
2016-04-25 17:34:41 +02:00
|
|
|
bytes -= n_bytes;
|
|
|
|
offset += n_bytes;
|
|
|
|
bytes_done += n_bytes;
|
2004-08-01 23:59:26 +02:00
|
|
|
}
|
|
|
|
|
2016-04-25 17:34:41 +02:00
|
|
|
ret = 0;
|
|
|
|
fail:
|
2011-10-20 13:16:22 +02:00
|
|
|
qemu_co_mutex_unlock(&s->lock);
|
2016-04-25 17:34:41 +02:00
|
|
|
qemu_iovec_destroy(&local_qiov);
|
|
|
|
|
2011-10-20 13:16:22 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-05-02 04:25:27 +02:00
|
|
|
/**
|
|
|
|
* vmdk_write:
|
|
|
|
* @zeroed: buf is ignored (data is zero), use zeroed_grain GTE feature
|
2013-08-01 12:12:17 +02:00
|
|
|
* if possible, otherwise return -ENOTSUP.
|
|
|
|
* @zero_dry_run: used for zeroed == true only, don't update L2 table, just try
|
|
|
|
* with each cluster. By dry run we can find if the zero write
|
|
|
|
* is possible without modifying image data.
|
2013-05-02 04:25:27 +02:00
|
|
|
*
|
|
|
|
* Returns: error code with 0 for success.
|
|
|
|
*/
|
2016-04-26 13:39:11 +02:00
|
|
|
static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
|
|
|
|
uint64_t bytes, QEMUIOVector *qiov,
|
|
|
|
bool zeroed, bool zero_dry_run)
|
2004-08-01 23:59:26 +02:00
|
|
|
{
|
2005-04-26 23:08:00 +02:00
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2011-07-12 13:56:28 +02:00
|
|
|
VmdkExtent *extent = NULL;
|
2014-01-08 02:42:07 +01:00
|
|
|
int ret;
|
2016-04-26 13:39:11 +02:00
|
|
|
int64_t offset_in_cluster, n_bytes;
|
2005-04-26 23:08:00 +02:00
|
|
|
uint64_t cluster_offset;
|
2016-04-26 13:39:11 +02:00
|
|
|
uint64_t bytes_done = 0;
|
2011-07-12 13:56:28 +02:00
|
|
|
VmdkMetaData m_data;
|
2005-04-26 23:08:00 +02:00
|
|
|
|
2016-04-26 13:39:11 +02:00
|
|
|
if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) {
|
|
|
|
error_report("Wrong offset: offset=0x%" PRIx64
|
2015-12-18 16:35:19 +01:00
|
|
|
" total_sectors=0x%" PRIx64,
|
2016-04-26 13:39:11 +02:00
|
|
|
offset, bs->total_sectors);
|
2011-07-19 02:38:22 +02:00
|
|
|
return -EIO;
|
2007-06-18 17:01:30 +02:00
|
|
|
}
|
|
|
|
|
2016-04-26 13:39:11 +02:00
|
|
|
while (bytes > 0) {
|
|
|
|
extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
|
2011-07-12 13:56:28 +02:00
|
|
|
if (!extent) {
|
|
|
|
return -EIO;
|
|
|
|
}
|
vmdk: Add read-only support for seSparse snapshots
Until ESXi 6.5 VMware used the vmfsSparse format for snapshots (VMDK3 in
QEMU).
This format was lacking in the following:
* Grain directory (L1) and grain table (L2) entries were 32-bit,
allowing access to only 2TB (slightly less) of data.
* The grain size (default) was 512 bytes - leading to data
fragmentation and many grain tables.
* For space reclamation purposes, it was necessary to find all the
grains which are not pointed to by any grain table - so a reverse
mapping of "offset of grain in vmdk" to "grain table" must be
constructed - which takes large amounts of CPU/RAM.
The format specification can be found in VMware's documentation:
https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf
In ESXi 6.5, to support snapshot files larger than 2TB, a new format was
introduced: SESparse (Space Efficient).
This format fixes the above issues:
* All entries are now 64-bit.
* The grain size (default) is 4KB.
* Grain directory and grain tables are now located at the beginning
of the file.
+ seSparse format reserves space for all grain tables.
+ Grain tables can be addressed using an index.
+ Grains are located in the end of the file and can also be
addressed with an index.
- seSparse vmdks of large disks (64TB) have huge preallocated
headers - mainly due to L2 tables, even for empty snapshots.
* The header contains a reverse mapping ("backmap") of "offset of
grain in vmdk" to "grain table" and a bitmap ("free bitmap") which
specifies for each grain - whether it is allocated or not.
Using these data structures we can implement space reclamation
efficiently.
* Due to the fact that the header now maintains two mappings:
* The regular one (grain directory & grain tables)
* A reverse one (backmap and free bitmap)
These data structures can lose consistency upon crash and result
in a corrupted VMDK.
Therefore, a journal is also added to the VMDK and is replayed
when the VMware reopens the file after a crash.
Since ESXi 6.7 - SESparse is the only snapshot format available.
Unfortunately, VMware does not provide documentation regarding the new
seSparse format.
This commit is based on black-box research of the seSparse format.
Various in-guest block operations and their effect on the snapshot file
were tested.
The only VMware provided source of information (regarding the underlying
implementation) was a log file on the ESXi:
/var/log/hostd.log
Whenever an seSparse snapshot is created - the log is being populated
with seSparse records.
Relevant log records are of the form:
[...] Const Header:
[...] constMagic = 0xcafebabe
[...] version = 2.1
[...] capacity = 204800
[...] grainSize = 8
[...] grainTableSize = 64
[...] flags = 0
[...] Extents:
[...] Header : <1 : 1>
[...] JournalHdr : <2 : 2>
[...] Journal : <2048 : 2048>
[...] GrainDirectory : <4096 : 2048>
[...] GrainTables : <6144 : 2048>
[...] FreeBitmap : <8192 : 2048>
[...] BackMap : <10240 : 2048>
[...] Grain : <12288 : 204800>
[...] Volatile Header:
[...] volatileMagic = 0xcafecafe
[...] FreeGTNumber = 0
[...] nextTxnSeqNumber = 0
[...] replayJournal = 0
The sizes that are seen in the log file are in sectors.
Extents are of the following format: <offset : size>
This commit is a strict implementation which enforces:
* magics
* version number 2.1
* grain size of 8 sectors (4KB)
* grain table size of 64 sectors
* zero flags
* extent locations
Additionally, this commit proivdes only a subset of the functionality
offered by seSparse's format:
* Read-only
* No journal replay
* No space reclamation
* No unmap support
Hence, journal header, journal, free bitmap and backmap extents are
unused, only the "classic" (L1 -> L2 -> data) grain access is
implemented.
However there are several differences in the grain access itself.
Grain directory (L1):
* Grain directory entries are indexes (not offsets) to grain
tables.
* Valid grain directory entries have their highest nibble set to
0x1.
* Since grain tables are always located in the beginning of the
file - the index can fit into 32 bits - so we can use its low
part if it's valid.
Grain table (L2):
* Grain table entries are indexes (not offsets) to grains.
* If the highest nibble of the entry is:
0x0:
The grain in not allocated.
The rest of the bytes are 0.
0x1:
The grain is unmapped - guest sees a zero grain.
The rest of the bits point to the previously mapped grain,
see 0x3 case.
0x2:
The grain is zero.
0x3:
The grain is allocated - to get the index calculate:
((entry & 0x0fff000000000000) >> 48) |
((entry & 0x0000ffffffffffff) << 12)
* The difference between 0x1 and 0x2 is that 0x1 is an unallocated
grain which results from the guest using sg_unmap to unmap the
grain - but the grain itself still exists in the grain extent - a
space reclamation procedure should delete it.
Unmapping a zero grain has no effect (0x2 will not change to 0x1)
but unmapping an unallocated grain will (0x0 to 0x1) - naturally.
In order to implement seSparse some fields had to be changed to support
both 32-bit and 64-bit entry sizes.
Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
Reviewed-by: Eyal Moscovici <eyal.moscovici@oracle.com>
Reviewed-by: Arbel Moshe <arbel.moshe@oracle.com>
Signed-off-by: Sam Eiderman <shmuel.eiderman@oracle.com>
Message-id: 20190620091057.47441-4-shmuel.eiderman@oracle.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-06-20 11:10:57 +02:00
|
|
|
if (extent->sesparse) {
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
2016-04-26 13:39:11 +02:00
|
|
|
offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
|
|
|
|
n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
|
|
|
|
- offset_in_cluster);
|
|
|
|
|
|
|
|
ret = get_cluster_offset(bs, extent, &m_data, offset,
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
!(extent->compressed || zeroed),
|
2016-04-26 13:39:11 +02:00
|
|
|
&cluster_offset, offset_in_cluster,
|
|
|
|
offset_in_cluster + n_bytes);
|
2011-08-12 17:19:31 +02:00
|
|
|
if (extent->compressed) {
|
2013-05-02 04:25:22 +02:00
|
|
|
if (ret == VMDK_OK) {
|
2011-08-12 17:19:31 +02:00
|
|
|
/* Refuse write to allocated cluster for streamOptimized */
|
2013-10-11 09:43:22 +02:00
|
|
|
error_report("Could not write to allocated cluster"
|
|
|
|
" for streamOptimized");
|
2011-08-12 17:19:31 +02:00
|
|
|
return -EIO;
|
|
|
|
} else {
|
|
|
|
/* allocate */
|
2016-04-26 13:39:11 +02:00
|
|
|
ret = get_cluster_offset(bs, extent, &m_data, offset,
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
true, &cluster_offset, 0, 0);
|
2011-08-12 17:19:31 +02:00
|
|
|
}
|
|
|
|
}
|
2013-05-02 04:25:27 +02:00
|
|
|
if (ret == VMDK_ERROR) {
|
2011-07-12 13:56:35 +02:00
|
|
|
return -EINVAL;
|
2011-07-12 13:56:28 +02:00
|
|
|
}
|
2013-05-02 04:25:27 +02:00
|
|
|
if (zeroed) {
|
|
|
|
/* Do zeroed write, buf is ignored */
|
|
|
|
if (extent->has_zero_grain &&
|
2016-04-26 13:39:11 +02:00
|
|
|
offset_in_cluster == 0 &&
|
|
|
|
n_bytes >= extent->cluster_sectors * BDRV_SECTOR_SIZE) {
|
|
|
|
n_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE;
|
2013-05-02 04:25:27 +02:00
|
|
|
if (!zero_dry_run) {
|
|
|
|
/* update L2 tables */
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
|
|
|
|
!= VMDK_OK) {
|
2013-05-02 04:25:27 +02:00
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
} else {
|
2016-04-26 13:39:11 +02:00
|
|
|
ret = vmdk_write_extent(extent, cluster_offset, offset_in_cluster,
|
|
|
|
qiov, bytes_done, n_bytes, offset);
|
2013-05-02 04:25:27 +02:00
|
|
|
if (ret) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
if (m_data.valid) {
|
|
|
|
/* update L2 tables */
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 08:39:10 +02:00
|
|
|
if (vmdk_L2update(extent, &m_data,
|
|
|
|
cluster_offset >> BDRV_SECTOR_BITS)
|
|
|
|
!= VMDK_OK) {
|
2013-05-02 04:25:27 +02:00
|
|
|
return -EIO;
|
|
|
|
}
|
2011-07-12 13:56:28 +02:00
|
|
|
}
|
2007-06-18 17:01:30 +02:00
|
|
|
}
|
2016-04-26 13:39:11 +02:00
|
|
|
bytes -= n_bytes;
|
|
|
|
offset += n_bytes;
|
|
|
|
bytes_done += n_bytes;
|
2007-01-24 22:05:24 +01:00
|
|
|
|
2011-07-12 13:56:38 +02:00
|
|
|
/* update CID on the first write every time the virtual disk is
|
|
|
|
* opened */
|
2011-07-12 13:56:34 +02:00
|
|
|
if (!s->cid_updated) {
|
2014-12-04 00:28:29 +01:00
|
|
|
ret = vmdk_write_cid(bs, g_random_int());
|
2011-10-26 12:25:25 +02:00
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
2011-07-12 13:56:34 +02:00
|
|
|
s->cid_updated = true;
|
2007-01-24 22:05:24 +01:00
|
|
|
}
|
2005-04-26 23:08:00 +02:00
|
|
|
}
|
|
|
|
return 0;
|
2004-08-01 23:59:26 +02:00
|
|
|
}
|
|
|
|
|
2016-04-26 13:39:11 +02:00
|
|
|
static int coroutine_fn
|
|
|
|
vmdk_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
|
|
|
|
QEMUIOVector *qiov, int flags)
|
2011-10-20 13:16:23 +02:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
qemu_co_mutex_lock(&s->lock);
|
2016-04-26 13:39:11 +02:00
|
|
|
ret = vmdk_pwritev(bs, offset, bytes, qiov, false, false);
|
2013-05-02 04:25:27 +02:00
|
|
|
qemu_co_mutex_unlock(&s->lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-07-22 10:17:45 +02:00
|
|
|
static int coroutine_fn
|
|
|
|
vmdk_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
|
|
|
|
uint64_t bytes, QEMUIOVector *qiov)
|
2014-05-06 15:08:44 +02:00
|
|
|
{
|
2018-09-13 10:29:52 +02:00
|
|
|
if (bytes == 0) {
|
|
|
|
/* The caller will write bytes 0 to signal EOF.
|
|
|
|
* When receive it, we align EOF to a sector boundary. */
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
int i, ret;
|
|
|
|
int64_t length;
|
|
|
|
|
|
|
|
for (i = 0; i < s->num_extents; i++) {
|
|
|
|
length = bdrv_getlength(s->extents[i].file->bs);
|
|
|
|
if (length < 0) {
|
|
|
|
return length;
|
|
|
|
}
|
|
|
|
length = QEMU_ALIGN_UP(length, BDRV_SECTOR_SIZE);
|
2019-09-18 11:51:40 +02:00
|
|
|
ret = bdrv_truncate(s->extents[i].file, length, false,
|
2018-09-13 10:29:52 +02:00
|
|
|
PREALLOC_MODE_OFF, NULL);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2016-07-22 10:17:45 +02:00
|
|
|
return vmdk_co_pwritev(bs, offset, bytes, qiov, 0);
|
2014-05-06 15:08:44 +02:00
|
|
|
}
|
|
|
|
|
2016-06-01 23:10:12 +02:00
|
|
|
/*
 * Write zeroes using the zeroed-grain feature.
 *
 * The request is first run through vmdk_pwritev() as a dry run; only when
 * that succeeds is it repeated with L2 table updates enabled. Both passes
 * happen under the driver state lock. @flags is not consulted.
 *
 * Returns 0 on success or the error code of the failing pass.
 */
static int coroutine_fn vmdk_co_pwrite_zeroes(BlockDriverState *bs,
                                              int64_t offset,
                                              int bytes,
                                              BdrvRequestFlags flags)
{
    BDRVVmdkState *state = bs->opaque;
    int rc;

    qemu_co_mutex_lock(&state->lock);

    /*
     * A zero write can fail if the range is not aligned to clusters, so
     * probe with dry_run == true before really updating the image.
     */
    rc = vmdk_pwritev(bs, offset, bytes, NULL, true, true);
    if (rc == 0) {
        rc = vmdk_pwritev(bs, offset, bytes, NULL, true, false);
    }

    qemu_co_mutex_unlock(&state->lock);
    return rc;
}
|
|
|
|
|
2018-05-15 17:36:31 +02:00
|
|
|
static int vmdk_init_extent(BlockBackend *blk,
|
|
|
|
int64_t filesize, bool flat,
|
|
|
|
bool compress, bool zeroed_grain,
|
|
|
|
Error **errp)
|
2005-07-02 16:02:54 +02:00
|
|
|
{
|
2011-07-19 02:45:23 +02:00
|
|
|
int ret, i;
|
2005-07-02 16:02:54 +02:00
|
|
|
VMDK4Header header;
|
2013-12-20 02:48:48 +01:00
|
|
|
uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
|
|
|
|
uint32_t *gd_buf = NULL;
|
|
|
|
int gd_buf_size;
|
2009-05-18 16:42:10 +02:00
|
|
|
|
2011-07-19 02:45:23 +02:00
|
|
|
if (flat) {
|
2019-09-18 11:51:40 +02:00
|
|
|
ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, errp);
|
2011-07-19 02:45:23 +02:00
|
|
|
goto exit;
|
2007-01-24 22:05:24 +01:00
|
|
|
}
|
2005-07-02 16:02:54 +02:00
|
|
|
magic = cpu_to_be32(VMDK4_MAGIC);
|
|
|
|
memset(&header, 0, sizeof(header));
|
2015-09-17 07:04:10 +02:00
|
|
|
if (compress) {
|
|
|
|
header.version = 3;
|
|
|
|
} else if (zeroed_grain) {
|
|
|
|
header.version = 2;
|
|
|
|
} else {
|
|
|
|
header.version = 1;
|
|
|
|
}
|
2013-05-02 04:25:25 +02:00
|
|
|
header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
|
2013-05-02 04:25:24 +02:00
|
|
|
| (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
|
|
|
|
| (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
|
2011-08-12 17:19:32 +02:00
|
|
|
header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
|
2013-12-20 02:48:48 +01:00
|
|
|
header.capacity = filesize / BDRV_SECTOR_SIZE;
|
2011-05-25 00:46:55 +02:00
|
|
|
header.granularity = 128;
|
2013-12-20 02:48:48 +01:00
|
|
|
header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
|
2005-07-02 16:02:54 +02:00
|
|
|
|
2013-12-20 02:48:48 +01:00
|
|
|
grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
|
|
|
|
gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
|
|
|
|
BDRV_SECTOR_SIZE);
|
|
|
|
gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
|
|
|
|
gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
|
2005-07-02 16:02:54 +02:00
|
|
|
|
|
|
|
header.desc_offset = 1;
|
|
|
|
header.desc_size = 20;
|
|
|
|
header.rgd_offset = header.desc_offset + header.desc_size;
|
2013-12-20 02:48:48 +01:00
|
|
|
header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
|
2005-07-02 16:02:54 +02:00
|
|
|
header.grain_offset =
|
2013-12-20 02:48:48 +01:00
|
|
|
ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
|
|
|
|
header.granularity);
|
2011-05-25 00:46:55 +02:00
|
|
|
/* swap endianness for all header fields */
|
|
|
|
header.version = cpu_to_le32(header.version);
|
|
|
|
header.flags = cpu_to_le32(header.flags);
|
|
|
|
header.capacity = cpu_to_le64(header.capacity);
|
|
|
|
header.granularity = cpu_to_le64(header.granularity);
|
2013-08-06 09:44:55 +02:00
|
|
|
header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt);
|
2005-07-02 16:02:54 +02:00
|
|
|
header.desc_offset = cpu_to_le64(header.desc_offset);
|
|
|
|
header.desc_size = cpu_to_le64(header.desc_size);
|
|
|
|
header.rgd_offset = cpu_to_le64(header.rgd_offset);
|
|
|
|
header.gd_offset = cpu_to_le64(header.gd_offset);
|
|
|
|
header.grain_offset = cpu_to_le64(header.grain_offset);
|
2011-08-12 17:19:32 +02:00
|
|
|
header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm);
|
2005-07-02 16:02:54 +02:00
|
|
|
|
|
|
|
header.check_bytes[0] = 0xa;
|
|
|
|
header.check_bytes[1] = 0x20;
|
|
|
|
header.check_bytes[2] = 0xd;
|
|
|
|
header.check_bytes[3] = 0xa;
|
2007-09-17 10:09:54 +02:00
|
|
|
|
|
|
|
/* write all the data */
|
2016-05-06 18:26:27 +02:00
|
|
|
ret = blk_pwrite(blk, 0, &magic, sizeof(magic), 0);
|
2013-12-20 02:48:48 +01:00
|
|
|
if (ret < 0) {
|
2015-03-17 11:54:50 +01:00
|
|
|
error_setg(errp, QERR_IO_ERROR);
|
2010-01-20 00:56:13 +01:00
|
|
|
goto exit;
|
|
|
|
}
|
2016-05-06 18:26:27 +02:00
|
|
|
ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header), 0);
|
2013-12-20 02:48:48 +01:00
|
|
|
if (ret < 0) {
|
2015-03-17 11:54:50 +01:00
|
|
|
error_setg(errp, QERR_IO_ERROR);
|
2010-01-20 00:56:13 +01:00
|
|
|
goto exit;
|
|
|
|
}
|
2005-07-02 16:02:54 +02:00
|
|
|
|
2019-09-18 11:51:40 +02:00
|
|
|
ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9, false,
|
2017-06-13 22:20:54 +02:00
|
|
|
PREALLOC_MODE_OFF, errp);
|
2010-01-20 00:56:13 +01:00
|
|
|
if (ret < 0) {
|
|
|
|
goto exit;
|
|
|
|
}
|
2005-07-02 16:02:54 +02:00
|
|
|
|
|
|
|
/* write grain directory */
|
2013-12-20 02:48:48 +01:00
|
|
|
gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
|
|
|
|
gd_buf = g_malloc0(gd_buf_size);
|
|
|
|
for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
|
2010-01-20 00:56:13 +01:00
|
|
|
i < gt_count; i++, tmp += gt_size) {
|
2013-12-20 02:48:48 +01:00
|
|
|
gd_buf[i] = cpu_to_le32(tmp);
|
|
|
|
}
|
2016-03-08 15:57:05 +01:00
|
|
|
ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
|
2016-05-06 18:26:27 +02:00
|
|
|
gd_buf, gd_buf_size, 0);
|
2013-12-20 02:48:48 +01:00
|
|
|
if (ret < 0) {
|
2015-03-17 11:54:50 +01:00
|
|
|
error_setg(errp, QERR_IO_ERROR);
|
2013-12-20 02:48:48 +01:00
|
|
|
goto exit;
|
2010-01-20 00:56:13 +01:00
|
|
|
}
|
2007-09-17 10:09:54 +02:00
|
|
|
|
2005-07-02 16:02:54 +02:00
|
|
|
/* write backup grain directory */
|
2013-12-20 02:48:48 +01:00
|
|
|
for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
|
2010-01-20 00:56:13 +01:00
|
|
|
i < gt_count; i++, tmp += gt_size) {
|
2013-12-20 02:48:48 +01:00
|
|
|
gd_buf[i] = cpu_to_le32(tmp);
|
|
|
|
}
|
2016-03-08 15:57:05 +01:00
|
|
|
ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
|
2016-05-06 18:26:27 +02:00
|
|
|
gd_buf, gd_buf_size, 0);
|
2013-12-20 02:48:48 +01:00
|
|
|
if (ret < 0) {
|
2015-03-17 11:54:50 +01:00
|
|
|
error_setg(errp, QERR_IO_ERROR);
|
2010-01-20 00:56:13 +01:00
|
|
|
}
|
2005-07-02 16:02:54 +02:00
|
|
|
|
2011-07-19 02:45:23 +02:00
|
|
|
ret = 0;
|
2018-05-15 17:36:31 +02:00
|
|
|
exit:
|
|
|
|
g_free(gd_buf);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int vmdk_create_extent(const char *filename, int64_t filesize,
|
|
|
|
bool flat, bool compress, bool zeroed_grain,
|
|
|
|
BlockBackend **pbb,
|
|
|
|
QemuOpts *opts, Error **errp)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
BlockBackend *blk = NULL;
|
|
|
|
Error *local_err = NULL;
|
|
|
|
|
|
|
|
ret = bdrv_create_file(filename, opts, &local_err);
|
|
|
|
if (ret < 0) {
|
|
|
|
error_propagate(errp, local_err);
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
blk = blk_new_open(filename, NULL, NULL,
|
|
|
|
BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
|
|
|
|
&local_err);
|
|
|
|
if (blk == NULL) {
|
|
|
|
error_propagate(errp, local_err);
|
|
|
|
ret = -EIO;
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
blk_set_allow_write_beyond_eof(blk, true);
|
|
|
|
|
|
|
|
ret = vmdk_init_extent(blk, filesize, flat, compress, zeroed_grain, errp);
|
2013-12-20 02:48:48 +01:00
|
|
|
exit:
|
2016-03-08 15:57:05 +01:00
|
|
|
if (blk) {
|
2018-05-15 17:36:31 +02:00
|
|
|
if (pbb) {
|
|
|
|
*pbb = blk;
|
|
|
|
} else {
|
|
|
|
blk_unref(blk);
|
|
|
|
blk = NULL;
|
|
|
|
}
|
2013-12-20 02:48:48 +01:00
|
|
|
}
|
2011-07-19 02:45:23 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Split @filename into three pieces written to caller-provided buffers,
 * each @buf_len bytes large:
 *
 *   @path:    everything up to and including the last '/' (or, failing
 *             that, the last '\\' or ':'); empty if there is no separator
 *   @prefix:  the part after the separator up to the last '.'
 *   @postfix: the last '.' and everything after it; empty if there is none
 *
 * NOTE(review): the exact truncation behaviour relies on pstrcpy(), which is
 * defined outside this file section; it is assumed to copy at most
 * size - 1 characters and NUL-terminate.
 *
 * Returns VMDK_OK on success; VMDK_ERROR with @errp set if @filename is
 * NULL or empty, or if the path or prefix part does not fit in @buf_len.
 */
static int filename_decompose(const char *filename, char *path, char *prefix,
                              char *postfix, size_t buf_len, Error **errp)
{
    const char *p, *q;

    if (filename == NULL || !strlen(filename)) {
        error_setg(errp, "No filename provided");
        return VMDK_ERROR;
    }
    /* Find the last path separator, trying '/', then '\\', then ':' */
    p = strrchr(filename, '/');
    if (p == NULL) {
        p = strrchr(filename, '\\');
    }
    if (p == NULL) {
        p = strrchr(filename, ':');
    }
    if (p != NULL) {
        p++;
        if (p - filename >= buf_len) {
            /* Report the reason instead of failing silently */
            error_setg(errp, "Path is too long: %s", filename);
            return VMDK_ERROR;
        }
        pstrcpy(path, p - filename + 1, filename);
    } else {
        p = filename;
        path[0] = '\0';
    }
    /* p now points at the file name proper; split it at the last '.' */
    q = strrchr(p, '.');
    if (q == NULL) {
        pstrcpy(prefix, buf_len, p);
        postfix[0] = '\0';
    } else {
        if (q - p >= buf_len) {
            /* Report the reason instead of failing silently */
            error_setg(errp, "File name is too long: %s", filename);
            return VMDK_ERROR;
        }
        pstrcpy(prefix, q - p + 1, p);
        pstrcpy(postfix, buf_len, q);
    }
    return VMDK_OK;
}
|
|
|
|
|
2018-05-15 17:36:32 +02:00
|
|
|
/*
 * Callback used by vmdk_co_do_create() to obtain the BlockBackend for each
 * file of the image:
 *
 * idx == 0: get or create the descriptor file (also the image file if in a
 *           non-split format).
 * idx >= 1: get the n-th extent if in a split subformat
 *
 * vmdk_co_do_create() treats a NULL return value as failure.
 */
typedef BlockBackend *(*vmdk_create_extent_fn)(int64_t size,
                                               int idx,
                                               bool flat,
                                               bool split,
                                               bool compress,
                                               bool zeroed_grain,
                                               void *opaque,
                                               Error **errp);
|
|
|
|
|
|
|
|
/*
 * Append one extent line to the descriptor text in @desc.
 *
 * @extent_line_fmt is a printf-style format that consumes the extent size
 * in sectors (@size in bytes, rounded up) followed by the base name of
 * @filename.
 */
static void vmdk_desc_add_extent(GString *desc,
                                 const char *extent_line_fmt,
                                 int64_t size, const char *filename)
{
    /* Only the last path component is recorded in the descriptor */
    char *leaf = g_path_get_basename(filename);

    g_string_append_printf(desc, extent_line_fmt,
                           DIV_ROUND_UP(size, BDRV_SECTOR_SIZE), leaf);

    g_free(leaf);
}
|
|
|
|
|
|
|
|
static int coroutine_fn vmdk_co_do_create(int64_t size,
|
|
|
|
BlockdevVmdkSubformat subformat,
|
|
|
|
BlockdevVmdkAdapterType adapter_type,
|
|
|
|
const char *backing_file,
|
|
|
|
const char *hw_version,
|
|
|
|
bool compat6,
|
|
|
|
bool zeroed_grain,
|
|
|
|
vmdk_create_extent_fn extent_fn,
|
|
|
|
void *opaque,
|
|
|
|
Error **errp)
|
2011-07-19 02:45:23 +02:00
|
|
|
{
|
2018-05-15 17:36:32 +02:00
|
|
|
int extent_idx;
|
|
|
|
BlockBackend *blk = NULL;
|
2018-12-07 12:42:19 +01:00
|
|
|
BlockBackend *extent_blk;
|
2014-05-28 05:38:58 +02:00
|
|
|
Error *local_err = NULL;
|
2013-12-03 03:41:05 +01:00
|
|
|
char *desc = NULL;
|
2011-07-19 02:45:23 +02:00
|
|
|
int ret = 0;
|
2011-08-12 17:19:32 +02:00
|
|
|
bool flat, split, compress;
|
2013-12-03 03:41:05 +01:00
|
|
|
GString *ext_desc_lines;
|
2011-07-19 02:45:23 +02:00
|
|
|
const int64_t split_size = 0x80000000; /* VMDK has constant split size */
|
2018-05-15 17:36:32 +02:00
|
|
|
int64_t extent_size;
|
|
|
|
int64_t created_size = 0;
|
|
|
|
const char *extent_line_fmt;
|
2015-01-22 14:03:26 +01:00
|
|
|
char *parent_desc_line = g_malloc0(BUF_SIZE);
|
2011-07-19 02:45:23 +02:00
|
|
|
uint32_t parent_cid = 0xffffffff;
|
2013-01-30 00:26:52 +01:00
|
|
|
uint32_t number_heads = 16;
|
2013-12-20 02:48:48 +01:00
|
|
|
uint32_t desc_offset = 0, desc_len;
|
2011-07-19 02:45:23 +02:00
|
|
|
const char desc_template[] =
|
|
|
|
"# Disk DescriptorFile\n"
|
|
|
|
"version=1\n"
|
2014-04-17 12:43:53 +02:00
|
|
|
"CID=%" PRIx32 "\n"
|
|
|
|
"parentCID=%" PRIx32 "\n"
|
2011-07-19 02:45:23 +02:00
|
|
|
"createType=\"%s\"\n"
|
|
|
|
"%s"
|
|
|
|
"\n"
|
|
|
|
"# Extent description\n"
|
|
|
|
"%s"
|
|
|
|
"\n"
|
|
|
|
"# The Disk Data Base\n"
|
|
|
|
"#DDB\n"
|
|
|
|
"\n"
|
2016-05-03 11:43:30 +02:00
|
|
|
"ddb.virtualHWVersion = \"%s\"\n"
|
2011-07-19 02:45:23 +02:00
|
|
|
"ddb.geometry.cylinders = \"%" PRId64 "\"\n"
|
2014-04-17 05:34:37 +02:00
|
|
|
"ddb.geometry.heads = \"%" PRIu32 "\"\n"
|
2011-07-19 02:45:23 +02:00
|
|
|
"ddb.geometry.sectors = \"63\"\n"
|
2013-01-30 00:26:52 +01:00
|
|
|
"ddb.adapterType = \"%s\"\n";
|
2011-07-19 02:45:23 +02:00
|
|
|
|
2013-12-03 03:41:05 +01:00
|
|
|
ext_desc_lines = g_string_new(NULL);
|
|
|
|
|
2011-07-19 02:45:23 +02:00
|
|
|
/* Read out options */
|
2018-05-15 17:36:32 +02:00
|
|
|
if (compat6) {
|
|
|
|
if (hw_version) {
|
2016-05-03 11:43:30 +02:00
|
|
|
error_setg(errp,
|
|
|
|
"compat6 cannot be enabled with hwversion set");
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto exit;
|
|
|
|
}
|
2018-05-15 17:36:32 +02:00
|
|
|
hw_version = "6";
|
2014-06-05 11:21:09 +02:00
|
|
|
}
|
2018-05-15 17:36:32 +02:00
|
|
|
if (!hw_version) {
|
|
|
|
hw_version = "4";
|
2011-07-19 02:45:23 +02:00
|
|
|
}
|
2014-06-05 11:21:09 +02:00
|
|
|
|
2018-05-15 17:36:32 +02:00
|
|
|
if (adapter_type != BLOCKDEV_VMDK_ADAPTER_TYPE_IDE) {
|
2013-01-30 00:26:52 +01:00
|
|
|
/* that's the number of heads with which vmware operates when
|
|
|
|
creating, exporting, etc. vmdk files with a non-ide adapter type */
|
|
|
|
number_heads = 255;
|
|
|
|
}
|
2018-05-15 17:36:32 +02:00
|
|
|
split = (subformat == BLOCKDEV_VMDK_SUBFORMAT_TWOGBMAXEXTENTFLAT) ||
|
|
|
|
(subformat == BLOCKDEV_VMDK_SUBFORMAT_TWOGBMAXEXTENTSPARSE);
|
|
|
|
flat = (subformat == BLOCKDEV_VMDK_SUBFORMAT_MONOLITHICFLAT) ||
|
|
|
|
(subformat == BLOCKDEV_VMDK_SUBFORMAT_TWOGBMAXEXTENTFLAT);
|
|
|
|
compress = subformat == BLOCKDEV_VMDK_SUBFORMAT_STREAMOPTIMIZED;
|
|
|
|
|
2011-07-19 02:45:23 +02:00
|
|
|
if (flat) {
|
2018-05-15 17:36:32 +02:00
|
|
|
extent_line_fmt = "RW %" PRId64 " FLAT \"%s\" 0\n";
|
2011-07-19 02:45:23 +02:00
|
|
|
} else {
|
2018-05-15 17:36:32 +02:00
|
|
|
extent_line_fmt = "RW %" PRId64 " SPARSE \"%s\"\n";
|
2011-07-19 02:45:23 +02:00
|
|
|
}
|
|
|
|
if (flat && backing_file) {
|
2013-10-11 09:43:22 +02:00
|
|
|
error_setg(errp, "Flat image can't have backing file");
|
2013-12-03 03:41:05 +01:00
|
|
|
ret = -ENOTSUP;
|
|
|
|
goto exit;
|
2011-07-19 02:45:23 +02:00
|
|
|
}
|
2013-10-11 09:43:23 +02:00
|
|
|
if (flat && zeroed_grain) {
|
|
|
|
error_setg(errp, "Flat image can't enable zeroed grain");
|
2013-12-03 03:41:05 +01:00
|
|
|
ret = -ENOTSUP;
|
|
|
|
goto exit;
|
2013-10-11 09:43:23 +02:00
|
|
|
}
|
2018-05-15 17:36:32 +02:00
|
|
|
|
|
|
|
/* Create extents */
|
|
|
|
if (split) {
|
|
|
|
extent_size = split_size;
|
|
|
|
} else {
|
|
|
|
extent_size = size;
|
|
|
|
}
|
|
|
|
if (!split && !flat) {
|
|
|
|
created_size = extent_size;
|
|
|
|
} else {
|
|
|
|
created_size = 0;
|
|
|
|
}
|
|
|
|
/* Get the descriptor file BDS */
|
|
|
|
blk = extent_fn(created_size, 0, flat, split, compress, zeroed_grain,
|
|
|
|
opaque, errp);
|
|
|
|
if (!blk) {
|
|
|
|
ret = -EIO;
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
if (!split && !flat) {
|
|
|
|
vmdk_desc_add_extent(ext_desc_lines, extent_line_fmt, created_size,
|
|
|
|
blk_bs(blk)->filename);
|
|
|
|
}
|
|
|
|
|
2011-07-19 02:45:23 +02:00
|
|
|
if (backing_file) {
|
2018-05-15 17:36:32 +02:00
|
|
|
BlockBackend *backing;
|
2019-02-01 20:29:14 +01:00
|
|
|
char *full_backing =
|
|
|
|
bdrv_get_full_backing_filename_from_filename(blk_bs(blk)->filename,
|
|
|
|
backing_file,
|
|
|
|
&local_err);
|
2014-11-26 17:20:28 +01:00
|
|
|
if (local_err) {
|
|
|
|
error_propagate(errp, local_err);
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto exit;
|
|
|
|
}
|
2019-02-01 20:29:14 +01:00
|
|
|
assert(full_backing);
|
2016-03-08 15:57:05 +01:00
|
|
|
|
2018-05-15 17:36:32 +02:00
|
|
|
backing = blk_new_open(full_backing, NULL, NULL,
|
|
|
|
BDRV_O_NO_BACKING, errp);
|
2014-11-26 17:20:28 +01:00
|
|
|
g_free(full_backing);
|
2018-05-15 17:36:32 +02:00
|
|
|
if (backing == NULL) {
|
2016-03-08 15:57:05 +01:00
|
|
|
ret = -EIO;
|
2013-12-03 03:41:05 +01:00
|
|
|
goto exit;
|
2011-07-19 02:45:23 +02:00
|
|
|
}
|
2018-05-15 17:36:32 +02:00
|
|
|
if (strcmp(blk_bs(backing)->drv->format_name, "vmdk")) {
|
|
|
|
error_setg(errp, "Invalid backing file format: %s. Must be vmdk",
|
|
|
|
blk_bs(backing)->drv->format_name);
|
|
|
|
blk_unref(backing);
|
2013-12-03 03:41:05 +01:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto exit;
|
2011-07-19 02:45:23 +02:00
|
|
|
}
|
2018-05-15 17:36:32 +02:00
|
|
|
ret = vmdk_read_cid(blk_bs(backing), 0, &parent_cid);
|
|
|
|
blk_unref(backing);
|
2017-07-09 19:06:14 +02:00
|
|
|
if (ret) {
|
2018-05-15 17:36:32 +02:00
|
|
|
error_setg(errp, "Failed to read parent CID");
|
2017-07-09 19:06:14 +02:00
|
|
|
goto exit;
|
|
|
|
}
|
2015-01-22 14:03:26 +01:00
|
|
|
snprintf(parent_desc_line, BUF_SIZE,
|
2013-06-26 11:24:32 +02:00
|
|
|
"parentFileNameHint=\"%s\"", backing_file);
|
2011-07-19 02:45:23 +02:00
|
|
|
}
|
2018-05-15 17:36:32 +02:00
|
|
|
extent_idx = 1;
|
|
|
|
while (created_size < size) {
|
|
|
|
int64_t cur_size = MIN(size - created_size, extent_size);
|
|
|
|
extent_blk = extent_fn(cur_size, extent_idx, flat, split, compress,
|
|
|
|
zeroed_grain, opaque, errp);
|
|
|
|
if (!extent_blk) {
|
2013-12-03 03:41:05 +01:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto exit;
|
2011-07-19 02:45:23 +02:00
|
|
|
}
|
2018-05-15 17:36:32 +02:00
|
|
|
vmdk_desc_add_extent(ext_desc_lines, extent_line_fmt, cur_size,
|
|
|
|
blk_bs(extent_blk)->filename);
|
|
|
|
created_size += cur_size;
|
|
|
|
extent_idx++;
|
|
|
|
blk_unref(extent_blk);
|
2011-07-19 02:45:23 +02:00
|
|
|
}
|
2018-12-07 12:42:19 +01:00
|
|
|
|
|
|
|
/* Check whether we got excess extents */
|
|
|
|
extent_blk = extent_fn(-1, extent_idx, flat, split, compress, zeroed_grain,
|
|
|
|
opaque, NULL);
|
|
|
|
if (extent_blk) {
|
|
|
|
blk_unref(extent_blk);
|
|
|
|
error_setg(errp, "List of extents contains unused extents");
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
|
2011-07-19 02:45:23 +02:00
|
|
|
/* generate descriptor file */
|
2013-12-03 03:41:05 +01:00
|
|
|
desc = g_strdup_printf(desc_template,
|
2014-12-04 00:28:29 +01:00
|
|
|
g_random_int(),
|
2013-12-03 03:41:05 +01:00
|
|
|
parent_cid,
|
2018-05-15 17:36:32 +02:00
|
|
|
BlockdevVmdkSubformat_str(subformat),
|
2013-12-03 03:41:05 +01:00
|
|
|
parent_desc_line,
|
|
|
|
ext_desc_lines->str,
|
2016-05-03 11:43:30 +02:00
|
|
|
hw_version,
|
2018-05-15 17:36:32 +02:00
|
|
|
size /
|
2013-12-20 02:48:48 +01:00
|
|
|
(int64_t)(63 * number_heads * BDRV_SECTOR_SIZE),
|
2013-12-03 03:41:05 +01:00
|
|
|
number_heads,
|
2018-05-15 17:36:32 +02:00
|
|
|
BlockdevVmdkAdapterType_str(adapter_type));
|
2013-12-20 02:48:48 +01:00
|
|
|
desc_len = strlen(desc);
|
|
|
|
/* the descriptor offset = 0x200 */
|
|
|
|
if (!split && !flat) {
|
|
|
|
desc_offset = 0x200;
|
2018-05-15 17:36:32 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
ret = blk_pwrite(blk, desc_offset, desc, desc_len, 0);
|
|
|
|
if (ret < 0) {
|
|
|
|
error_setg_errno(errp, -ret, "Could not write description");
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
/* bdrv_pwrite write padding zeros to align to sector, we don't need that
|
|
|
|
* for description file */
|
|
|
|
if (desc_offset == 0) {
|
2019-09-18 11:51:40 +02:00
|
|
|
ret = blk_truncate(blk, desc_len, false, PREALLOC_MODE_OFF, errp);
|
2013-12-20 02:48:48 +01:00
|
|
|
if (ret < 0) {
|
|
|
|
goto exit;
|
|
|
|
}
|
2011-07-19 02:45:23 +02:00
|
|
|
}
|
2018-05-15 17:36:32 +02:00
|
|
|
ret = 0;
|
|
|
|
exit:
|
|
|
|
if (blk) {
|
|
|
|
blk_unref(blk);
|
|
|
|
}
|
|
|
|
g_free(desc);
|
|
|
|
g_free(parent_desc_line);
|
|
|
|
g_string_free(ext_desc_lines, true);
|
|
|
|
return ret;
|
|
|
|
}
|
2016-03-08 15:57:05 +01:00
|
|
|
|
2018-05-15 17:36:32 +02:00
|
|
|
typedef struct {
|
|
|
|
char *path;
|
|
|
|
char *prefix;
|
|
|
|
char *postfix;
|
|
|
|
QemuOpts *opts;
|
|
|
|
} VMDKCreateOptsData;
|
|
|
|
|
|
|
|
static BlockBackend *vmdk_co_create_opts_cb(int64_t size, int idx,
|
|
|
|
bool flat, bool split, bool compress,
|
|
|
|
bool zeroed_grain, void *opaque,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
BlockBackend *blk = NULL;
|
|
|
|
BlockDriverState *bs = NULL;
|
|
|
|
VMDKCreateOptsData *data = opaque;
|
|
|
|
char *ext_filename = NULL;
|
|
|
|
char *rel_filename = NULL;
|
|
|
|
|
2018-12-07 12:42:19 +01:00
|
|
|
/* We're done, don't create excess extents. */
|
|
|
|
if (size == -1) {
|
|
|
|
assert(errp == NULL);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2018-05-15 17:36:32 +02:00
|
|
|
if (idx == 0) {
|
|
|
|
rel_filename = g_strdup_printf("%s%s", data->prefix, data->postfix);
|
|
|
|
} else if (split) {
|
|
|
|
rel_filename = g_strdup_printf("%s-%c%03d%s",
|
|
|
|
data->prefix,
|
|
|
|
flat ? 'f' : 's', idx, data->postfix);
|
|
|
|
} else {
|
|
|
|
assert(idx == 1);
|
|
|
|
rel_filename = g_strdup_printf("%s-flat%s", data->prefix, data->postfix);
|
|
|
|
}
|
|
|
|
|
|
|
|
ext_filename = g_strdup_printf("%s%s", data->path, rel_filename);
|
|
|
|
g_free(rel_filename);
|
|
|
|
|
|
|
|
if (vmdk_create_extent(ext_filename, size,
|
|
|
|
flat, compress, zeroed_grain, &blk, data->opts,
|
|
|
|
errp)) {
|
2013-12-03 03:41:05 +01:00
|
|
|
goto exit;
|
2011-07-19 02:45:23 +02:00
|
|
|
}
|
2018-05-15 17:36:32 +02:00
|
|
|
bdrv_unref(bs);
|
|
|
|
exit:
|
|
|
|
g_free(ext_filename);
|
|
|
|
return blk;
|
|
|
|
}
|
2016-03-08 15:57:05 +01:00
|
|
|
|
2018-05-15 17:36:32 +02:00
|
|
|
/*
 * QemuOpts based image creation (registered as .bdrv_co_create_opts).
 *
 * Splits @filename into path, prefix and postfix, reads the creation options
 * out of @opts, converts the adapter type and subformat strings to their QAPI
 * enums and delegates the actual work to vmdk_co_do_create(), with
 * vmdk_co_create_opts_cb() creating the individual extent files.
 *
 * Defaults: adapter type IDE, subformat monolithicSparse.  The requested size
 * is rounded up to a multiple of BDRV_SECTOR_SIZE.  A hardware version of
 * "undefined" (the option's declared default) is passed on as NULL.
 *
 * Returns the result of vmdk_co_do_create(), or -EINVAL if the file name
 * cannot be decomposed or an enum option cannot be parsed; @errp is set on
 * those failures.
 */
static int coroutine_fn vmdk_co_create_opts(const char *filename, QemuOpts *opts,
                                            Error **errp)
{
    Error *local_err = NULL;
    int64_t total_size;
    char *adapter_type = NULL;
    BlockdevVmdkAdapterType adapter_type_enum;
    char *backing_file = NULL;
    char *hw_version = NULL;
    char *fmt = NULL;
    BlockdevVmdkSubformat subformat;
    int ret = 0;
    char *path = g_malloc0(PATH_MAX);
    char *prefix = g_malloc0(PATH_MAX);
    char *postfix = g_malloc0(PATH_MAX);
    bool zeroed_grain;
    bool compat6;
    VMDKCreateOptsData data;

    if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) {
        ret = -EINVAL;
        goto exit;
    }

    /* Read out options */
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);
    adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION);
    compat6 = qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false);
    /*
     * Like the other string options, hw_version may come back NULL if the
     * option list in use declares no default for it, so guard the compare.
     */
    if (hw_version && strcmp(hw_version, "undefined") == 0) {
        g_free(hw_version);
        hw_version = NULL;
    }
    fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
    zeroed_grain = qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false);

    if (adapter_type) {
        adapter_type_enum = qapi_enum_parse(&BlockdevVmdkAdapterType_lookup,
                                            adapter_type,
                                            BLOCKDEV_VMDK_ADAPTER_TYPE_IDE,
                                            &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto exit;
        }
    } else {
        adapter_type_enum = BLOCKDEV_VMDK_ADAPTER_TYPE_IDE;
    }

    if (!fmt) {
        /* Default format to monolithicSparse */
        subformat = BLOCKDEV_VMDK_SUBFORMAT_MONOLITHICSPARSE;
    } else {
        subformat = qapi_enum_parse(&BlockdevVmdkSubformat_lookup,
                                    fmt,
                                    BLOCKDEV_VMDK_SUBFORMAT_MONOLITHICSPARSE,
                                    &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto exit;
        }
    }

    data = (VMDKCreateOptsData){
        .prefix = prefix,
        .postfix = postfix,
        .path = path,
        .opts = opts,
    };
    ret = vmdk_co_do_create(total_size, subformat, adapter_type_enum,
                            backing_file, hw_version, compat6, zeroed_grain,
                            vmdk_co_create_opts_cb, &data, errp);

exit:
    g_free(adapter_type);
    g_free(backing_file);
    g_free(hw_version);
    g_free(fmt);
    g_free(path);
    g_free(prefix);
    g_free(postfix);
    return ret;
}
|
|
|
|
|
|
|
|
static BlockBackend *vmdk_co_create_cb(int64_t size, int idx,
|
|
|
|
bool flat, bool split, bool compress,
|
|
|
|
bool zeroed_grain, void *opaque,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
BlockDriverState *bs;
|
|
|
|
BlockBackend *blk;
|
|
|
|
BlockdevCreateOptionsVmdk *opts = opaque;
|
|
|
|
|
|
|
|
if (idx == 0) {
|
|
|
|
bs = bdrv_open_blockdev_ref(opts->file, errp);
|
|
|
|
} else {
|
|
|
|
int i;
|
|
|
|
BlockdevRefList *list = opts->extents;
|
|
|
|
for (i = 1; i < idx; i++) {
|
|
|
|
if (!list || !list->next) {
|
|
|
|
error_setg(errp, "Extent [%d] not specified", i);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
list = list->next;
|
|
|
|
}
|
|
|
|
if (!list) {
|
|
|
|
error_setg(errp, "Extent [%d] not specified", idx - 1);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
bs = bdrv_open_blockdev_ref(list->value, errp);
|
|
|
|
}
|
|
|
|
if (!bs) {
|
|
|
|
return NULL;
|
|
|
|
}
|
2019-04-25 14:25:10 +02:00
|
|
|
blk = blk_new(bdrv_get_aio_context(bs),
|
|
|
|
BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | BLK_PERM_RESIZE,
|
2018-05-15 17:36:32 +02:00
|
|
|
BLK_PERM_ALL);
|
|
|
|
if (blk_insert_bs(blk, bs, errp)) {
|
|
|
|
bdrv_unref(bs);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
blk_set_allow_write_beyond_eof(blk, true);
|
|
|
|
bdrv_unref(bs);
|
|
|
|
|
2018-12-07 12:42:19 +01:00
|
|
|
if (size != -1) {
|
|
|
|
ret = vmdk_init_extent(blk, size, flat, compress, zeroed_grain, errp);
|
|
|
|
if (ret) {
|
|
|
|
blk_unref(blk);
|
|
|
|
blk = NULL;
|
|
|
|
}
|
2018-05-15 17:36:32 +02:00
|
|
|
}
|
|
|
|
return blk;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * blockdev-create entry point (registered as .bdrv_co_create).
 *
 * Rejects sizes that are not a multiple of BDRV_SECTOR_SIZE with -EINVAL,
 * otherwise forwards the VMDK specific options to vmdk_co_do_create(), using
 * vmdk_co_create_cb() to obtain the extents.  The compat6 argument of
 * vmdk_co_do_create() is always false on this path.
 */
static int coroutine_fn vmdk_co_create(BlockdevCreateOptions *create_options,
                                       Error **errp)
{
    BlockdevCreateOptionsVmdk *vmdk_opts = &create_options->u.vmdk;

    /* Validate options */
    if (!QEMU_IS_ALIGNED(vmdk_opts->size, BDRV_SECTOR_SIZE)) {
        error_setg(errp, "Image size must be a multiple of 512 bytes");
        return -EINVAL;
    }

    return vmdk_co_do_create(vmdk_opts->size,
                             vmdk_opts->subformat,
                             vmdk_opts->adapter_type,
                             vmdk_opts->backing_file,
                             vmdk_opts->hwversion,
                             false,
                             vmdk_opts->zeroed_grain,
                             vmdk_co_create_cb,
                             vmdk_opts, errp);
}
|
|
|
|
|
2004-09-18 21:32:11 +02:00
|
|
|
/*
 * Tear down the driver state of @bs: release all extents and the create
 * type string, then remove and free the migration blocker.
 */
static void vmdk_close(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;

    vmdk_free_extents(bs);
    g_free(state->create_type);

    /* Unregister the blocker before freeing the Error object itself */
    migrate_del_blocker(state->migration_blocker);
    error_free(state->migration_blocker);
}
|
|
|
|
|
2011-10-20 13:16:24 +02:00
|
|
|
/*
 * Flush the file of every extent.
 *
 * All extents are flushed even if one of them fails.  Returns 0 if every
 * flush succeeded, otherwise the error code of the last extent that failed.
 */
static coroutine_fn int vmdk_co_flush(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;
    int result = 0;
    int idx;

    for (idx = 0; idx < state->num_extents; idx++) {
        int rc = bdrv_co_flush(state->extents[idx].file->bs);

        if (rc < 0) {
            /* Remember the failure but keep flushing the remaining extents */
            result = rc;
        }
    }
    return result;
}
|
|
|
|
|
2011-07-12 13:56:39 +02:00
|
|
|
/*
 * Return the allocated size of bs->file plus that of every extent stored in
 * a different file, or the first negative error code encountered.
 */
static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;
    int64_t total;
    int idx;

    total = bdrv_get_allocated_file_size(bs->file->bs);
    if (total < 0) {
        return total;
    }

    for (idx = 0; idx < state->num_extents; idx++) {
        VmdkExtent *extent = &state->extents[idx];
        int64_t extent_size;

        /* Extents living in bs->file are already accounted for above */
        if (extent->file == bs->file) {
            continue;
        }

        extent_size = bdrv_get_allocated_file_size(extent->file->bs);
        if (extent_size < 0) {
            return extent_size;
        }
        total += extent_size;
    }
    return total;
}
|
2009-05-18 16:42:10 +02:00
|
|
|
|
2013-07-01 05:33:17 +02:00
|
|
|
/*
 * Return 0 if any flat extent sits on storage that does not report zero
 * initialisation, 1 otherwise.  Non-flat extents are not consulted.
 */
static int vmdk_has_zero_init(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;
    int idx;

    for (idx = 0; idx < state->num_extents; idx++) {
        VmdkExtent *extent = &state->extents[idx];

        if (extent->flat && !bdrv_has_zero_init(extent->file->bs)) {
            return 0;
        }
    }
    return 1;
}
|
|
|
|
|
2013-10-31 03:06:23 +01:00
|
|
|
/*
 * Build a newly allocated ImageInfo describing @extent.
 *
 * The file name is refreshed before it is copied.  The compressed field is
 * flagged as present only when the extent is compressed, and the cluster
 * size only when the extent is not flat.
 */
static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent)
{
    ImageInfo *info = g_new0(ImageInfo, 1);

    bdrv_refresh_filename(extent->file->bs);

    /* g_new0() zeroed the structure, so only the known fields are set */
    info->filename = g_strdup(extent->file->bs->filename);
    info->format = g_strdup(extent->type);
    info->virtual_size = extent->sectors * BDRV_SECTOR_SIZE;
    info->compressed = extent->compressed;
    info->has_compressed = extent->compressed;
    info->cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE;
    info->has_cluster_size = !extent->flat;

    return info;
}
|
|
|
|
|
2018-03-01 17:36:19 +01:00
|
|
|
/*
 * Consistency check (registered as .bdrv_co_check).
 *
 * Walks the image one cluster at a time.  For every position the owning
 * extent must exist, its cluster offset must be retrievable and, when a
 * cluster is mapped (VMDK_OK), that offset must lie inside the extent file.
 *
 * Repairing is not supported: any non-zero @fix yields -ENOTSUP.
 *
 * Returns 0 if the whole image was walked without finding a problem.  On the
 * first problem a message is printed to stderr, result->corruptions is
 * incremented and the corresponding error value is returned.
 */
static int coroutine_fn vmdk_co_check(BlockDriverState *bs,
                                      BdrvCheckResult *result,
                                      BdrvCheckMode fix)
{
    BDRVVmdkState *state = bs->opaque;
    VmdkExtent *extent = NULL;
    int64_t nb_sectors = bdrv_nb_sectors(bs);
    int64_t sector;
    uint64_t cluster_offset;
    int ret;

    if (fix) {
        return -ENOTSUP;
    }

    for (sector = 0; sector < nb_sectors; sector += extent->cluster_sectors) {
        /* The previous extent is passed along as the starting point */
        extent = find_extent(state, sector, extent);
        if (!extent) {
            fprintf(stderr,
                    "ERROR: could not find extent for sector %" PRId64 "\n",
                    sector);
            ret = -EINVAL;
            goto corrupt;
        }

        ret = get_cluster_offset(bs, extent, NULL,
                                 sector << BDRV_SECTOR_BITS,
                                 false, &cluster_offset, 0, 0);
        if (ret == VMDK_ERROR) {
            fprintf(stderr,
                    "ERROR: could not get cluster_offset for sector %"
                    PRId64 "\n", sector);
            goto corrupt;
        }

        if (ret == VMDK_OK) {
            int64_t extent_len = bdrv_getlength(extent->file->bs);

            if (extent_len < 0) {
                fprintf(stderr,
                        "ERROR: could not get extent file length for sector %"
                        PRId64 "\n", sector);
                ret = extent_len;
                goto corrupt;
            }
            if (cluster_offset >= extent_len) {
                fprintf(stderr,
                        "ERROR: cluster offset for sector %"
                        PRId64 " points after EOF\n", sector);
                ret = -EINVAL;
                goto corrupt;
            }
        }
    }

    return 0;

corrupt:
    result->corruptions++;
    return ret;
}
|
|
|
|
|
2019-02-08 16:06:06 +01:00
|
|
|
static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs,
|
|
|
|
Error **errp)
|
2013-10-31 03:06:23 +01:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
ImageInfoSpecific *spec_info = g_new0(ImageInfoSpecific, 1);
|
|
|
|
ImageInfoList **next;
|
|
|
|
|
|
|
|
*spec_info = (ImageInfoSpecific){
|
2015-10-26 23:34:54 +01:00
|
|
|
.type = IMAGE_INFO_SPECIFIC_KIND_VMDK,
|
qapi: Don't special-case simple union wrappers
Simple unions were carrying a special case that hid their 'data'
QMP member from the resulting C struct, via the hack method
QAPISchemaObjectTypeVariant.simple_union_type(). But by using
the work we started by unboxing flat union and alternate
branches, coupled with the ability to visit the members of an
implicit type, we can now expose the simple union's implicit
type in qapi-types.h:
| struct q_obj_ImageInfoSpecificQCow2_wrapper {
| ImageInfoSpecificQCow2 *data;
| };
|
| struct q_obj_ImageInfoSpecificVmdk_wrapper {
| ImageInfoSpecificVmdk *data;
| };
...
| struct ImageInfoSpecific {
| ImageInfoSpecificKind type;
| union { /* union tag is @type */
| void *data;
|- ImageInfoSpecificQCow2 *qcow2;
|- ImageInfoSpecificVmdk *vmdk;
|+ q_obj_ImageInfoSpecificQCow2_wrapper qcow2;
|+ q_obj_ImageInfoSpecificVmdk_wrapper vmdk;
| } u;
| };
Doing this removes asymmetry between QAPI's QMP side and its
C side (both sides now expose 'data'), and means that the
treatment of a simple union as sugar for a flat union is now
equivalent in both languages (previously the two approaches used
a different layer of dereferencing, where the simple union could
be converted to a flat union with equivalent C layout but
different {} on the wire, or to an equivalent QMP wire form
but with different C representation). Using the implicit type
also lets us get rid of the simple_union_type() hack.
Of course, now all clients of simple unions have to adjust from
using su->u.member to using su->u.member.data; while this touches
a number of files in the tree, some earlier cleanup patches
helped minimize the change to the initialization of a temporary
variable rather than every single member access. The generated
qapi-visit.c code is also affected by the layout change:
|@@ -7393,10 +7393,10 @@ void visit_type_ImageInfoSpecific_member
| }
| switch (obj->type) {
| case IMAGE_INFO_SPECIFIC_KIND_QCOW2:
|- visit_type_ImageInfoSpecificQCow2(v, "data", &obj->u.qcow2, &err);
|+ visit_type_q_obj_ImageInfoSpecificQCow2_wrapper_members(v, &obj->u.qcow2, &err);
| break;
| case IMAGE_INFO_SPECIFIC_KIND_VMDK:
|- visit_type_ImageInfoSpecificVmdk(v, "data", &obj->u.vmdk, &err);
|+ visit_type_q_obj_ImageInfoSpecificVmdk_wrapper_members(v, &obj->u.vmdk, &err);
| break;
| default:
| abort();
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1458254921-17042-13-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-03-17 23:48:37 +01:00
|
|
|
.u = {
|
|
|
|
.vmdk.data = g_new0(ImageInfoSpecificVmdk, 1),
|
2013-10-31 03:06:23 +01:00
|
|
|
},
|
|
|
|
};
|
|
|
|
|
qapi: Don't special-case simple union wrappers
Simple unions were carrying a special case that hid their 'data'
QMP member from the resulting C struct, via the hack method
QAPISchemaObjectTypeVariant.simple_union_type(). But by using
the work we started by unboxing flat union and alternate
branches, coupled with the ability to visit the members of an
implicit type, we can now expose the simple union's implicit
type in qapi-types.h:
| struct q_obj_ImageInfoSpecificQCow2_wrapper {
| ImageInfoSpecificQCow2 *data;
| };
|
| struct q_obj_ImageInfoSpecificVmdk_wrapper {
| ImageInfoSpecificVmdk *data;
| };
...
| struct ImageInfoSpecific {
| ImageInfoSpecificKind type;
| union { /* union tag is @type */
| void *data;
|- ImageInfoSpecificQCow2 *qcow2;
|- ImageInfoSpecificVmdk *vmdk;
|+ q_obj_ImageInfoSpecificQCow2_wrapper qcow2;
|+ q_obj_ImageInfoSpecificVmdk_wrapper vmdk;
| } u;
| };
Doing this removes asymmetry between QAPI's QMP side and its
C side (both sides now expose 'data'), and means that the
treatment of a simple union as sugar for a flat union is now
equivalent in both languages (previously the two approaches used
a different layer of dereferencing, where the simple union could
be converted to a flat union with equivalent C layout but
different {} on the wire, or to an equivalent QMP wire form
but with different C representation). Using the implicit type
also lets us get rid of the simple_union_type() hack.
Of course, now all clients of simple unions have to adjust from
using su->u.member to using su->u.member.data; while this touches
a number of files in the tree, some earlier cleanup patches
helped minimize the change to the initialization of a temporary
variable rather than every single member access. The generated
qapi-visit.c code is also affected by the layout change:
|@@ -7393,10 +7393,10 @@ void visit_type_ImageInfoSpecific_member
| }
| switch (obj->type) {
| case IMAGE_INFO_SPECIFIC_KIND_QCOW2:
|- visit_type_ImageInfoSpecificQCow2(v, "data", &obj->u.qcow2, &err);
|+ visit_type_q_obj_ImageInfoSpecificQCow2_wrapper_members(v, &obj->u.qcow2, &err);
| break;
| case IMAGE_INFO_SPECIFIC_KIND_VMDK:
|- visit_type_ImageInfoSpecificVmdk(v, "data", &obj->u.vmdk, &err);
|+ visit_type_q_obj_ImageInfoSpecificVmdk_wrapper_members(v, &obj->u.vmdk, &err);
| break;
| default:
| abort();
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1458254921-17042-13-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-03-17 23:48:37 +01:00
|
|
|
*spec_info->u.vmdk.data = (ImageInfoSpecificVmdk) {
|
2013-10-31 03:06:23 +01:00
|
|
|
.create_type = g_strdup(s->create_type),
|
|
|
|
.cid = s->cid,
|
|
|
|
.parent_cid = s->parent_cid,
|
|
|
|
};
|
|
|
|
|
qapi: Don't special-case simple union wrappers
Simple unions were carrying a special case that hid their 'data'
QMP member from the resulting C struct, via the hack method
QAPISchemaObjectTypeVariant.simple_union_type(). But by using
the work we started by unboxing flat union and alternate
branches, coupled with the ability to visit the members of an
implicit type, we can now expose the simple union's implicit
type in qapi-types.h:
| struct q_obj_ImageInfoSpecificQCow2_wrapper {
| ImageInfoSpecificQCow2 *data;
| };
|
| struct q_obj_ImageInfoSpecificVmdk_wrapper {
| ImageInfoSpecificVmdk *data;
| };
...
| struct ImageInfoSpecific {
| ImageInfoSpecificKind type;
| union { /* union tag is @type */
| void *data;
|- ImageInfoSpecificQCow2 *qcow2;
|- ImageInfoSpecificVmdk *vmdk;
|+ q_obj_ImageInfoSpecificQCow2_wrapper qcow2;
|+ q_obj_ImageInfoSpecificVmdk_wrapper vmdk;
| } u;
| };
Doing this removes asymmetry between QAPI's QMP side and its
C side (both sides now expose 'data'), and means that the
treatment of a simple union as sugar for a flat union is now
equivalent in both languages (previously the two approaches used
a different layer of dereferencing, where the simple union could
be converted to a flat union with equivalent C layout but
different {} on the wire, or to an equivalent QMP wire form
but with different C representation). Using the implicit type
also lets us get rid of the simple_union_type() hack.
Of course, now all clients of simple unions have to adjust from
using su->u.member to using su->u.member.data; while this touches
a number of files in the tree, some earlier cleanup patches
helped minimize the change to the initialization of a temporary
variable rather than every single member access. The generated
qapi-visit.c code is also affected by the layout change:
|@@ -7393,10 +7393,10 @@ void visit_type_ImageInfoSpecific_member
| }
| switch (obj->type) {
| case IMAGE_INFO_SPECIFIC_KIND_QCOW2:
|- visit_type_ImageInfoSpecificQCow2(v, "data", &obj->u.qcow2, &err);
|+ visit_type_q_obj_ImageInfoSpecificQCow2_wrapper_members(v, &obj->u.qcow2, &err);
| break;
| case IMAGE_INFO_SPECIFIC_KIND_VMDK:
|- visit_type_ImageInfoSpecificVmdk(v, "data", &obj->u.vmdk, &err);
|+ visit_type_q_obj_ImageInfoSpecificVmdk_wrapper_members(v, &obj->u.vmdk, &err);
| break;
| default:
| abort();
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1458254921-17042-13-git-send-email-eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-03-17 23:48:37 +01:00
|
|
|
next = &spec_info->u.vmdk.data->extents;
|
2013-10-31 03:06:23 +01:00
|
|
|
for (i = 0; i < s->num_extents; i++) {
|
|
|
|
*next = g_new0(ImageInfoList, 1);
|
|
|
|
(*next)->value = vmdk_get_extent_info(&s->extents[i]);
|
|
|
|
(*next)->next = NULL;
|
|
|
|
next = &(*next)->next;
|
|
|
|
}
|
|
|
|
|
|
|
|
return spec_info;
|
|
|
|
}
|
|
|
|
|
2014-11-14 05:09:21 +01:00
|
|
|
static bool vmdk_extents_type_eq(const VmdkExtent *a, const VmdkExtent *b)
|
|
|
|
{
|
|
|
|
return a->flat == b->flat &&
|
|
|
|
a->compressed == b->compressed &&
|
|
|
|
(a->flat || a->cluster_sectors == b->cluster_sectors);
|
|
|
|
}
|
|
|
|
|
2014-05-06 15:08:45 +02:00
|
|
|
/*
 * Fill @bdi from the first extent.
 *
 * Fails with -ENOTSUP if the image consists of extents of differing kinds
 * (see vmdk_extents_type_eq()).  The cluster size is only reported for
 * non-flat extents; for flat ones bdi->cluster_size is left untouched.
 */
static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVVmdkState *state = bs->opaque;
    const VmdkExtent *first;
    int idx;

    assert(state->num_extents);
    first = &state->extents[0];

    /* See if we have multiple extents but they have different cases */
    for (idx = 1; idx < state->num_extents; idx++) {
        if (!vmdk_extents_type_eq(first, &state->extents[idx])) {
            return -ENOTSUP;
        }
    }

    bdi->needs_compressed_writes = first->compressed;
    if (!first->flat) {
        bdi->cluster_size = first->cluster_sectors << BDRV_SECTOR_BITS;
    }
    return 0;
}
|
|
|
|
|
2019-02-01 20:29:26 +01:00
|
|
|
/*
 * Store the options of the children of @bs in @target.
 *
 * "file" is always stored.  "backing" is stored only when
 * @backing_overridden is set: the backing node's options if there is one,
 * an explicit null otherwise.
 */
static void vmdk_gather_child_options(BlockDriverState *bs, QDict *target,
                                      bool backing_overridden)
{
    /* No children but file and backing can be explicitly specified (TODO) */
    qdict_put(target, "file",
              qobject_ref(bs->file->bs->full_open_options));

    if (!backing_overridden) {
        return;
    }

    if (bs->backing) {
        qdict_put(target, "backing",
                  qobject_ref(bs->backing->bs->full_open_options));
    } else {
        qdict_put_null(target, "backing");
    }
}
|
|
|
|
|
2014-06-05 11:21:09 +02:00
|
|
|
static QemuOptsList vmdk_create_opts = {
|
|
|
|
.name = "vmdk-create-opts",
|
|
|
|
.head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head),
|
|
|
|
.desc = {
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_SIZE,
|
|
|
|
.type = QEMU_OPT_SIZE,
|
|
|
|
.help = "Virtual disk size"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_ADAPTER_TYPE,
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "Virtual adapter type, can be one of "
|
|
|
|
"ide (default), lsilogic, buslogic or legacyESX"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_BACKING_FILE,
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "File name of a base image"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_COMPAT6,
|
|
|
|
.type = QEMU_OPT_BOOL,
|
|
|
|
.help = "VMDK version 6 image",
|
|
|
|
.def_value_str = "off"
|
|
|
|
},
|
2016-05-03 11:43:30 +02:00
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_HWVERSION,
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "VMDK hardware version",
|
|
|
|
.def_value_str = "undefined"
|
|
|
|
},
|
2014-06-05 11:21:09 +02:00
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_SUBFMT,
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help =
|
|
|
|
"VMDK flat extent format, can be one of "
|
|
|
|
"{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} "
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_ZEROED_GRAIN,
|
|
|
|
.type = QEMU_OPT_BOOL,
|
|
|
|
.help = "Enable efficient zero writes "
|
|
|
|
"using the zeroed-grain GTE feature"
|
|
|
|
},
|
|
|
|
{ /* end of list */ }
|
|
|
|
}
|
2009-05-18 16:42:10 +02:00
|
|
|
};
|
|
|
|
|
2009-05-10 00:03:42 +02:00
|
|
|
static BlockDriver bdrv_vmdk = {
|
2013-07-01 05:33:17 +02:00
|
|
|
.format_name = "vmdk",
|
|
|
|
.instance_size = sizeof(BDRVVmdkState),
|
|
|
|
.bdrv_probe = vmdk_probe,
|
|
|
|
.bdrv_open = vmdk_open,
|
2018-03-01 17:36:19 +01:00
|
|
|
.bdrv_co_check = vmdk_co_check,
|
2013-07-01 05:33:17 +02:00
|
|
|
.bdrv_reopen_prepare = vmdk_reopen_prepare,
|
2016-12-19 16:36:02 +01:00
|
|
|
.bdrv_child_perm = bdrv_format_default_perms,
|
2016-04-25 17:34:41 +02:00
|
|
|
.bdrv_co_preadv = vmdk_co_preadv,
|
2016-04-26 13:39:11 +02:00
|
|
|
.bdrv_co_pwritev = vmdk_co_pwritev,
|
2016-07-22 10:17:45 +02:00
|
|
|
.bdrv_co_pwritev_compressed = vmdk_co_pwritev_compressed,
|
2016-06-01 23:10:12 +02:00
|
|
|
.bdrv_co_pwrite_zeroes = vmdk_co_pwrite_zeroes,
|
2013-07-01 05:33:17 +02:00
|
|
|
.bdrv_close = vmdk_close,
|
2018-01-18 13:43:45 +01:00
|
|
|
.bdrv_co_create_opts = vmdk_co_create_opts,
|
2018-05-15 17:36:32 +02:00
|
|
|
.bdrv_co_create = vmdk_co_create,
|
2013-07-01 05:33:17 +02:00
|
|
|
.bdrv_co_flush_to_disk = vmdk_co_flush,
|
2018-02-13 21:26:58 +01:00
|
|
|
.bdrv_co_block_status = vmdk_co_block_status,
|
2013-07-01 05:33:17 +02:00
|
|
|
.bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
|
|
|
|
.bdrv_has_zero_init = vmdk_has_zero_init,
|
2013-10-31 03:06:23 +01:00
|
|
|
.bdrv_get_specific_info = vmdk_get_specific_info,
|
2013-12-11 19:26:16 +01:00
|
|
|
.bdrv_refresh_limits = vmdk_refresh_limits,
|
2014-05-06 15:08:45 +02:00
|
|
|
.bdrv_get_info = vmdk_get_info,
|
2019-02-01 20:29:26 +01:00
|
|
|
.bdrv_gather_child_options = vmdk_gather_child_options,
|
2013-07-01 05:33:17 +02:00
|
|
|
|
2014-06-04 15:09:35 +02:00
|
|
|
.supports_backing = true,
|
2014-06-05 11:21:09 +02:00
|
|
|
.create_opts = &vmdk_create_opts,
|
2004-08-01 23:59:26 +02:00
|
|
|
};
|
2009-05-10 00:03:42 +02:00
|
|
|
|
|
|
|
static void bdrv_vmdk_init(void)
|
|
|
|
{
|
|
|
|
bdrv_register(&bdrv_vmdk);
|
|
|
|
}
|
|
|
|
|
|
|
|
block_init(bdrv_vmdk_init);
|