2007-11-11 03:51:17 +01:00
|
|
|
#ifndef BLOCK_H
|
|
|
|
#define BLOCK_H
|
|
|
|
|
2012-12-17 18:19:44 +01:00
|
|
|
#include "block/aio.h"
|
2018-02-16 17:50:12 +01:00
|
|
|
#include "block/aio-wait.h"
|
2016-03-09 10:52:44 +01:00
|
|
|
#include "qemu/iov.h"
|
2015-09-01 15:48:02 +02:00
|
|
|
#include "qemu/coroutine.h"
|
2014-09-05 15:46:18 +02:00
|
|
|
#include "block/accounting.h"
|
2016-03-08 05:44:55 +01:00
|
|
|
#include "block/dirty-bitmap.h"
|
2016-10-27 18:07:00 +02:00
|
|
|
#include "block/blockjob.h"
|
2016-03-08 05:44:53 +01:00
|
|
|
#include "qemu/hbitmap.h"
|
2008-09-22 21:17:18 +02:00
|
|
|
|
2007-11-11 03:51:17 +01:00
|
|
|
/* block.c */

/*
 * Forward declarations: the full struct definitions live elsewhere in the
 * block layer; this header only passes pointers to them around.
 */
typedef struct BlockDriver BlockDriver;
typedef struct BdrvChild BdrvChild;
typedef struct BdrvChildRole BdrvChildRole;
|
2007-11-11 03:51:17 +01:00
|
|
|
|
|
|
|
/* Basic image information reported by a block driver for one node. */
typedef struct BlockDriverInfo {
    /* in bytes, 0 if irrelevant */
    int cluster_size;
    /* offset at which the VM state can be saved (0 if not possible) */
    int64_t vm_state_offset;
    /*
     * NOTE(review): presumably true when the image is marked dirty (not
     * cleanly closed / needs a consistency check) — confirm against the
     * drivers that fill this in.
     */
    bool is_dirty;
    /*
     * True if unallocated blocks read back as zeroes. This is equivalent
     * to the LBPRZ flag in the SCSI logical block provisioning page.
     */
    bool unallocated_blocks_are_zero;
    /*
     * True if this block driver only supports compressed writes
     */
    bool needs_compressed_writes;
} BlockDriverInfo;
|
|
|
|
|
2012-03-15 13:13:31 +01:00
|
|
|
/*
 * Fragmentation statistics for an image; all counts are in units of
 * clusters.
 */
typedef struct BlockFragInfo {
    uint64_t allocated_clusters;   /* clusters with data allocated */
    uint64_t total_clusters;       /* total clusters in the image */
    uint64_t fragmented_clusters;  /* allocated clusters that are non-contiguous */
    uint64_t compressed_clusters;  /* clusters stored compressed */
} BlockFragInfo;
|
|
|
|
|
2013-10-24 12:06:50 +02:00
|
|
|
/*
 * Per-request flags for block layer I/O.  A request carries any
 * combination of these bits that its operation type accepts.
 */
typedef enum {
    BDRV_REQ_COPY_ON_READ       = 1 << 0,
    BDRV_REQ_ZERO_WRITE         = 1 << 1,

    /*
     * For write_zeroes requests: the driver may unmap (discard) the
     * blocks, provided they are then guaranteed to read back as zeroes.
     * Only forwarded to the driver when the device was opened with
     * BDRV_O_UNMAP.
     */
    BDRV_REQ_MAY_UNMAP          = 1 << 2,

    /*
     * Valid for reads only: skip wait_serialising_requests() during the
     * read.
     *
     * Used by backup copy-on-write reads triggered from a write notifier,
     * when old data must be read before the write proceeds.  This is safe
     * because the initiating write already waited for other serialising
     * requests (see bdrv_aligned_pwritev), and it is necessary when that
     * write is itself serialising — without the flag the read would
     * deadlock waiting for the serialising write to complete.
     */
    BDRV_REQ_NO_SERIALISING     = 1 << 3,
    BDRV_REQ_FUA                = 1 << 4,
    BDRV_REQ_WRITE_COMPRESSED   = 1 << 5,

    /* This write request will not change the visible disk content. */
    BDRV_REQ_WRITE_UNCHANGED    = 1 << 6,

    /*
     * Force request serialisation for writes.  Ensures that writes to the
     * backing file of a backup process target cannot race with reads of
     * the backup target that defer to that backing file.
     *
     * Note: this is *not* the opposite of BDRV_REQ_NO_SERIALISING; a more
     * descriptive name for the latter would be
     * _DO_NOT_WAIT_FOR_SERIALISING, except that it is too long.
     */
    BDRV_REQ_SERIALISING        = 1 << 7,

    /*
     * Execute the request only if it can be offloaded or otherwise
     * performed efficiently; return an error instead of taking a slow
     * fallback path.
     */
    BDRV_REQ_NO_FALLBACK        = 1 << 8,

    /*
     * Only valid together with BDRV_REQ_COPY_ON_READ on a read request:
     * the caller does not actually need the data, so the qiov parameter
     * may be NULL.
     */
    BDRV_REQ_PREFETCH           = 1 << 9,

    /* Mask of valid flags */
    BDRV_REQ_MASK               = (1 << 10) - 1,
} BdrvRequestFlags;
|
|
|
|
|
2015-02-16 12:47:54 +01:00
|
|
|
/* Physical and logical block sizes of a device, both in bytes. */
typedef struct BlockSizes {
    uint32_t phys;  /* physical block size */
    uint32_t log;   /* logical block size */
} BlockSizes;
|
|
|
|
|
|
|
|
/* Classic hard-disk CHS geometry (cylinders / heads / sectors per track). */
typedef struct HDGeometry {
    uint32_t heads;
    uint32_t sectors;
    uint32_t cylinders;
} HDGeometry;
|
|
|
|
|
2007-11-11 03:51:17 +01:00
|
|
|
/*
 * Open flags (BDRV_O_*), passed via the int "flags" argument of
 * bdrv_open() and related functions.
 * NOTE(review): bits 0x0001 and 0x0040 are unused here — presumably
 * retired flags; the gaps are kept so the remaining values stay stable.
 */
#define BDRV_O_RDWR 0x0002
#define BDRV_O_RESIZE 0x0004 /* request permission for resizing the node */
#define BDRV_O_SNAPSHOT 0x0008 /* open the file read only and save writes in a snapshot */
#define BDRV_O_TEMPORARY 0x0010 /* delete the file after use */
#define BDRV_O_NOCACHE 0x0020 /* do not use the host page cache */
#define BDRV_O_NATIVE_AIO 0x0080 /* use native AIO instead of the thread pool */
#define BDRV_O_NO_BACKING 0x0100 /* don't open the backing file */
#define BDRV_O_NO_FLUSH 0x0200 /* disable flushing on this disk */
#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
#define BDRV_O_INACTIVE 0x0800 /* consistency hint for migration handoff */
#define BDRV_O_CHECK 0x1000 /* open solely for consistency check */
#define BDRV_O_ALLOW_RDWR 0x2000 /* allow reopen to change from r/o to r/w */
#define BDRV_O_UNMAP 0x4000 /* execute guest UNMAP/TRIM operations */
#define BDRV_O_PROTOCOL 0x8000 /* if no block driver is explicitly given:
                                  select an appropriate protocol driver,
                                  ignoring the format layer */
#define BDRV_O_NO_IO 0x10000 /* don't initialize for I/O */
#define BDRV_O_AUTO_RDONLY 0x20000 /* degrade to read-only if opening read-write fails */

/* The subset of BDRV_O_* bits that describe host-cache behaviour. */
#define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
|
2007-11-11 03:51:17 +01:00
|
|
|
|
2015-04-07 16:55:00 +02:00
|
|
|
|
|
|
|
/*
 * Option names of options parsed by the block layer.
 * These are the string keys used in the QDict-based options of
 * bdrv_open() and friends; the string values must stay stable since they
 * form part of the external configuration interface.
 */

#define BDRV_OPT_CACHE_WB "cache.writeback"
#define BDRV_OPT_CACHE_DIRECT "cache.direct"
#define BDRV_OPT_CACHE_NO_FLUSH "cache.no-flush"
#define BDRV_OPT_READ_ONLY "read-only"
#define BDRV_OPT_AUTO_READ_ONLY "auto-read-only"
#define BDRV_OPT_DISCARD "discard"
#define BDRV_OPT_FORCE_SHARE "force-share"
|
2015-04-07 16:55:00 +02:00
|
|
|
|
|
|
|
|
2009-11-30 18:21:19 +01:00
|
|
|
/* The block layer uses a fixed 512-byte sector size throughout. */
#define BDRV_SECTOR_BITS 9
#define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS)

/*
 * Upper bound on a single request, chosen so that the request size in
 * bytes fits both a size_t and a (signed) int.
 */
#define BDRV_REQUEST_MAX_SECTORS MIN(SIZE_MAX >> BDRV_SECTOR_BITS, \
                                     INT_MAX >> BDRV_SECTOR_BITS)
#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
|
2015-02-06 11:54:11 +01:00
|
|
|
|
2014-11-10 10:10:38 +01:00
|
|
|
/*
 * Allocation status flags for bdrv_block_status() and friends.
 *
 * Public flags:
 * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer
 * BDRV_BLOCK_ZERO: offset reads as zero
 * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
 * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
 *                       layer rather than any backing, set by block layer
 * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this
 *                 layer, set by block layer
 *
 * Internal flags:
 * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request
 *                 that the block layer recompute the answer from the returned
 *                 BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID.
 * BDRV_BLOCK_RECURSE: request that the block layer will recursively search for
 *                     zeroes in file child of current block node inside
 *                     returned region. Only valid together with both
 *                     BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID. Should not
 *                     appear with BDRV_BLOCK_ZERO.
 *
 * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the
 * host offset within the returned BDS that is allocated for the
 * corresponding raw guest data. However, whether that offset
 * actually contains data also depends on BDRV_BLOCK_DATA, as follows:
 *
 * DATA ZERO OFFSET_VALID
 *  t    t       t       sectors read as zero, returned file is zero at offset
 *  t    f       t       sectors read as valid from file at offset
 *  f    t       t       sectors preallocated, read as zero, returned file not
 *                       necessarily zero at offset
 *  f    f       t       sectors preallocated but read from backing_hd,
 *                       returned file contains garbage at offset
 *  t    t       f       sectors preallocated, read as zero, unknown offset
 *  t    f       f       sectors read from unknown file or offset
 *  f    t       f       not allocated or unknown offset, read as zero
 *  f    f       f       not allocated or unknown offset, read from backing_hd
 */
#define BDRV_BLOCK_DATA 0x01
#define BDRV_BLOCK_ZERO 0x02
#define BDRV_BLOCK_OFFSET_VALID 0x04
#define BDRV_BLOCK_RAW 0x08
#define BDRV_BLOCK_ALLOCATED 0x10
#define BDRV_BLOCK_EOF 0x20
#define BDRV_BLOCK_RECURSE 0x40
|
2013-09-04 19:00:29 +02:00
|
|
|
|
2019-09-27 14:23:47 +02:00
|
|
|
typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;

/*
 * Per-node state carried through a reopen transaction
 * (bdrv_reopen_prepare() / bdrv_reopen_commit() / bdrv_reopen_abort()).
 */
typedef struct BDRVReopenState {
    BlockDriverState *bs;   /* the node being reopened */
    /* NOTE(review): presumably the new BDRV_O_* open flags — confirm */
    int flags;
    BlockdevDetectZeroesOptions detect_zeroes;
    bool backing_missing;
    bool replace_backing_bs; /* new_backing_bs is ignored if this is false */
    BlockDriverState *new_backing_bs; /* If NULL then detach the current bs */
    uint64_t perm, shared_perm;
    QDict *options;
    QDict *explicit_options;
    /* NOTE(review): presumably driver-private data for the reopen — confirm */
    void *opaque;
} BDRVReopenState;
|
|
|
|
|
2014-05-23 15:29:41 +02:00
|
|
|
/*
 * Block operation types
 *
 * NOTE(review): presumably used as categories for per-node operation
 * blockers; BLOCK_OP_TYPE_MAX is the sentinel for array sizing — confirm
 * against the users in block.c.
 */
typedef enum BlockOpType {
    BLOCK_OP_TYPE_BACKUP_SOURCE,
    BLOCK_OP_TYPE_BACKUP_TARGET,
    BLOCK_OP_TYPE_CHANGE,
    BLOCK_OP_TYPE_COMMIT_SOURCE,
    BLOCK_OP_TYPE_COMMIT_TARGET,
    BLOCK_OP_TYPE_DATAPLANE,
    BLOCK_OP_TYPE_DRIVE_DEL,
    BLOCK_OP_TYPE_EJECT,
    BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE,
    BLOCK_OP_TYPE_MIRROR_SOURCE,
    BLOCK_OP_TYPE_MIRROR_TARGET,
    BLOCK_OP_TYPE_RESIZE,
    BLOCK_OP_TYPE_STREAM,
    BLOCK_OP_TYPE_REPLACE,
    BLOCK_OP_TYPE_MAX,
} BlockOpType;
|
2012-09-20 21:13:19 +02:00
|
|
|
|
2016-12-20 16:52:41 +01:00
|
|
|
/* Block node permission constants */
enum {
    /**
     * Grants its holder a complete and self-consistent view of the block
     * device contents, representing the disk at one specific point in
     * time.
     *
     * This holds for most block devices (including their backing files),
     * but the property cannot be maintained in a few situations, such as
     * for intermediate nodes of a commit block job.
     */
    BLK_PERM_CONSISTENT_READ = 1 << 0,

    /** Required to change the visible disk contents. */
    BLK_PERM_WRITE           = 1 << 1,

    /**
     * A weaker form of BLK_PERM_WRITE: both sufficient and required for
     * writes that the caller promises leave the visible disk content
     * unchanged.
     *
     * Since BLK_PERM_WRITE is strictly stronger, holding either one is
     * enough to perform an unchanging write.
     */
    BLK_PERM_WRITE_UNCHANGED = 1 << 2,

    /** Required to change the size of a block node. */
    BLK_PERM_RESIZE          = 1 << 3,

    /** Required to change the node a given BdrvChild points to. */
    BLK_PERM_GRAPH_MOD       = 1 << 4,

    /* Every permission bit defined above. */
    BLK_PERM_ALL             = BLK_PERM_CONSISTENT_READ
                               | BLK_PERM_WRITE
                               | BLK_PERM_WRITE_UNCHANGED
                               | BLK_PERM_RESIZE
                               | BLK_PERM_GRAPH_MOD,

    DEFAULT_PERM_PASSTHROUGH = BLK_PERM_CONSISTENT_READ
                               | BLK_PERM_WRITE
                               | BLK_PERM_WRITE_UNCHANGED
                               | BLK_PERM_RESIZE,

    DEFAULT_PERM_UNCHANGED   = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH,
};
|
|
|
|
|
2017-05-02 18:35:36 +02:00
|
|
|
/*
 * Render a BLK_PERM_* bitmask as a human-readable string.
 * NOTE(review): returns an allocated string — presumably the caller must
 * free it; confirm ownership in the implementation.
 */
char *bdrv_perm_names(uint64_t perm);
/* Map a QAPI BlockPermission value to the corresponding BLK_PERM_* bit. */
uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm);
|
2017-05-02 18:35:36 +02:00
|
|
|
|
2011-11-03 09:57:25 +01:00
|
|
|
/* disk I/O throttling */
/*
 * NOTE(review): the "disk I/O throttling" marker above looks stale — the
 * declarations that follow are general block-layer entry points.
 */

/* Global initialisation of the block layer and its driver registry. */
void bdrv_init(void);
void bdrv_init_with_whitelist(void);
bool bdrv_uses_whitelist(void);
int bdrv_is_whitelisted(BlockDriver *drv, bool read_only);

/* Driver lookup */
BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix,
                                Error **errp);
BlockDriver *bdrv_find_format(const char *format_name);

/* Image creation */
int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp);
int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp);

/* Node creation and graph manipulation */
BlockDriverState *bdrv_new(void);
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
                 Error **errp);
void bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
                       Error **errp);

/*
 * Option-string parsing helpers.
 * NOTE(review): presumably return 0 on success / negative errno on
 * failure, per block-layer convention — confirm in the implementation.
 */
int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough);
int bdrv_parse_discard_flags(const char *mode, int *flags);

/* Opening nodes, children, and backing files */
BdrvChild *bdrv_open_child(const char *filename,
                           QDict *options, const char *bdref_key,
                           BlockDriverState* parent,
                           const BdrvChildRole *child_role,
                           bool allow_none, Error **errp);
BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp);
void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
                         Error **errp);
int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
                           const char *bdref_key, Error **errp);
BlockDriverState *bdrv_open(const char *filename, const char *reference,
                            QDict *options, int flags, Error **errp);
BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name,
                                       int flags, Error **errp);

/* Reopen transaction: queue nodes, then prepare/commit/abort. */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, QDict *options,
                                    bool keep_old_opts);
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp);
int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
                              Error **errp);
int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
                        BlockReopenQueue *queue, Error **errp);
void bdrv_reopen_commit(BDRVReopenState *reopen_state);
void bdrv_reopen_abort(BDRVReopenState *reopen_state);

/* Byte-granularity synchronous I/O on a child node */
int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags);
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags);
int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes);
int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov);
int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes);
int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov);
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count);
|
2012-02-07 14:27:25 +01:00
|
|
|
/*
|
|
|
|
* Efficiently zero a region of the disk image. Note that this is a regular
|
|
|
|
* I/O request like read or write and should have a reasonable size. This
|
|
|
|
* function is not suitable for zeroing the entire image in a single request
|
|
|
|
* because it may allocate memory for the entire region.
|
|
|
|
*/
|
2016-06-20 21:31:46 +02:00
|
|
|
int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
|
2017-06-09 12:18:08 +02:00
|
|
|
int bytes, BdrvRequestFlags flags);
|
2012-01-18 15:40:51 +01:00
|
|
|
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
|
|
|
|
const char *backing_file);
|
2014-07-18 20:24:56 +02:00
|
|
|
void bdrv_refresh_filename(BlockDriverState *bs);
|
block: Convert .bdrv_truncate callback to coroutine_fn
bdrv_truncate() is an operation that can block (even for a quite long
time, depending on the PreallocMode) in I/O paths that shouldn't block.
Convert it to a coroutine_fn so that we have the infrastructure for
drivers to make their .bdrv_co_truncate implementation asynchronous.
This change could potentially introduce new race conditions because
bdrv_truncate() isn't necessarily executed atomically any more. Whether
this is a problem needs to be evaluated for each block driver that
supports truncate:
* file-posix/win32, gluster, iscsi, nfs, rbd, ssh, sheepdog: The
protocol drivers are trivially safe because they don't actually yield
yet, so there is no change in behaviour.
* copy-on-read, crypto, raw-format: Essentially just filter drivers that
pass the request to a child node, no problem.
* qcow2: The implementation modifies metadata, so it needs to hold
s->lock to be safe with concurrent I/O requests. In order to avoid
double locking, this requires pulling the locking out into
preallocate_co() and using qcow2_write_caches() instead of
bdrv_flush().
* qed: Does a single header update, this is fine without locking.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2018-06-21 17:54:35 +02:00
|
|
|
|
2019-09-18 11:51:40 +02:00
|
|
|
int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
|
block: Convert .bdrv_truncate callback to coroutine_fn
bdrv_truncate() is an operation that can block (even for a quite long
time, depending on the PreallocMode) in I/O paths that shouldn't block.
Convert it to a coroutine_fn so that we have the infrastructure for
drivers to make their .bdrv_co_truncate implementation asynchronous.
This change could potentially introduce new race conditions because
bdrv_truncate() isn't necessarily executed atomically any more. Whether
this is a problem needs to be evaluated for each block driver that
supports truncate:
* file-posix/win32, gluster, iscsi, nfs, rbd, ssh, sheepdog: The
protocol drivers are trivially safe because they don't actually yield
yet, so there is no change in behaviour.
* copy-on-read, crypto, raw-format: Essentially just filter drivers that
pass the request to a child node, no problem.
* qcow2: The implementation modifies metadata, so it needs to hold
s->lock to be safe with concurrent I/O requests. In order to avoid
double locking, this requires pulling the locking out into
preallocate_co() and using qcow2_write_caches() instead of
bdrv_flush().
* qed: Does a single header update, this is fine without locking.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2018-06-21 17:54:35 +02:00
|
|
|
PreallocMode prealloc, Error **errp);
|
2019-09-18 11:51:40 +02:00
|
|
|
int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
|
|
|
|
PreallocMode prealloc, Error **errp);
|
block: Convert .bdrv_truncate callback to coroutine_fn
bdrv_truncate() is an operation that can block (even for a quite long
time, depending on the PreallocMode) in I/O paths that shouldn't block.
Convert it to a coroutine_fn so that we have the infrastructure for
drivers to make their .bdrv_co_truncate implementation asynchronous.
This change could potentially introduce new race conditions because
bdrv_truncate() isn't necessarily executed atomically any more. Whether
this is a problem needs to be evaluated for each block driver that
supports truncate:
* file-posix/win32, gluster, iscsi, nfs, rbd, ssh, sheepdog: The
protocol drivers are trivially safe because they don't actually yield
yet, so there is no change in behaviour.
* copy-on-read, crypto, raw-format: Essentially just filter drivers that
pass the request to a child node, no problem.
* qcow2: The implementation modifies metadata, so it needs to hold
s->lock to be safe with concurrent I/O requests. In order to avoid
double locking, this requires pulling the locking out into
preallocate_co() and using qcow2_write_caches() instead of
bdrv_flush().
* qed: Does a single header update, this is fine without locking.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2018-06-21 17:54:35 +02:00
|
|
|
|
2014-06-26 13:23:17 +02:00
|
|
|
int64_t bdrv_nb_sectors(BlockDriverState *bs);
|
2007-11-11 03:51:17 +01:00
|
|
|
int64_t bdrv_getlength(BlockDriverState *bs);
|
2011-07-12 13:56:39 +02:00
|
|
|
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
|
2017-07-05 14:57:30 +02:00
|
|
|
BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts,
|
|
|
|
BlockDriverState *in_bs, Error **errp);
|
2007-12-17 02:35:20 +01:00
|
|
|
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr);
|
2014-07-16 17:48:16 +02:00
|
|
|
void bdrv_refresh_limits(BlockDriverState *bs, Error **errp);
|
2007-11-11 03:51:17 +01:00
|
|
|
int bdrv_commit(BlockDriverState *bs);
|
2010-01-12 12:55:17 +01:00
|
|
|
int bdrv_change_backing_file(BlockDriverState *bs,
|
|
|
|
const char *backing_file, const char *backing_fmt);
|
2009-05-10 00:03:42 +02:00
|
|
|
void bdrv_register(BlockDriver *bdrv);
|
2017-06-27 20:36:18 +02:00
|
|
|
int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
|
block: extend block-commit to accept a string for the backing file
On some image chains, QEMU may not always be able to resolve the
filenames properly, when updating the backing file of an image
after a block commit.
For instance, certain relative pathnames may fail, or drives may
have been specified originally by file descriptor (e.g. /dev/fd/???),
or a relative protocol pathname may have been used.
In these instances, QEMU may lack the information to be able to make
the correct choice, but the user or management layer most likely does
have that knowledge.
With this extension to the block-commit api, the user is able to change
the backing file of the overlay image as part of the block-commit
operation.
This allows the change to be 'safe', in the sense that if the attempt
to write the overlay image metadata fails, then the block-commit
operation returns failure, without disrupting the guest.
If the commit top is the active layer, then specifying the backing
file string will be treated as an error (there is no overlay image
to modify in that case).
If a backing file string is not specified in the command, the backing
file string to use is determined in the same manner as it was
previously.
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-06-25 21:40:10 +02:00
|
|
|
const char *backing_file_str);
|
2012-09-27 19:29:12 +02:00
|
|
|
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
|
|
|
|
BlockDriverState *bs);
|
2012-09-27 19:29:15 +02:00
|
|
|
BlockDriverState *bdrv_find_base(BlockDriverState *bs);
|
2019-03-12 17:48:40 +01:00
|
|
|
bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
|
|
|
|
Error **errp);
|
|
|
|
int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base,
|
|
|
|
Error **errp);
|
|
|
|
void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base);
|
2009-05-10 00:03:42 +02:00
|
|
|
|
2010-06-29 11:43:13 +02:00
|
|
|
|
|
|
|
/*
 * Result of an image-consistency check (bdrv_check()).
 *
 * The *_fixed counters only grow when the check was invoked with a
 * repairing BdrvCheckMode (BDRV_FIX_LEAKS / BDRV_FIX_ERRORS).
 */
typedef struct BdrvCheckResult {
    int corruptions;            /* corruptions detected */
    int leaks;                  /* leaked clusters detected */
    int check_errors;           /* internal errors while checking */
    int corruptions_fixed;      /* corruptions repaired during this run */
    int leaks_fixed;            /* leaks repaired during this run */
    int64_t image_end_offset;   /* offset (bytes) of the end of the image data */
    BlockFragInfo bfi;          /* allocation/fragmentation statistics */
} BdrvCheckResult;
|
|
|
|
|
2012-05-11 16:07:02 +02:00
|
|
|
/*
 * Flags selecting how bdrv_check() behaves; combinable as a bitmask
 * (values are distinct bits).
 */
typedef enum {
    BDRV_FIX_LEAKS    = 1,  /* repair leaked clusters, not just report them */
    BDRV_FIX_ERRORS   = 2,  /* repair corruptions, not just report them */
} BdrvCheckMode;
|
|
|
|
|
|
|
|
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix);
|
2010-06-29 11:43:13 +02:00
|
|
|
|
2014-10-27 11:12:50 +01:00
|
|
|
/* The units of offset and total_work_size may be chosen arbitrarily by the
|
|
|
|
* block driver; total_work_size may change during the course of the amendment
|
|
|
|
* operation */
|
|
|
|
typedef void BlockDriverAmendStatusCB(BlockDriverState *bs, int64_t offset,
|
2015-07-27 17:51:32 +02:00
|
|
|
int64_t total_work_size, void *opaque);
|
2014-10-27 11:12:50 +01:00
|
|
|
int bdrv_amend_options(BlockDriverState *bs_new, QemuOpts *opts,
|
2018-05-09 23:00:18 +02:00
|
|
|
BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
|
|
|
|
Error **errp);
|
2013-09-03 10:09:50 +02:00
|
|
|
|
2013-10-02 14:33:48 +02:00
|
|
|
/* external snapshots */
|
2014-01-23 21:31:36 +01:00
|
|
|
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
|
|
|
|
BlockDriverState *candidate);
|
|
|
|
bool bdrv_is_first_non_filter(BlockDriverState *candidate);
|
2013-10-02 14:33:48 +02:00
|
|
|
|
2014-06-27 18:25:25 +02:00
|
|
|
/* check if a named node can be replaced when doing drive-mirror */
|
2015-07-17 04:12:22 +02:00
|
|
|
BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
|
|
|
|
const char *node_name, Error **errp);
|
2014-06-27 18:25:25 +02:00
|
|
|
|
2007-11-11 03:51:17 +01:00
|
|
|
/* async block I/O */
|
2014-10-07 13:59:14 +02:00
|
|
|
void bdrv_aio_cancel(BlockAIOCB *acb);
|
|
|
|
void bdrv_aio_cancel_async(BlockAIOCB *acb);
|
2007-11-11 03:51:17 +01:00
|
|
|
|
2009-03-12 20:57:08 +01:00
|
|
|
/* sg packet commands */
|
2016-10-20 12:56:14 +02:00
|
|
|
int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
|
2009-03-12 20:57:08 +01:00
|
|
|
|
2011-11-14 22:09:45 +01:00
|
|
|
/* Invalidate any cached metadata used by image formats */
|
2014-03-12 15:59:16 +01:00
|
|
|
void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp);
|
|
|
|
void bdrv_invalidate_cache_all(Error **errp);
|
2015-12-22 14:07:08 +01:00
|
|
|
int bdrv_inactivate_all(void);
|
2011-11-14 22:09:45 +01:00
|
|
|
|
2007-11-11 03:51:17 +01:00
|
|
|
/* Ensure contents are flushed to disk. */
|
2010-10-21 16:43:43 +02:00
|
|
|
int bdrv_flush(BlockDriverState *bs);
|
2011-10-17 12:32:12 +02:00
|
|
|
int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
|
2016-09-23 03:45:50 +02:00
|
|
|
int bdrv_flush_all(void);
|
2010-05-28 04:44:57 +02:00
|
|
|
void bdrv_close_all(void);
|
2014-10-21 13:03:55 +02:00
|
|
|
void bdrv_drain(BlockDriverState *bs);
|
2016-04-05 13:20:52 +02:00
|
|
|
void coroutine_fn bdrv_co_drain(BlockDriverState *bs);
|
2016-10-28 09:08:02 +02:00
|
|
|
void bdrv_drain_all_begin(void);
|
|
|
|
void bdrv_drain_all_end(void);
|
2011-11-30 13:23:43 +01:00
|
|
|
void bdrv_drain_all(void);
|
2008-10-06 15:55:43 +02:00
|
|
|
|
2016-10-27 12:48:55 +02:00
|
|
|
/*
 * BDRV_POLL_WHILE:
 * Poll the AioContext of @bs until @cond becomes false.
 *
 * Evaluates @bs exactly once (cached in bs_) and delegates to
 * AIO_WAIT_WHILE() with the node's current AioContext; @cond is
 * re-evaluated on each poll iteration by that macro.
 */
#define BDRV_POLL_WHILE(bs, cond) ({                       \
    BlockDriverState *bs_ = (bs);                          \
    AIO_WAIT_WHILE(bdrv_get_aio_context(bs_),              \
                   cond); })
|
2016-10-27 12:48:55 +02:00
|
|
|
|
2019-04-23 14:57:05 +02:00
|
|
|
int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
|
|
|
|
int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
|
2013-06-28 12:47:42 +02:00
|
|
|
int bdrv_has_zero_init_1(BlockDriverState *bs);
|
2010-04-14 17:30:35 +02:00
|
|
|
int bdrv_has_zero_init(BlockDriverState *bs);
|
2019-07-24 19:12:31 +02:00
|
|
|
int bdrv_has_zero_init_truncate(BlockDriverState *bs);
|
2013-10-24 12:06:54 +02:00
|
|
|
bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs);
|
|
|
|
bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
|
block: Convert bdrv_get_block_status() to bytes
We are gradually moving away from sector-based interfaces, towards
byte-based. In the common case, allocation is unlikely to ever use
values that are not naturally sector-aligned, but it is possible
that byte-based values will let us be more precise about allocation
at the end of an unaligned file that can do byte-based access.
Changing the name of the function from bdrv_get_block_status() to
bdrv_block_status() ensures that the compiler enforces that all
callers are updated. For now, the io.c layer still assert()s that
all callers are sector-aligned, but that can be relaxed when a later
patch implements byte-based block status in the drivers.
There was an inherent limitation in returning the offset via the
return value: we only have room for BDRV_BLOCK_OFFSET_MASK bits, which
means an offset can only be mapped for sector-aligned queries (or,
if we declare that non-aligned input is at the same relative position
modulo 512 of the answer), so the new interface also changes things to
return the offset via output through a parameter by reference rather
than mashed into the return value. We'll have some glue code that
munges between the two styles until we finish converting all uses.
For the most part this patch is just the addition of scaling at the
callers followed by inverse scaling at bdrv_block_status(), coupled
with the tweak in calling convention. But some code, particularly
bdrv_is_allocated(), gets a lot simpler because it no longer has to
mess with sectors.
For ease of review, bdrv_get_block_status_above() will be tackled
separately.
Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2017-10-12 05:47:03 +02:00
|
|
|
int bdrv_block_status(BlockDriverState *bs, int64_t offset,
|
|
|
|
int64_t bytes, int64_t *pnum, int64_t *map,
|
|
|
|
BlockDriverState **file);
|
block: Convert bdrv_get_block_status_above() to bytes
We are gradually moving away from sector-based interfaces, towards
byte-based. In the common case, allocation is unlikely to ever use
values that are not naturally sector-aligned, but it is possible
that byte-based values will let us be more precise about allocation
at the end of an unaligned file that can do byte-based access.
Changing the name of the function from bdrv_get_block_status_above()
to bdrv_block_status_above() ensures that the compiler enforces that
all callers are updated. Likewise, since it a byte interface allows
an offset mapping that might not be sector aligned, split the mapping
out of the return value and into a pass-by-reference parameter. For
now, the io.c layer still assert()s that all uses are sector-aligned,
but that can be relaxed when a later patch implements byte-based
block status in the drivers.
For the most part this patch is just the addition of scaling at the
callers followed by inverse scaling at bdrv_block_status(), plus
updates for the new split return interface. But some code,
particularly bdrv_block_status(), gets a lot simpler because it no
longer has to mess with sectors. Likewise, mirror code no longer
computes s->granularity >> BDRV_SECTOR_BITS, and can therefore drop
an assertion about alignment because the loop no longer depends on
alignment (never mind that we don't really have a driver that
reports sub-sector alignments, so it's not really possible to test
the effect of sub-sector mirroring). Fix a neighboring assertion to
use is_power_of_2 while there.
For ease of review, bdrv_get_block_status() was tackled separately.
Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2017-10-12 05:47:08 +02:00
|
|
|
int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
|
|
|
|
int64_t offset, int64_t bytes, int64_t *pnum,
|
|
|
|
int64_t *map, BlockDriverState **file);
|
block: Make bdrv_is_allocated() byte-based
We are gradually moving away from sector-based interfaces, towards
byte-based. In the common case, allocation is unlikely to ever use
values that are not naturally sector-aligned, but it is possible
that byte-based values will let us be more precise about allocation
at the end of an unaligned file that can do byte-based access.
Changing the signature of the function to use int64_t *pnum ensures
that the compiler enforces that all callers are updated. For now,
the io.c layer still assert()s that all callers are sector-aligned
on input and that *pnum is sector-aligned on return to the caller,
but that can be relaxed when a later patch implements byte-based
block status. Therefore, this code adds usages like
DIV_ROUND_UP(,BDRV_SECTOR_SIZE) to callers that still want aligned
values, where the call might reasonbly give non-aligned results
in the future; on the other hand, no rounding is needed for callers
that should just continue to work with byte alignment.
For the most part this patch is just the addition of scaling at the
callers followed by inverse scaling at bdrv_is_allocated(). But
some code, particularly bdrv_commit(), gets a lot simpler because it
no longer has to mess with sectors; also, it is now possible to pass
NULL if the caller does not care how much of the image is allocated
beyond the initial offset. Leave comments where we can further
simplify once a later patch eliminates the need for sector-aligned
requests through bdrv_is_allocated().
For ease of review, bdrv_is_allocated_above() will be tackled
separately.
Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2017-07-07 14:44:57 +02:00
|
|
|
int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
|
|
|
|
int64_t *pnum);
|
2013-02-13 09:09:39 +01:00
|
|
|
int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
|
2019-05-29 19:56:14 +02:00
|
|
|
bool include_base, int64_t offset, int64_t bytes,
|
|
|
|
int64_t *pnum);
|
2007-11-11 03:51:17 +01:00
|
|
|
|
2016-06-24 00:37:26 +02:00
|
|
|
bool bdrv_is_read_only(BlockDriverState *bs);
|
2017-08-03 17:02:58 +02:00
|
|
|
int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
|
|
|
|
bool ignore_allow_rdw, Error **errp);
|
2018-10-12 11:27:41 +02:00
|
|
|
int bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg,
|
|
|
|
Error **errp);
|
2018-06-06 21:37:00 +02:00
|
|
|
bool bdrv_is_writable(BlockDriverState *bs);
|
2016-06-24 00:37:26 +02:00
|
|
|
bool bdrv_is_sg(BlockDriverState *bs);
|
2015-10-19 17:53:11 +02:00
|
|
|
bool bdrv_is_inserted(BlockDriverState *bs);
|
2011-09-06 18:58:47 +02:00
|
|
|
void bdrv_lock_medium(BlockDriverState *bs, bool locked);
|
2012-02-03 19:24:53 +01:00
|
|
|
void bdrv_eject(BlockDriverState *bs, bool eject_flag);
|
2012-06-13 10:11:48 +02:00
|
|
|
const char *bdrv_get_format_name(BlockDriverState *bs);
|
2014-01-23 21:31:32 +01:00
|
|
|
BlockDriverState *bdrv_find_node(const char *node_name);
|
2015-04-17 13:52:43 +02:00
|
|
|
BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp);
|
2018-12-21 18:09:07 +01:00
|
|
|
XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp);
|
2014-01-23 21:31:35 +01:00
|
|
|
BlockDriverState *bdrv_lookup_bs(const char *device,
|
|
|
|
const char *node_name,
|
|
|
|
Error **errp);
|
2014-06-25 21:40:09 +02:00
|
|
|
bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base);
|
2014-10-31 04:32:54 +01:00
|
|
|
BlockDriverState *bdrv_next_node(BlockDriverState *bs);
|
2018-03-28 18:29:18 +02:00
|
|
|
BlockDriverState *bdrv_next_all_states(BlockDriverState *bs);
|
2016-05-20 18:49:07 +02:00
|
|
|
|
|
|
|
/*
 * Iterator state for walking all BlockDriverStates via
 * bdrv_first()/bdrv_next() (see declarations below this struct).
 *
 * Iteration proceeds in two phases: first the root nodes of all
 * BlockBackends, then the monitor-owned nodes that are not reachable
 * through a BlockBackend.
 */
typedef struct BdrvNextIterator {
    enum {
        BDRV_NEXT_BACKEND_ROOTS,   /* walking BlockBackend root nodes */
        BDRV_NEXT_MONITOR_OWNED,   /* walking monitor-owned nodes */
    } phase;
    BlockBackend *blk;             /* current backend in the first phase */
    BlockDriverState *bs;          /* current node position */
} BdrvNextIterator;
|
|
|
|
|
|
|
|
BlockDriverState *bdrv_first(BdrvNextIterator *it);
|
|
|
|
BlockDriverState *bdrv_next(BdrvNextIterator *it);
|
2017-11-10 18:25:45 +01:00
|
|
|
void bdrv_next_cleanup(BdrvNextIterator *it);
|
2016-05-20 18:49:07 +02:00
|
|
|
|
2016-03-16 19:54:41 +01:00
|
|
|
BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs);
|
2016-06-24 00:37:26 +02:00
|
|
|
bool bdrv_is_encrypted(BlockDriverState *bs);
|
2007-11-11 03:51:17 +01:00
|
|
|
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
|
2019-03-07 14:33:58 +01:00
|
|
|
void *opaque, bool read_only);
|
2014-10-31 04:32:55 +01:00
|
|
|
const char *bdrv_get_node_name(const BlockDriverState *bs);
|
2014-10-07 13:59:11 +02:00
|
|
|
const char *bdrv_get_device_name(const BlockDriverState *bs);
|
2015-04-08 11:29:18 +02:00
|
|
|
const char *bdrv_get_device_or_node_name(const BlockDriverState *bs);
|
2012-06-05 16:49:24 +02:00
|
|
|
int bdrv_get_flags(BlockDriverState *bs);
|
2007-11-11 03:51:17 +01:00
|
|
|
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
|
2019-02-08 16:06:06 +01:00
|
|
|
ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs,
|
|
|
|
Error **errp);
|
2019-09-23 14:17:37 +02:00
|
|
|
BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs);
|
2013-01-21 17:09:42 +01:00
|
|
|
void bdrv_round_to_clusters(BlockDriverState *bs,
|
2017-10-12 05:46:59 +02:00
|
|
|
int64_t offset, int64_t bytes,
|
2016-06-02 11:41:52 +02:00
|
|
|
int64_t *cluster_offset,
|
2017-10-12 05:46:59 +02:00
|
|
|
int64_t *cluster_bytes);
|
2007-11-11 03:51:17 +01:00
|
|
|
|
|
|
|
void bdrv_get_backing_filename(BlockDriverState *bs,
|
|
|
|
char *filename, int filename_size);
|
2019-02-01 20:29:15 +01:00
|
|
|
char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp);
|
2019-02-01 20:29:14 +01:00
|
|
|
char *bdrv_get_full_backing_filename_from_filename(const char *backed,
|
|
|
|
const char *backing,
|
|
|
|
Error **errp);
|
2019-02-01 20:29:18 +01:00
|
|
|
char *bdrv_dirname(BlockDriverState *bs, Error **errp);
|
2007-11-11 03:51:17 +01:00
|
|
|
|
2014-12-03 14:57:22 +01:00
|
|
|
int path_has_protocol(const char *path);
|
2007-11-11 03:51:17 +01:00
|
|
|
int path_is_absolute(const char *path);
|
2019-02-01 20:29:13 +01:00
|
|
|
char *path_combine(const char *base_path, const char *filename);
|
2007-11-11 03:51:17 +01:00
|
|
|
|
2016-06-09 16:50:16 +02:00
|
|
|
int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
|
2013-04-05 21:27:53 +02:00
|
|
|
int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
|
2009-07-10 23:11:57 +02:00
|
|
|
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
|
|
|
|
int64_t pos, int size);
|
2009-04-05 21:10:55 +02:00
|
|
|
|
2009-07-10 23:11:57 +02:00
|
|
|
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
|
|
|
|
int64_t pos, int size);
|
2009-04-05 21:10:55 +02:00
|
|
|
|
2012-11-30 13:52:09 +01:00
|
|
|
void bdrv_img_create(const char *filename, const char *fmt,
|
|
|
|
const char *base_filename, const char *base_fmt,
|
2013-02-13 09:09:40 +01:00
|
|
|
char *options, uint64_t img_size, int flags,
|
2017-04-21 14:27:01 +02:00
|
|
|
bool quiet, Error **errp);
|
2010-12-16 13:52:15 +01:00
|
|
|
|
2013-11-28 10:23:32 +01:00
|
|
|
/* Returns the alignment in bytes that is required so that no bounce buffer
|
|
|
|
* is required throughout the stack */
|
2015-05-12 16:30:55 +02:00
|
|
|
size_t bdrv_min_mem_align(BlockDriverState *bs);
|
|
|
|
/* Returns optimal alignment in bytes for bounce buffer */
|
2013-11-28 10:23:32 +01:00
|
|
|
size_t bdrv_opt_mem_align(BlockDriverState *bs);
|
2011-08-03 15:08:19 +02:00
|
|
|
void *qemu_blockalign(BlockDriverState *bs, size_t size);
|
2014-10-22 14:09:27 +02:00
|
|
|
void *qemu_blockalign0(BlockDriverState *bs, size_t size);
|
2014-05-20 12:24:05 +02:00
|
|
|
void *qemu_try_blockalign(BlockDriverState *bs, size_t size);
|
2014-10-22 14:09:27 +02:00
|
|
|
void *qemu_try_blockalign0(BlockDriverState *bs, size_t size);
|
2013-01-11 16:41:27 +01:00
|
|
|
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
|
2011-08-03 15:08:19 +02:00
|
|
|
|
2011-11-28 17:08:47 +01:00
|
|
|
void bdrv_enable_copy_on_read(BlockDriverState *bs);
|
|
|
|
void bdrv_disable_copy_on_read(BlockDriverState *bs);
|
|
|
|
|
2013-08-23 03:14:46 +02:00
|
|
|
void bdrv_ref(BlockDriverState *bs);
|
|
|
|
void bdrv_unref(BlockDriverState *bs);
|
2015-06-15 13:51:04 +02:00
|
|
|
void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child);
|
2016-05-10 09:36:38 +02:00
|
|
|
BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
|
|
|
|
BlockDriverState *child_bs,
|
|
|
|
const char *child_name,
|
2016-12-20 22:21:17 +01:00
|
|
|
const BdrvChildRole *child_role,
|
|
|
|
Error **errp);
|
2010-03-15 17:27:00 +01:00
|
|
|
|
2014-05-23 15:29:42 +02:00
|
|
|
bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp);
|
|
|
|
void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason);
|
|
|
|
void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason);
|
|
|
|
void bdrv_op_block_all(BlockDriverState *bs, Error *reason);
|
|
|
|
void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason);
|
|
|
|
bool bdrv_op_blocker_is_empty(BlockDriverState *bs);
|
|
|
|
|
2015-06-16 14:19:22 +02:00
|
|
|
/*
 * BLKDBG_EVENT:
 * Fire a blkdebug event @evt on the node attached through @child.
 *
 * A no-op when @child is NULL, so callers need not check for a missing
 * child themselves. Wrapped in do { } while (0) so it behaves as a
 * single statement (safe in unbraced if/else bodies).
 */
#define BLKDBG_EVENT(child, evt) \
    do { \
        if (child) { \
            bdrv_debug_event(child->bs, evt); \
        } \
    } while (0)
|
|
|
|
|
2015-11-18 09:52:54 +01:00
|
|
|
void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event);
|
2010-03-15 17:27:00 +01:00
|
|
|
|
2012-12-06 14:32:58 +01:00
|
|
|
int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
|
|
|
|
const char *tag);
|
2013-11-20 03:01:54 +01:00
|
|
|
int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag);
|
2012-12-06 14:32:58 +01:00
|
|
|
int bdrv_debug_resume(BlockDriverState *bs, const char *tag);
|
|
|
|
bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag);
|
|
|
|
|
2014-05-15 13:22:05 +02:00
|
|
|
/**
|
|
|
|
* bdrv_get_aio_context:
|
|
|
|
*
|
|
|
|
* Returns: the currently bound #AioContext
|
|
|
|
*/
|
|
|
|
AioContext *bdrv_get_aio_context(BlockDriverState *bs);
|
|
|
|
|
2017-04-10 14:09:25 +02:00
|
|
|
/**
|
|
|
|
* Transfer control to @co in the aio context of @bs
|
|
|
|
*/
|
|
|
|
void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co);
|
|
|
|
|
2019-05-06 19:17:59 +02:00
|
|
|
void bdrv_set_aio_context_ignore(BlockDriverState *bs,
|
|
|
|
AioContext *new_context, GSList **ignore);
|
2019-05-06 19:17:56 +02:00
|
|
|
int bdrv_try_set_aio_context(BlockDriverState *bs, AioContext *ctx,
|
|
|
|
Error **errp);
|
|
|
|
int bdrv_child_try_set_aio_context(BlockDriverState *bs, AioContext *ctx,
|
|
|
|
BdrvChild *ignore_child, Error **errp);
|
|
|
|
bool bdrv_child_can_set_aio_context(BdrvChild *c, AioContext *ctx,
|
|
|
|
GSList **ignore, Error **errp);
|
|
|
|
bool bdrv_can_set_aio_context(BlockDriverState *bs, AioContext *ctx,
|
|
|
|
GSList **ignore, Error **errp);
|
2015-02-16 12:47:54 +01:00
|
|
|
int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz);
|
|
|
|
int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo);
|
2014-05-08 16:34:37 +02:00
|
|
|
|
2014-07-04 12:04:33 +02:00
|
|
|
void bdrv_io_plug(BlockDriverState *bs);
|
|
|
|
void bdrv_io_unplug(BlockDriverState *bs);
|
|
|
|
|

/**
 * bdrv_parent_drained_begin_single:
 *
 * Begin a quiesced section for the parent of @c. If @poll is true, wait for
 * any pending activity to cease.
 */
void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll);

/**
 * bdrv_parent_drained_end_single:
 *
 * End a quiesced section for the parent of @c.
 *
 * This polls @bs's AioContext until all scheduled sub-drained_ends
 * have settled, which may result in graph changes.
 */
void bdrv_parent_drained_end_single(BdrvChild *c);

/**
 * bdrv_drain_poll:
 *
 * Poll for pending requests in @bs, its parents (except for @ignore_parent),
 * and if @recursive is true its children as well (used for subtree drain).
 *
 * If @ignore_bds_parents is true, parents that are BlockDriverStates must
 * ignore the drain request because they will be drained separately (used for
 * drain_all).
 *
 * This is part of bdrv_drained_begin.
 */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents);

/**
 * bdrv_drained_begin:
 *
 * Begin a quiesced section for exclusive access to the BDS, by disabling
 * external request sources including NBD server and device model. Note that
 * this doesn't block timers or coroutines from submitting more requests, which
 * means block_job_pause is still necessary.
 *
 * This function can be recursive.
 */
void bdrv_drained_begin(BlockDriverState *bs);

/**
 * bdrv_do_drained_begin_quiesce:
 *
 * Quiesces a BDS like bdrv_drained_begin(), but does not wait for already
 * running requests to complete.
 */
void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents);

/**
 * Like bdrv_drained_begin, but recursively begins a quiesced section for
 * exclusive access to all child nodes as well.
 */
void bdrv_subtree_drained_begin(BlockDriverState *bs);

/**
 * bdrv_drained_end:
 *
 * End a quiescent section started by bdrv_drained_begin().
 *
 * This polls @bs's AioContext until all scheduled sub-drained_ends
 * have settled. On one hand, that may result in graph changes. On
 * the other, this requires that the caller either runs in the main
 * loop; or that all involved nodes (@bs and all of its parents) are
 * in the caller's AioContext.
 */
void bdrv_drained_end(BlockDriverState *bs);

/**
 * bdrv_drained_end_no_poll:
 *
 * Same as bdrv_drained_end(), but do not poll for the subgraph to
 * actually become unquiesced. Therefore, no graph changes will occur
 * with this function.
 *
 * *drained_end_counter is incremented for every background operation
 * that is scheduled, and will be decremented for every operation once
 * it settles. The caller must poll until it reaches 0. The counter
 * should be accessed using atomic operations only.
 */
void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter);

/**
 * End a quiescent section started by bdrv_subtree_drained_begin().
 */
void bdrv_subtree_drained_end(BlockDriverState *bs);

void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child,
                    Error **errp);
void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp);

bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
                                     uint32_t granularity, Error **errp);

/**
 *
 * bdrv_register_buf/bdrv_unregister_buf:
 *
 * Register/unregister a buffer for I/O. For example, VFIO drivers are
 * interested to know the memory areas that would later be used for I/O, so
 * that they can prepare IOMMU mapping etc., to get better performance.
 */
void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size);
void bdrv_unregister_buf(BlockDriverState *bs, void *host);

/**
 *
 * bdrv_co_copy_range:
 *
 * Do offloaded copy between two children. If the operation is not implemented
 * by the driver, or if the backend storage doesn't support it, a negative
 * error code will be returned.
 *
 * Note: block layer doesn't emulate or fallback to a bounce buffer approach
 * because usually the caller shouldn't attempt offloaded copy any more (e.g.
 * calling copy_file_range(2)) after the first error, thus it should fall back
 * to a read+write path in the caller level.
 *
 * @src: Source child to copy data from
 * @src_offset: offset in @src image to read data
 * @dst: Destination child to copy data to
 * @dst_offset: offset in @dst image to write data
 * @bytes: number of bytes to copy
 * @flags: request flags. Supported flags:
 *         BDRV_REQ_ZERO_WRITE - treat the @src range as zero data and do zero
 *                               write on @dst as if bdrv_co_pwrite_zeroes is
 *                               called. Used to simplify caller code, or
 *                               during BlockDriver.bdrv_co_copy_range_from()
 *                               recursion.
 *         BDRV_REQ_NO_SERIALISING - do not serialize with other overlapping
 *                                   requests currently in flight.
 *
 * Returns: 0 if succeeded; negative error code if failed.
 **/
int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
                                    BdrvChild *dst, uint64_t dst_offset,
                                    uint64_t bytes, BdrvRequestFlags read_flags,
                                    BdrvRequestFlags write_flags);

#endif