diff --git a/block.c b/block.c index 1fbf6b9e69..c69be2cfe3 100644 --- a/block.c +++ b/block.c @@ -1641,6 +1641,20 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, goto open_failed; } + assert(!(bs->supported_read_flags & ~BDRV_REQ_MASK)); + assert(!(bs->supported_write_flags & ~BDRV_REQ_MASK)); + + /* + * Always allow the BDRV_REQ_REGISTERED_BUF optimization hint. This saves + * drivers that pass read/write requests through to a child the trouble of + * declaring support explicitly. + * + * Drivers must not propagate this flag accidentally when they initiate I/O + * to a bounce buffer. That case should be rare though. + */ + bs->supported_read_flags |= BDRV_REQ_REGISTERED_BUF; + bs->supported_write_flags |= BDRV_REQ_REGISTERED_BUF; + ret = refresh_total_sectors(bs, bs->total_sectors); if (ret < 0) { error_setg_errno(errp, -ret, "Could not refresh total sector count"); diff --git a/block/blkverify.c b/block/blkverify.c index 020b1ae7b6..f36fd6aeb2 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -235,8 +235,8 @@ blkverify_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, qemu_iovec_init(&raw_qiov, qiov->niov); qemu_iovec_clone(&raw_qiov, qiov, buf); - ret = blkverify_co_prwv(bs, &r, offset, bytes, qiov, &raw_qiov, flags, - false); + ret = blkverify_co_prwv(bs, &r, offset, bytes, qiov, &raw_qiov, + flags & ~BDRV_REQ_REGISTERED_BUF, false); cmp_offset = qemu_iovec_compare(qiov, &raw_qiov); if (cmp_offset != -1) { diff --git a/block/crypto.c b/block/crypto.c index 7a57774b76..c7365598a7 100644 --- a/block/crypto.c +++ b/block/crypto.c @@ -410,7 +410,6 @@ block_crypto_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, uint64_t sector_size = qcrypto_block_get_sector_size(crypto->block); uint64_t payload_offset = qcrypto_block_get_payload_offset(crypto->block); - assert(!flags); assert(payload_offset < INT64_MAX); assert(QEMU_IS_ALIGNED(offset, sector_size)); assert(QEMU_IS_ALIGNED(bytes, sector_size)); @@ -473,7 +472,8 @@ block_crypto_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, uint64_t sector_size = qcrypto_block_get_sector_size(crypto->block); uint64_t payload_offset = qcrypto_block_get_payload_offset(crypto->block); - assert(!(flags & ~BDRV_REQ_FUA)); + flags &= ~BDRV_REQ_REGISTERED_BUF; + assert(payload_offset < INT64_MAX); assert(QEMU_IS_ALIGNED(offset, sector_size)); assert(QEMU_IS_ALIGNED(bytes, sector_size)); diff --git a/block/file-posix.c b/block/file-posix.c index 23acffb9a4..b9647c5ffc 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2133,7 +2133,6 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { - assert(flags == 0); return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE); } diff --git a/block/gluster.c b/block/gluster.c index bb1144cf6a..7c90f7ba4b 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -1236,7 +1236,6 @@ static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs, QEMUIOVector *qiov, int flags) { - assert(!flags); return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1); } diff --git a/block/io.c b/block/io.c index cca402bf7b..4207648db6 100644 --- a/block/io.c +++ b/block/io.c @@ -1130,8 +1130,7 @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, int ret; bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); - assert(!(flags & ~BDRV_REQ_MASK)); - assert(!(flags & BDRV_REQ_NO_FALLBACK)); + assert(!(flags & ~bs->supported_read_flags)); if (!drv) { return -ENOMEDIUM; @@ -1195,23 +1194,29 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; + bool emulate_fua = false; int64_t sector_num; unsigned int nb_sectors; QEMUIOVector local_qiov; int ret; bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); - assert(!(flags & ~BDRV_REQ_MASK)); - assert(!(flags & BDRV_REQ_NO_FALLBACK)); if (!drv) { return -ENOMEDIUM; } + if ((flags & BDRV_REQ_FUA) && + (~bs->supported_write_flags & BDRV_REQ_FUA)) { + flags &= ~BDRV_REQ_FUA; + emulate_fua = true; + } + + flags &= bs->supported_write_flags; + if (drv->bdrv_co_pwritev_part) { ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset, - flags & bs->supported_write_flags); - flags &= ~bs->supported_write_flags; + flags); goto emulate_flags; } @@ -1221,9 +1226,7 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, } if (drv->bdrv_co_pwritev) { - ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, - flags & bs->supported_write_flags); - flags &= ~bs->supported_write_flags; + ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, flags); goto emulate_flags; } @@ -1233,10 +1236,8 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, .coroutine = qemu_coroutine_self(), }; - acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, - flags & bs->supported_write_flags, + acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, flags, bdrv_co_io_em_complete, &co); - flags &= ~bs->supported_write_flags; if (acb == NULL) { ret = -EIO; } else { @@ -1254,12 +1255,10 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, assert(bytes <= BDRV_REQUEST_MAX_BYTES); assert(drv->bdrv_co_writev); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, - flags & bs->supported_write_flags); - flags &= ~bs->supported_write_flags; + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, flags); emulate_flags: - if (ret == 0 && (flags & BDRV_REQ_FUA)) { + if (ret == 0 && emulate_fua) { ret = bdrv_co_flush(bs); } @@ -1487,11 +1486,14 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), align); - /* TODO: We would need a per-BDS .supported_read_flags and + /* + * TODO: We would need a per-BDS .supported_read_flags and * potential fallback support, if we ever implement any read flags * to pass through to drivers. For now, there aren't any - * passthrough flags. */ - assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH))); + * passthrough flags except the BDRV_REQ_REGISTERED_BUF optimization hint. + */ + assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH | + BDRV_REQ_REGISTERED_BUF))); /* Handle Copy on Read and associated serialisation */ if (flags & BDRV_REQ_COPY_ON_READ) { @@ -1532,7 +1534,7 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, goto out; } - assert(!(flags & ~bs->supported_read_flags)); + assert(!(flags & ~(bs->supported_read_flags | BDRV_REQ_REGISTERED_BUF))); max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); if (bytes <= max_bytes && bytes <= max_transfer) { @@ -1721,7 +1723,8 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad) static int bdrv_pad_request(BlockDriverState *bs, QEMUIOVector **qiov, size_t *qiov_offset, int64_t *offset, int64_t *bytes, - BdrvRequestPadding *pad, bool *padded) + BdrvRequestPadding *pad, bool *padded, + BdrvRequestFlags *flags) { int ret; @@ -1749,6 +1752,10 @@ static int bdrv_pad_request(BlockDriverState *bs, if (padded) { *padded = true; } + if (flags) { + /* Can't use optimization hint with bounce buffer */ + *flags &= ~BDRV_REQ_REGISTERED_BUF; + } return 0; } @@ -1803,7 +1810,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, } ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, - NULL); + NULL, &flags); if (ret < 0) { goto fail; } @@ -1848,6 +1855,11 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, return -ENOTSUP; } + /* By definition there is no user buffer so this flag doesn't make sense */ + if (flags & BDRV_REQ_REGISTERED_BUF) { + return -EINVAL; + } + /* Invalidate the cached block-status data range if this write overlaps */ bdrv_bsc_invalidate_range(bs, offset, bytes); @@ -2133,6 +2145,9 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, bool padding; BdrvRequestPadding pad; + /* This flag doesn't make sense for padding or zero writes */ + flags &= ~BDRV_REQ_REGISTERED_BUF; + padding = bdrv_init_padding(bs, offset, bytes, &pad); if (padding) { assert(!(flags & BDRV_REQ_NO_WAIT)); @@ -2250,7 +2265,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, * alignment only if there is no ZERO flag. */ ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, - &padded); + &padded, &flags); if (ret < 0) { return ret; } diff --git a/block/mirror.c b/block/mirror.c index 80c0109d39..bed089d2e0 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1486,6 +1486,8 @@ static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs, qemu_iovec_init(&bounce_qiov, 1); qemu_iovec_add(&bounce_qiov, bounce_buf, bytes); qiov = &bounce_qiov; + + flags &= ~BDRV_REQ_REGISTERED_BUF; } ret = bdrv_mirror_top_do_write(bs, MIRROR_METHOD_COPY, offset, bytes, qiov, diff --git a/block/nbd.c b/block/nbd.c index 494b9d683e..7d485c86d2 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -1222,7 +1222,6 @@ static int coroutine_fn nbd_client_co_preadv(BlockDriverState *bs, int64_t offse }; assert(bytes <= NBD_MAX_BUFFER_SIZE); - assert(!flags); if (!bytes) { return 0; diff --git a/block/parallels.c b/block/parallels.c index c1523e7dab..dd15a44100 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -329,7 +329,6 @@ static coroutine_fn int parallels_co_writev(BlockDriverState *bs, QEMUIOVector hd_qiov; int ret = 0; - assert(!flags); qemu_iovec_init(&hd_qiov, qiov->niov); while (nb_sectors > 0) { diff --git a/block/qcow.c b/block/qcow.c index 311aaa8705..e9180c7b61 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -628,7 +628,6 @@ static coroutine_fn int qcow_co_preadv(BlockDriverState *bs, int64_t offset, uint8_t *buf; void *orig_buf; - assert(!flags); if (qiov->niov > 1) { buf = orig_buf = qemu_try_blockalign(bs, qiov->size); if (buf == NULL) { @@ -725,7 +724,6 @@ static coroutine_fn int qcow_co_pwritev(BlockDriverState *bs, int64_t offset, uint8_t *buf; void *orig_buf; - assert(!flags); s->cluster_cache_offset = -1; /* disable compressed cache */ /* We must always copy the iov when encrypting, so we diff --git a/block/qed.c b/block/qed.c index bda00e6257..99a9ec9b57 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1395,7 +1395,6 @@ static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags) { - assert(!flags); return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE); } diff --git a/block/raw-format.c b/block/raw-format.c index f337ac7569..c8dc9bc850 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -258,6 +258,8 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset, qemu_iovec_add(&local_qiov, buf, 512); qemu_iovec_concat(&local_qiov, qiov, 512, qiov->size - 512); qiov = &local_qiov; + + flags &= ~BDRV_REQ_REGISTERED_BUF; } ret = raw_adjust_offset(bs, &offset, bytes, true); diff --git a/block/replication.c b/block/replication.c index c67f931f37..13f1d39571 100644 --- a/block/replication.c +++ b/block/replication.c @@ -261,7 +261,6 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs, int ret; int64_t n; - assert(!flags); ret = replication_get_io_status(s); if (ret < 0) { goto out; diff --git a/block/ssh.c b/block/ssh.c index a2dc646536..a3cddc392c 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -1196,7 +1196,6 @@ static coroutine_fn int ssh_co_writev(BlockDriverState *bs, BDRVSSHState *s = bs->opaque; int ret; - assert(!flags); qemu_co_mutex_lock(&s->lock); ret = ssh_write(s, bs, sector_num * BDRV_SECTOR_SIZE, nb_sectors * BDRV_SECTOR_SIZE, qiov); diff --git a/block/vhdx.c b/block/vhdx.c index e10e78ebfd..e2344ee0b7 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1342,7 +1342,6 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, uint64_t bat_prior_offset = 0; bool bat_update = false; - assert(!flags); qemu_iovec_init(&hd_qiov, qiov->niov); qemu_co_mutex_lock(&s->lock); diff --git a/include/block/block-common.h b/include/block/block-common.h index fdb7306e78..061606e867 100644 --- a/include/block/block-common.h +++ b/include/block/block-common.h @@ -80,6 +80,15 @@ typedef enum { */ BDRV_REQ_MAY_UNMAP = 0x4, + /* + * An optimization hint when all QEMUIOVector elements are within + * previously registered bdrv_register_buf() memory ranges. + * + * Code that replaces the user's QEMUIOVector elements with bounce buffers + * must take care to clear this flag. + */ + BDRV_REQ_REGISTERED_BUF = 0x8, + BDRV_REQ_FUA = 0x10, BDRV_REQ_WRITE_COMPRESSED = 0x20,