From 1a62d0accdf85fbeac149018ee8d1728e754de73 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 12:31:59 -0600 Subject: [PATCH 01/25] block: Fragment reads to max transfer length Drivers should be able to rely on the block layer honoring the max transfer length, rather than needing to return -EINVAL (iscsi) or manually fragment things (nbd). This patch adds the fragmentation in the block layer, after requests have been aligned (fragmenting before alignment would lead to multiple unaligned requests, rather than just the head and tail). The return value was previously nebulous on success on whether it was zero or the length read; and fragmenting may introduce yet other non-zero values if we use the last length read. But as at least some callers are sloppy and expect only zero on success, it is easiest to just guarantee 0. [Fix uninitialized ret local variable in bdrv_aligned_preadv(). --Stefan] Signed-off-by: Eric Blake Message-id: 1468607524-19021-2-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/io.c | 57 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/block/io.c b/block/io.c index cfda7148d8..c3574a4dd6 100644 --- a/block/io.c +++ b/block/io.c @@ -971,21 +971,25 @@ err: /* * Forwards an already correctly aligned request to the BlockDriver. This - * handles copy on read and zeroing after EOF; any other features must be - * implemented by the caller. + * handles copy on read, zeroing after EOF, and fragmentation of large + * reads; any other features must be implemented by the caller. */ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, int64_t align, QEMUIOVector *qiov, int flags) { int64_t total_bytes, max_bytes; - int ret; + int ret = 0; + uint64_t bytes_remaining = bytes; + int max_transfer; assert(is_power_of_2(align)); assert((offset & (align - 1)) == 0); assert((bytes & (align - 1)) == 0); assert(!qiov || bytes == qiov->size); assert((bs->open_flags & BDRV_O_NO_IO) == 0); + max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), + align); /* TODO: We would need a per-BDS .supported_read_flags and * potential fallback support, if we ever implement any read flags @@ -1024,7 +1028,7 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, } } - /* Forward the request to the BlockDriver */ + /* Forward the request to the BlockDriver, possibly fragmenting it */ total_bytes = bdrv_getlength(bs); if (total_bytes < 0) { ret = total_bytes; @@ -1032,30 +1036,39 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, } max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); - if (bytes <= max_bytes) { + if (bytes <= max_bytes && bytes <= max_transfer) { ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0); - } else if (max_bytes > 0) { - QEMUIOVector local_qiov; - - qemu_iovec_init(&local_qiov, qiov->niov); - qemu_iovec_concat(&local_qiov, qiov, 0, max_bytes); - - ret = bdrv_driver_preadv(bs, offset, max_bytes, &local_qiov, 0); - - qemu_iovec_destroy(&local_qiov); - } else { - ret = 0; + goto out; } - /* Reading beyond end of file is supposed to produce zeroes */ - if (ret == 0 && total_bytes < offset + bytes) { - uint64_t zero_offset = MAX(0, total_bytes - offset); - uint64_t zero_bytes = offset + bytes - zero_offset; - qemu_iovec_memset(qiov, zero_offset, 0, zero_bytes); + while (bytes_remaining) { + int num; + + if (max_bytes) { + QEMUIOVector local_qiov; + + num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); + assert(num); + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); + + ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, + num, &local_qiov, 0); + max_bytes -= num; + qemu_iovec_destroy(&local_qiov); + } else { + num = bytes_remaining; + ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0, + bytes_remaining); + } + if (ret < 0) { + goto out; + } + bytes_remaining -= num; } out: - return ret; + return ret < 0 ? ret : 0; } /* From 8a39b4d6e2926a536027f7ffbc28a3ed2e5d32b0 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 12:32:00 -0600 Subject: [PATCH 02/25] raw_bsd: Don't advertise flags not supported by protocol layer The raw format layer supports all flags via passthrough - but it only makes sense to pass through flags that the lower layer actually supports. The next patch gives stronger reasoning for why this is correct. At the moment, the raw format layer ignores the max_transfer limit of its protocol layer, and an attempt to do the qemu-io 'w -f 0 40m' to an NBD server that lacks FUA will pass the entire 40m request to the NBD driver, which then fragments the request itself into a 32m write, 8m write, and flush. But once the block layer starts honoring limits and fragmenting packets, the raw driver will hand the NBD driver two separate requests; if both requests have BDRV_REQ_FUA set, then this would result in a 32m write, flush, 8m write, and second flush. By having the raw layer no longer advertise FUA support when the protocol layer lacks it, we are back to a single flush at the block layer for the overall 40m request. Note that 'w -f -z 0 40m' does not currently exhibit the same problem, because there, the fragmentation does not occur until at the NBD layer (the raw layer has .bdrv_co_pwrite_zeroes, and the NBD layer doesn't advertise max_pwrite_zeroes to constrain things at the raw layer) - but the problem is latent and we would again have too many flushes without this patch once the NBD layer implements support for the new NBD_CMD_WRITE_ZEROES command, if it sets max_pwrite_zeroes to the same 32m limit as recommended by the NBD protocol. Signed-off-by: Eric Blake Reviewed-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Message-id: 1468607524-19021-3-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/raw_bsd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/raw_bsd.c b/block/raw_bsd.c index 5f9dd299a6..d7674132f2 100644 --- a/block/raw_bsd.c +++ b/block/raw_bsd.c @@ -192,8 +192,10 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { bs->sg = bs->file->bs->sg; - bs->supported_write_flags = BDRV_REQ_FUA; - bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP; + bs->supported_write_flags = BDRV_REQ_FUA & + bs->file->bs->supported_write_flags; + bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & + bs->file->bs->supported_zero_flags; if (bs->probed && !bdrv_is_read_only(bs)) { fprintf(stderr, From 04ed95f4843281e292d93018d56d4b14705f9f2c Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 12:32:01 -0600 Subject: [PATCH 03/25] block: Fragment writes to max transfer length Drivers should be able to rely on the block layer honoring the max transfer length, rather than needing to return -EINVAL (iscsi) or manually fragment things (nbd). We already fragment write zeroes at the block layer; this patch adds the fragmentation for normal writes, after requests have been aligned (fragmenting before alignment would lead to multiple unaligned requests, rather than just the head and tail). When fragmenting a large request where FUA was requested, but where we know that FUA is implemented by flushing all requests rather than the given request, then we can still get by with only one flush. Note, however, that we need a followup patch to the raw format driver to avoid a regression in the number of flushes actually issued. The return value was previously nebulous on success (sometimes zero, sometimes the length written); since we never have a short write, and since fragmenting may store yet another positive value in 'ret', change the function to always return 0 on success, matching what we do in bdrv_aligned_preadv(). Signed-off-by: Eric Blake Message-id: 1468607524-19021-4-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/io.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/block/io.c b/block/io.c index c3574a4dd6..410394dc42 100644 --- a/block/io.c +++ b/block/io.c @@ -1269,7 +1269,8 @@ fail: } /* - * Forwards an already correctly aligned write request to the BlockDriver. + * Forwards an already correctly aligned write request to the BlockDriver, + * after possibly fragmenting it. */ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, @@ -1281,6 +1282,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, int64_t start_sector = offset >> BDRV_SECTOR_BITS; int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); + uint64_t bytes_remaining = bytes; + int max_transfer; assert(is_power_of_2(align)); assert((offset & (align - 1)) == 0); @@ -1288,6 +1291,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, assert(!qiov || bytes == qiov->size); assert((bs->open_flags & BDRV_O_NO_IO) == 0); assert(!(flags & ~BDRV_REQ_MASK)); + max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), + align); waited = wait_serialising_requests(req); assert(!waited || !req->serialising); @@ -1310,9 +1315,34 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, } else if (flags & BDRV_REQ_ZERO_WRITE) { bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); - } else { + } else if (bytes <= max_transfer) { bdrv_debug_event(bs, BLKDBG_PWRITEV); ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags); + } else { + bdrv_debug_event(bs, BLKDBG_PWRITEV); + while (bytes_remaining) { + int num = MIN(bytes_remaining, max_transfer); + QEMUIOVector local_qiov; + int local_flags = flags; + + assert(num); + if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && + !(bs->supported_write_flags & BDRV_REQ_FUA)) { + /* If FUA is going to be emulated by flush, we only + * need to flush on the last iteration */ + local_flags &= ~BDRV_REQ_FUA; + } + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); + + ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, + num, &local_qiov, local_flags); + qemu_iovec_destroy(&local_qiov); + if (ret < 0) { + break; + } + bytes_remaining -= num; + } } bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); @@ -1325,6 +1355,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, if (ret >= 0) { bs->total_sectors = MAX(bs->total_sectors, end_sector); + ret = 0; } return ret; From fb1a6de14ab353baa4bc3dfb777e662a26045ca9 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 12:32:02 -0600 Subject: [PATCH 04/25] nbd: Rely on block layer to break up large requests Now that the block layer will honor max_transfer, we can simplify our code to rely on that guarantee. The readv code can call directly into nbd-client, just as the writev code has done since commit 52a4650. Interestingly enough, while qemu-io 'w 0 40m' splits into a 32M and 8M transaction, 'w -z 0 40m' splits into two 16M and an 8M, because the block layer caps the bounce buffer for writing zeroes at 16M. When we later introduce support for NBD_CMD_WRITE_ZEROES, we can get a full 32M zero write (or larger, if the client and server negotiate that write zeroes can use a larger size than ordinary writes). Signed-off-by: Eric Blake Reviewed-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Acked-by: Paolo Bonzini Message-id: 1468607524-19021-5-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/nbd-client.c | 51 ++++++++-------------------------------------- block/nbd.c | 12 +++-------- 2 files changed, 11 insertions(+), 52 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index 4cc408d206..f1fb58bf26 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -217,15 +217,15 @@ static void nbd_coroutine_end(NbdClientSession *s, } } -static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, - int offset) +int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) { NbdClientSession *client = nbd_get_client_session(bs); struct nbd_request request = { .type = NBD_CMD_READ }; struct nbd_reply reply; ssize_t ret; + assert(nb_sectors <= NBD_MAX_SECTORS); request.from = sector_num * 512; request.len = nb_sectors * 512; @@ -234,16 +234,15 @@ static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num, if (ret < 0) { reply.error = -ret; } else { - nbd_co_receive_reply(client, &request, &reply, qiov, offset); + nbd_co_receive_reply(client, &request, &reply, qiov, 0); } nbd_coroutine_end(client, &request); return -reply.error; } -static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, - int offset, int flags) +int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov, int flags) { NbdClientSession *client = nbd_get_client_session(bs); struct nbd_request request = { .type = NBD_CMD_WRITE }; @@ -255,11 +254,12 @@ static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num, request.type |= NBD_CMD_FLAG_FUA; } + assert(nb_sectors <= NBD_MAX_SECTORS); request.from = sector_num * 512; request.len = nb_sectors * 512; nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, qiov, offset); + ret = nbd_co_send_request(bs, &request, qiov, 0); if (ret < 0) { reply.error = -ret; } else { @@ -269,41 +269,6 @@ static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num, return -reply.error; } -int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - int offset = 0; - int ret; - while (nb_sectors > NBD_MAX_SECTORS) { - ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); - if (ret < 0) { - return ret; - } - offset += NBD_MAX_SECTORS * 512; - sector_num += NBD_MAX_SECTORS; - nb_sectors -= NBD_MAX_SECTORS; - } - return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset); -} - -int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, int flags) -{ - int offset = 0; - int ret; - while (nb_sectors > NBD_MAX_SECTORS) { - ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset, - flags); - if (ret < 0) { - return ret; - } - offset += NBD_MAX_SECTORS * 512; - sector_num += NBD_MAX_SECTORS; - nb_sectors -= NBD_MAX_SECTORS; - } - return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset, flags); -} - int nbd_client_co_flush(BlockDriverState *bs) { NbdClientSession *client = nbd_get_client_session(bs); diff --git a/block/nbd.c b/block/nbd.c index 08e5b67b2f..8a130787c5 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -349,12 +349,6 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, return ret; } -static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov); -} - static int nbd_co_flush(BlockDriverState *bs) { return nbd_client_co_flush(bs); @@ -450,7 +444,7 @@ static BlockDriver bdrv_nbd = { .instance_size = sizeof(BDRVNBDState), .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_co_readv, + .bdrv_co_readv = nbd_client_co_readv, .bdrv_co_writev_flags = nbd_client_co_writev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, @@ -468,7 +462,7 @@ static BlockDriver bdrv_nbd_tcp = { .instance_size = sizeof(BDRVNBDState), .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_co_readv, + .bdrv_co_readv = nbd_client_co_readv, .bdrv_co_writev_flags = nbd_client_co_writev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, @@ -486,7 +480,7 @@ static BlockDriver bdrv_nbd_unix = { .instance_size = sizeof(BDRVNBDState), .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_co_readv, + .bdrv_co_readv = nbd_client_co_readv, .bdrv_co_writev_flags = nbd_client_co_writev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, From 1e2a77a851b3369799272c8e77c07995a1a2e579 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 12:32:03 -0600 Subject: [PATCH 05/25] nbd: Drop unused offset parameter Now that NBD relies on the block layer to fragment things, we no longer need to track an offset argument for which fragment of a request we are actually servicing. While at it, use true and false instead of 0 and 1 for a bool parameter. Signed-off-by: Eric Blake Reviewed-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Acked-by: Paolo Bonzini Message-id: 1468607524-19021-6-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/nbd-client.c | 31 ++++++++++++++++--------------- include/block/nbd.h | 1 - nbd/common.c | 5 +---- nbd/nbd-internal.h | 4 ++-- 4 files changed, 19 insertions(+), 22 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index f1fb58bf26..f184844e40 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -116,7 +116,7 @@ static void nbd_restart_write(void *opaque) static int nbd_co_send_request(BlockDriverState *bs, struct nbd_request *request, - QEMUIOVector *qiov, int offset) + QEMUIOVector *qiov) { NbdClientSession *s = nbd_get_client_session(bs); AioContext *aio_context; @@ -149,8 +149,8 @@ static int nbd_co_send_request(BlockDriverState *bs, qio_channel_set_cork(s->ioc, true); rc = nbd_send_request(s->ioc, request); if (rc >= 0) { - ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, - offset, request->len, 0); + ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len, + false); if (ret != request->len) { rc = -EIO; } @@ -167,8 +167,9 @@ static int nbd_co_send_request(BlockDriverState *bs, } static void nbd_co_receive_reply(NbdClientSession *s, - struct nbd_request *request, struct nbd_reply *reply, - QEMUIOVector *qiov, int offset) + struct nbd_request *request, + struct nbd_reply *reply, + QEMUIOVector *qiov) { int ret; @@ -181,8 +182,8 @@ static void nbd_co_receive_reply(NbdClientSession *s, reply->error = EIO; } else { if (qiov && reply->error == 0) { - ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, - offset, request->len, 1); + ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len, + true); if (ret != request->len) { reply->error = EIO; } @@ -230,11 +231,11 @@ int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, request.len = nb_sectors * 512; nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, NULL, 0); + ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; } else { - nbd_co_receive_reply(client, &request, &reply, qiov, 0); + nbd_co_receive_reply(client, &request, &reply, qiov); } nbd_coroutine_end(client, &request); return -reply.error; @@ -259,11 +260,11 @@ int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, request.len = nb_sectors * 512; nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, qiov, 0); + ret = nbd_co_send_request(bs, &request, qiov); if (ret < 0) { reply.error = -ret; } else { - nbd_co_receive_reply(client, &request, &reply, NULL, 0); + nbd_co_receive_reply(client, &request, &reply, NULL); } nbd_coroutine_end(client, &request); return -reply.error; @@ -284,11 +285,11 @@ int nbd_client_co_flush(BlockDriverState *bs) request.len = 0; nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, NULL, 0); + ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; } else { - nbd_co_receive_reply(client, &request, &reply, NULL, 0); + nbd_co_receive_reply(client, &request, &reply, NULL); } nbd_coroutine_end(client, &request); return -reply.error; @@ -309,11 +310,11 @@ int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, request.len = nb_sectors * 512; nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, NULL, 0); + ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; } else { - nbd_co_receive_reply(client, &request, &reply, NULL, 0); + nbd_co_receive_reply(client, &request, &reply, NULL); } nbd_coroutine_end(client, &request); return -reply.error; diff --git a/include/block/nbd.h b/include/block/nbd.h index eeda3ebdf7..503f514e85 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -89,7 +89,6 @@ enum { ssize_t nbd_wr_syncv(QIOChannel *ioc, struct iovec *iov, size_t niov, - size_t offset, size_t length, bool do_read); int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint32_t *flags, diff --git a/nbd/common.c b/nbd/common.c index 8ddb2dd2f0..b583a4f4cf 100644 --- a/nbd/common.c +++ b/nbd/common.c @@ -23,7 +23,6 @@ ssize_t nbd_wr_syncv(QIOChannel *ioc, struct iovec *iov, size_t niov, - size_t offset, size_t length, bool do_read) { @@ -33,9 +32,7 @@ ssize_t nbd_wr_syncv(QIOChannel *ioc, struct iovec *local_iov_head = local_iov; unsigned int nlocal_iov = niov; - nlocal_iov = iov_copy(local_iov, nlocal_iov, - iov, niov, - offset, length); + nlocal_iov = iov_copy(local_iov, nlocal_iov, iov, niov, 0, length); while (nlocal_iov > 0) { ssize_t len; diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h index 26a9f4d9fd..93a6ca8549 100644 --- a/nbd/nbd-internal.h +++ b/nbd/nbd-internal.h @@ -101,14 +101,14 @@ static inline ssize_t read_sync(QIOChannel *ioc, void *buffer, size_t size) * our request/reply. Synchronization is done with recv_coroutine, so * that this is coroutine-safe. */ - return nbd_wr_syncv(ioc, &iov, 1, 0, size, true); + return nbd_wr_syncv(ioc, &iov, 1, size, true); } static inline ssize_t write_sync(QIOChannel *ioc, void *buffer, size_t size) { struct iovec iov = { .iov_base = buffer, .iov_len = size }; - return nbd_wr_syncv(ioc, &iov, 1, 0, size, false); + return nbd_wr_syncv(ioc, &iov, 1, size, false); } struct NBDTLSHandshakeData { From 6bd01f14dbef06dc22c0400f12dfbef06ab6b766 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 12:32:04 -0600 Subject: [PATCH 06/25] iscsi: Rely on block layer to break up large requests Now that the block layer honors max_request, we don't need to bother with an EINVAL on overlarge requests, but can instead assert that requests are well-behaved. Signed-off-by: Eric Blake Reviewed-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Acked-by: Paolo Bonzini Message-id: 1468607524-19021-7-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/iscsi.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index 129c3afa68..5602abdc6a 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -586,11 +586,8 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, return -EINVAL; } - if (bs->bl.max_transfer && - nb_sectors << BDRV_SECTOR_BITS > bs->bl.max_transfer) { - error_report("iSCSI Error: Write of %d sectors exceeds max_xfer_len " - "of %" PRIu32 " bytes", nb_sectors, bs->bl.max_transfer); - return -EINVAL; + if (bs->bl.max_transfer) { + assert(nb_sectors << BDRV_SECTOR_BITS <= bs->bl.max_transfer); } lba = sector_qemu2lun(sector_num, iscsilun); @@ -754,11 +751,8 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, return -EINVAL; } - if (bs->bl.max_transfer && - nb_sectors << BDRV_SECTOR_BITS > bs->bl.max_transfer) { - error_report("iSCSI Error: Read of %d sectors exceeds max_xfer_len " - "of %" PRIu32 " bytes", nb_sectors, bs->bl.max_transfer); - return -EINVAL; + if (bs->bl.max_transfer) { + assert(nb_sectors << BDRV_SECTOR_BITS <= bs->bl.max_transfer); } /* if cache.direct is off and we have a valid entry in our allocation map From 9f1963b3f72521f75a549f8afd61b19e7da63c6f Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:22:50 -0600 Subject: [PATCH 07/25] block: Convert bdrv_co_discard() to byte-based Another step towards byte-based interfaces everywhere. Replace the sector-based bdrv_co_discard() with a new byte-based bdrv_co_pdiscard(), which silently ignores any unaligned head or tail. Driver callbacks will be converted in followup patches. By calculating the alignment outside of the loop, and clamping the max discard to an aligned value, we can simplify the actions done within the loop. Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1468624988-423-2-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/blkreplay.c | 3 +- block/block-backend.c | 3 +- block/io.c | 67 ++++++++++++++++++++++--------------------- block/raw_bsd.c | 3 +- include/block/block.h | 2 +- 5 files changed, 42 insertions(+), 36 deletions(-) diff --git a/block/blkreplay.c b/block/blkreplay.c index 3368c8c98d..c69e5a57a0 100755 --- a/block/blkreplay.c +++ b/block/blkreplay.c @@ -118,7 +118,8 @@ static int coroutine_fn blkreplay_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { uint64_t reqid = request_id++; - int ret = bdrv_co_discard(bs->file->bs, sector_num, nb_sectors); + int ret = bdrv_co_pdiscard(bs->file->bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); diff --git a/block/block-backend.c b/block/block-backend.c index f9cea1b304..d982cf9d29 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1113,7 +1113,8 @@ int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors) return ret; } - return bdrv_co_discard(blk_bs(blk), sector_num, nb_sectors); + return bdrv_co_pdiscard(blk_bs(blk), sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS); } int blk_co_flush(BlockBackend *blk) diff --git a/block/io.c b/block/io.c index 410394dc42..14d448d50c 100644 --- a/block/io.c +++ b/block/io.c @@ -2199,7 +2199,8 @@ static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) BlockAIOCBCoroutine *acb = opaque; BlockDriverState *bs = acb->common.bs; - acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); + acb->req.error = bdrv_co_pdiscard(bs, acb->req.sector << BDRV_SECTOR_BITS, + acb->req.nb_sectors << BDRV_SECTOR_BITS); bdrv_co_complete(acb); } @@ -2398,20 +2399,22 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque) { DiscardCo *rwco = opaque; - rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); + rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->sector_num << BDRV_SECTOR_BITS, + rwco->nb_sectors << BDRV_SECTOR_BITS); } -int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) +int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, + int count) { BdrvTrackedRequest req; - int max_discard, ret; + int max_pdiscard, ret; + int head, align; if (!bs->drv) { return -ENOMEDIUM; } - ret = bdrv_check_request(bs, sector_num, nb_sectors); + ret = bdrv_check_byte_request(bs, offset, count); if (ret < 0) { return ret; } else if (bs->read_only) { @@ -2428,45 +2431,45 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, return 0; } - tracked_request_begin(&req, bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, BDRV_TRACKED_DISCARD); + /* Discard is advisory, so ignore any unaligned head or tail */ + align = MAX(BDRV_SECTOR_SIZE, + MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment)); + assert(is_power_of_2(align)); + head = MIN(count, -offset & (align - 1)); + if (head) { + count -= head; + offset += head; + } + count = QEMU_ALIGN_DOWN(count, align); + if (!count) { + return 0; + } + + tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD); ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); if (ret < 0) { goto out; } - max_discard = MIN_NON_ZERO(bs->bl.max_pdiscard >> BDRV_SECTOR_BITS, - BDRV_REQUEST_MAX_SECTORS); - while (nb_sectors > 0) { + max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX), + align); + + while (count > 0) { int ret; - int num = nb_sectors; - int discard_alignment = bs->bl.pdiscard_alignment >> BDRV_SECTOR_BITS; - - /* align request */ - if (discard_alignment && - num >= discard_alignment && - sector_num % discard_alignment) { - if (num > discard_alignment) { - num = discard_alignment; - } - num -= sector_num % discard_alignment; - } - - /* limit request size */ - if (num > max_discard) { - num = max_discard; - } + int num = MIN(count, max_pdiscard); if (bs->drv->bdrv_co_discard) { - ret = bs->drv->bdrv_co_discard(bs, sector_num, num); + ret = bs->drv->bdrv_co_discard(bs, offset >> BDRV_SECTOR_BITS, + num >> BDRV_SECTOR_BITS); } else { BlockAIOCB *acb; CoroutineIOCompletion co = { .coroutine = qemu_coroutine_self(), }; - acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, + acb = bs->drv->bdrv_aio_discard(bs, offset >> BDRV_SECTOR_BITS, + num >> BDRV_SECTOR_BITS, bdrv_co_io_em_complete, &co); if (acb == NULL) { ret = -EIO; @@ -2480,8 +2483,8 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, goto out; } - sector_num += num; - nb_sectors -= num; + offset += num; + count -= num; } ret = 0; out: diff --git a/block/raw_bsd.c b/block/raw_bsd.c index d7674132f2..68f0a91a9a 100644 --- a/block/raw_bsd.c +++ b/block/raw_bsd.c @@ -137,7 +137,8 @@ static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs, static int coroutine_fn raw_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { - return bdrv_co_discard(bs->file->bs, sector_num, nb_sectors); + return bdrv_co_pdiscard(bs->file->bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS); } static int64_t raw_getlength(BlockDriverState *bs) diff --git a/include/block/block.h b/include/block/block.h index 616d8b9f12..4f5cebf15f 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -342,7 +342,7 @@ void coroutine_fn bdrv_co_drain(BlockDriverState *bs); void bdrv_drain_all(void); int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors); -int bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors); +int bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, int count); int bdrv_has_zero_init_1(BlockDriverState *bs); int bdrv_has_zero_init(BlockDriverState *bs); bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs); From 0c51a893b643bc9393c685b47b9cea1e6831565f Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:22:51 -0600 Subject: [PATCH 08/25] block: Convert bdrv_discard() to byte-based Another step towards byte-based interfaces everywhere. Replace the sector-based bdrv_discard() with a new byte-based bdrv_pdiscard(), which silently ignores any unaligned head or tail. Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1468624988-423-3-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/block-backend.c | 3 ++- block/io.c | 19 +++++++++---------- block/qcow2-refcount.c | 4 +--- include/block/block.h | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index d982cf9d29..83b6407ea6 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1512,7 +1512,8 @@ int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors) return ret; } - return bdrv_discard(blk_bs(blk), sector_num, nb_sectors); + return bdrv_pdiscard(blk_bs(blk), sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS); } int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, diff --git a/block/io.c b/block/io.c index 14d448d50c..8d413058be 100644 --- a/block/io.c +++ b/block/io.c @@ -2391,16 +2391,15 @@ int bdrv_flush(BlockDriverState *bs) typedef struct DiscardCo { BlockDriverState *bs; - int64_t sector_num; - int nb_sectors; + int64_t offset; + int count; int ret; } DiscardCo; -static void coroutine_fn bdrv_discard_co_entry(void *opaque) +static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque) { DiscardCo *rwco = opaque; - rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->sector_num << BDRV_SECTOR_BITS, - rwco->nb_sectors << BDRV_SECTOR_BITS); + rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count); } int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, @@ -2495,23 +2494,23 @@ out: return ret; } -int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count) { Coroutine *co; DiscardCo rwco = { .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, + .offset = offset, + .count = count, .ret = NOT_DONE, }; if (qemu_in_coroutine()) { /* Fast-path if already in coroutine context */ - bdrv_discard_co_entry(&rwco); + bdrv_pdiscard_co_entry(&rwco); } else { AioContext *aio_context = bdrv_get_aio_context(bs); - co = qemu_coroutine_create(bdrv_discard_co_entry, &rwco); + co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco); qemu_coroutine_enter(co); while (rwco.ret == NOT_DONE) { aio_poll(aio_context, true); diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 49b6ce6bfd..cbfb3fe064 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -615,9 +615,7 @@ void qcow2_process_discards(BlockDriverState *bs, int ret) /* Discard is optional, ignore the return value */ if (ret >= 0) { - bdrv_discard(bs->file->bs, - d->offset >> BDRV_SECTOR_BITS, - d->bytes >> BDRV_SECTOR_BITS); + bdrv_pdiscard(bs->file->bs, d->offset, d->bytes); } g_free(d); diff --git a/include/block/block.h b/include/block/block.h index 4f5cebf15f..94cabbb685 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -341,7 +341,7 @@ void bdrv_drain(BlockDriverState *bs); void coroutine_fn bdrv_co_drain(BlockDriverState *bs); void bdrv_drain_all(void); -int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors); +int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count); int bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, int count); int bdrv_has_zero_init_1(BlockDriverState *bs); int bdrv_has_zero_init(BlockDriverState *bs); From b15404e0273e20baddcbbc1e8915f2e9ee9b117b Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:22:52 -0600 Subject: [PATCH 09/25] block: Switch BlockRequest to byte-based BlockRequest is the internal struct used by bdrv_aio_*. At the moment, all such calls were sector-based, but we will eventually convert to byte-based; start by changing the internal variables to be byte-based. No change to behavior, although the read and write code can now go byte-based through more of the stack. Signed-off-by: Eric Blake Message-id: 1468624988-423-4-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/io.c | 62 ++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/block/io.c b/block/io.c index 8d413058be..85953bbc16 100644 --- a/block/io.c +++ b/block/io.c @@ -33,14 +33,13 @@ #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ -static BlockAIOCB *bdrv_co_aio_rw_vector(BdrvChild *child, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write); +static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child, + int64_t offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write); static void coroutine_fn bdrv_co_do_rw(void *opaque); static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int count, BdrvRequestFlags flags); @@ -2015,8 +2014,9 @@ BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num, { trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque); - return bdrv_co_aio_rw_vector(child, sector_num, qiov, nb_sectors, 0, - cb, opaque, false); + assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size); + return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov, + 0, cb, opaque, false); } BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num, @@ -2025,8 +2025,9 @@ BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num, { trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque); - return bdrv_co_aio_rw_vector(child, sector_num, qiov, nb_sectors, 0, - cb, opaque, true); + assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size); + return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov, + 0, cb, opaque, true); } void bdrv_aio_cancel(BlockAIOCB *acb) @@ -2062,8 +2063,8 @@ typedef struct BlockRequest { union { /* Used during read, write, trim */ struct { - int64_t sector; - int nb_sectors; + int64_t offset; + int bytes; int flags; QEMUIOVector *qiov; }; @@ -2127,24 +2128,23 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque) BlockAIOCBCoroutine *acb = opaque; if (!acb->is_write) { - acb->req.error = bdrv_co_do_readv(acb->child, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset, + acb->req.qiov->size, acb->req.qiov, acb->req.flags); } else { - acb->req.error = bdrv_co_do_writev(acb->child, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset, + acb->req.qiov->size, acb->req.qiov, acb->req.flags); } bdrv_co_complete(acb); } -static BlockAIOCB *bdrv_co_aio_rw_vector(BdrvChild *child, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write) +static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child, + int64_t offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write) { Coroutine *co; BlockAIOCBCoroutine *acb; @@ -2153,8 +2153,7 @@ static BlockAIOCB *bdrv_co_aio_rw_vector(BdrvChild *child, acb->child = child; acb->need_bh = true; acb->req.error = -EINPROGRESS; - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; + acb->req.offset = offset; acb->req.qiov = qiov; acb->req.flags = flags; acb->is_write = is_write; @@ -2199,8 +2198,7 @@ static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) BlockAIOCBCoroutine *acb = opaque; BlockDriverState *bs = acb->common.bs; - acb->req.error = bdrv_co_pdiscard(bs, acb->req.sector << BDRV_SECTOR_BITS, - acb->req.nb_sectors << BDRV_SECTOR_BITS); + acb->req.error = bdrv_co_pdiscard(bs, acb->req.offset, acb->req.bytes); bdrv_co_complete(acb); } @@ -2216,8 +2214,8 @@ BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); acb->need_bh = true; acb->req.error = -EINPROGRESS; - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; + acb->req.offset = sector_num << BDRV_SECTOR_BITS; + acb->req.bytes = nb_sectors << BDRV_SECTOR_BITS; co = qemu_coroutine_create(bdrv_aio_discard_co_entry, acb); qemu_coroutine_enter(co); From 60ebac16bca3e3bf07c7ae67a69a7730aaa48712 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:22:53 -0600 Subject: [PATCH 10/25] block: Convert bdrv_aio_discard() to byte-based Another step towards byte-based interfaces everywhere. Replace the sector-based bdrv_aio_discard() with a new byte-based bdrv_aio_pdiscard(), which silently ignores any unaligned head or tail. Driver callbacks will be converted in followup patches. Signed-off-by: Eric Blake Message-id: 1468624988-423-5-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/block-backend.c | 3 ++- block/io.c | 15 +++++++-------- block/trace-events | 2 +- include/block/block.h | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 83b6407ea6..8b16b95aab 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1074,7 +1074,8 @@ BlockAIOCB *blk_aio_discard(BlockBackend *blk, return blk_abort_aio_request(blk, cb, opaque, ret); } - return bdrv_aio_discard(blk_bs(blk), sector_num, nb_sectors, cb, opaque); + return bdrv_aio_pdiscard(blk_bs(blk), sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque); } void blk_aio_cancel(BlockAIOCB *acb) diff --git a/block/io.c b/block/io.c index 85953bbc16..58fdac50b2 100644 --- a/block/io.c +++ b/block/io.c @@ -2193,7 +2193,7 @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, return &acb->common; } -static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +static void coroutine_fn bdrv_aio_pdiscard_co_entry(void *opaque) { BlockAIOCBCoroutine *acb = opaque; BlockDriverState *bs = acb->common.bs; @@ -2202,21 +2202,20 @@ static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) bdrv_co_complete(acb); } -BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +BlockAIOCB *bdrv_aio_pdiscard(BlockDriverState *bs, int64_t offset, int count, + BlockCompletionFunc *cb, void *opaque) { Coroutine *co; BlockAIOCBCoroutine *acb; - trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); + trace_bdrv_aio_pdiscard(bs, offset, count, opaque); acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); acb->need_bh = true; acb->req.error = -EINPROGRESS; - acb->req.offset = sector_num << BDRV_SECTOR_BITS; - acb->req.bytes = nb_sectors << BDRV_SECTOR_BITS; - co = qemu_coroutine_create(bdrv_aio_discard_co_entry, acb); + acb->req.offset = offset; + acb->req.bytes = count; + co = qemu_coroutine_create(bdrv_aio_pdiscard_co_entry, acb); qemu_coroutine_enter(co); bdrv_co_maybe_schedule_bh(acb); diff --git a/block/trace-events b/block/trace-events index 354967eacb..90d618a107 100644 --- a/block/trace-events +++ b/block/trace-events @@ -9,7 +9,7 @@ blk_co_preadv(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags blk_co_pwritev(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x" # block/io.c -bdrv_aio_discard(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p" +bdrv_aio_pdiscard(void *bs, int64_t offset, int count, void *opaque) "bs %p offset %"PRId64" count %d opaque %p" bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p" bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p" bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p" diff --git a/include/block/block.h b/include/block/block.h index 94cabbb685..11c162d594 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -316,9 +316,9 @@ BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num, BlockCompletionFunc *cb, void *opaque); BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque); -BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *bdrv_aio_pdiscard(BlockDriverState *bs, + int64_t offset, int count, + BlockCompletionFunc *cb, void *opaque); void bdrv_aio_cancel(BlockAIOCB *acb); void bdrv_aio_cancel_async(BlockAIOCB *acb); From 1c6c4bb7f0a4cc3987e58880ef96159985dc1228 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:22:54 -0600 Subject: [PATCH 11/25] block: Convert BB interface to byte-based discards Change sector-based blk_discard(), blk_co_discard(), and blk_aio_discard() to instead be byte-based blk_pdiscard(), blk_co_pdiscard(), and blk_aio_pdiscard(). NBD gets a lot simpler now that ignoring the unaligned portion of a byte-based discard request is handled under the hood by the block layer. Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1468624988-423-6-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/block-backend.c | 25 +++++++++++-------------- block/mirror.c | 5 +++-- hw/block/xen_disk.c | 7 ++++--- hw/ide/core.c | 6 ++++-- hw/scsi/scsi-disk.c | 8 ++++---- include/sysemu/block-backend.h | 9 ++++----- nbd/server.c | 19 +++++-------------- qemu-io-cmds.c | 3 +-- 8 files changed, 36 insertions(+), 46 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 8b16b95aab..effa038924 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1065,17 +1065,16 @@ BlockAIOCB *blk_aio_flush(BlockBackend *blk, return bdrv_aio_flush(blk_bs(blk), cb, opaque); } -BlockAIOCB *blk_aio_discard(BlockBackend *blk, - int64_t sector_num, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, + int64_t offset, int count, + BlockCompletionFunc *cb, void *opaque) { - int ret = blk_check_request(blk, sector_num, nb_sectors); + int ret = blk_check_byte_request(blk, offset, count); if (ret < 0) { return blk_abort_aio_request(blk, cb, opaque, ret); } - return bdrv_aio_pdiscard(blk_bs(blk), sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, cb, opaque); + return bdrv_aio_pdiscard(blk_bs(blk), offset, count, cb, opaque); } void blk_aio_cancel(BlockAIOCB *acb) @@ -1107,15 +1106,14 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, return bdrv_aio_ioctl(blk_bs(blk), req, buf, cb, opaque); } -int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors) +int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int count) { - int ret = blk_check_request(blk, sector_num, nb_sectors); + int ret = blk_check_byte_request(blk, offset, count); if (ret < 0) { return ret; } - return bdrv_co_pdiscard(blk_bs(blk), sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS); + return bdrv_co_pdiscard(blk_bs(blk), offset, count); } int blk_co_flush(BlockBackend *blk) @@ -1506,15 +1504,14 @@ int blk_truncate(BlockBackend *blk, int64_t offset) return bdrv_truncate(blk_bs(blk), offset); } -int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors) +int blk_pdiscard(BlockBackend *blk, int64_t offset, int count) { - int ret = blk_check_request(blk, sector_num, nb_sectors); + int ret = blk_check_byte_request(blk, offset, count); if (ret < 0) { return ret; } - return bdrv_pdiscard(blk_bs(blk), sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS); + return bdrv_pdiscard(blk_bs(blk), offset, count); } int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, diff --git a/block/mirror.c b/block/mirror.c index b1e633ecad..617bb18f4e 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -303,8 +303,9 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s, s->in_flight++; s->sectors_in_flight += nb_sectors; if (is_discard) { - blk_aio_discard(s->target, sector_num, op->nb_sectors, - mirror_write_complete, op); + blk_aio_pdiscard(s->target, sector_num << BDRV_SECTOR_BITS, + op->nb_sectors << BDRV_SECTOR_BITS, + mirror_write_complete, op); } else { blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE, op->nb_sectors * BDRV_SECTOR_SIZE, diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c index 90aca73121..3b8ad33fc5 100644 --- a/hw/block/xen_disk.c +++ b/hw/block/xen_disk.c @@ -574,9 +574,10 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq) { struct blkif_request_discard *discard_req = (void *)&ioreq->req; ioreq->aio_inflight++; - blk_aio_discard(blkdev->blk, - discard_req->sector_number, discard_req->nr_sectors, - qemu_aio_complete, ioreq); + blk_aio_pdiscard(blkdev->blk, + discard_req->sector_number << BDRV_SECTOR_BITS, + discard_req->nr_sectors << BDRV_SECTOR_BITS, + qemu_aio_complete, ioreq); break; } default: diff --git a/hw/ide/core.c b/hw/ide/core.c index b1daf967d6..081c9eb765 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -423,8 +423,10 @@ static void ide_issue_trim_cb(void *opaque, int ret) } /* Got an entry! Submit and exit. */ - iocb->aiocb = blk_aio_discard(iocb->blk, sector, count, - ide_issue_trim_cb, opaque); + iocb->aiocb = blk_aio_pdiscard(iocb->blk, + sector << BDRV_SECTOR_BITS, + count << BDRV_SECTOR_BITS, + ide_issue_trim_cb, opaque); return; } diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index 8dbfc10b78..836a1553ed 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -1609,10 +1609,10 @@ static void scsi_unmap_complete_noio(UnmapCBData *data, int ret) goto done; } - r->req.aiocb = blk_aio_discard(s->qdev.conf.blk, - sector_num * (s->qdev.blocksize / 512), - nb_sectors * (s->qdev.blocksize / 512), - scsi_unmap_complete, data); + r->req.aiocb = blk_aio_pdiscard(s->qdev.conf.blk, + sector_num * s->qdev.blocksize, + nb_sectors * s->qdev.blocksize, + scsi_unmap_complete, data); data->count--; data->inbuf += 16; return; diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 3c3e82f05d..2da4905d18 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -139,15 +139,14 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, BlockCompletionFunc *cb, void *opaque); BlockAIOCB *blk_aio_flush(BlockBackend *blk, BlockCompletionFunc *cb, void *opaque); -BlockAIOCB *blk_aio_discard(BlockBackend *blk, - int64_t sector_num, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int count, + BlockCompletionFunc *cb, void *opaque); void blk_aio_cancel(BlockAIOCB *acb); void blk_aio_cancel_async(BlockAIOCB *acb); int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf); BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, BlockCompletionFunc *cb, void *opaque); -int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors); +int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int count); int blk_co_flush(BlockBackend *blk); int blk_flush(BlockBackend *blk); int blk_flush_all(void); @@ -207,7 +206,7 @@ int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, int blk_write_compressed(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, int nb_sectors); int blk_truncate(BlockBackend *blk, int64_t offset); -int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors); +int blk_pdiscard(BlockBackend *blk, int64_t offset, int count); int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, int64_t pos, int size); int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size); diff --git a/nbd/server.c b/nbd/server.c index fbc82de932..29e2099b5e 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -1182,20 +1182,11 @@ static void nbd_trip(void *opaque) break; case NBD_CMD_TRIM: TRACE("Request type is TRIM"); - /* Ignore unaligned head or tail, until block layer adds byte - * interface */ - if (request.len >= BDRV_SECTOR_SIZE) { - request.len -= (request.from + request.len) % BDRV_SECTOR_SIZE; - ret = blk_co_discard(exp->blk, - DIV_ROUND_UP(request.from + exp->dev_offset, - BDRV_SECTOR_SIZE), - request.len / BDRV_SECTOR_SIZE); - if (ret < 0) { - LOG("discard failed"); - reply.error = -ret; - } - } else { - TRACE("trim request too small, ignoring"); + ret = blk_co_pdiscard(exp->blk, request.from + exp->dev_offset, + request.len); + if (ret < 0) { + LOG("discard failed"); + reply.error = -ret; } if (nbd_co_send_reply(req, &reply, 0) < 0) { goto out; diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index 6e29edb1fd..25954f5634 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -1696,8 +1696,7 @@ static int discard_f(BlockBackend *blk, int argc, char **argv) } gettimeofday(&t1, NULL); - ret = blk_discard(blk, offset >> BDRV_SECTOR_BITS, - count >> BDRV_SECTOR_BITS); + ret = blk_pdiscard(blk, offset, count); gettimeofday(&t2, NULL); if (ret < 0) { From 36e3b2e7338a5a5323f7ca9d8d4d439e971a18dd Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:22:55 -0600 Subject: [PATCH 12/25] raw-posix: Switch paio_submit() to byte-based The only remaining uses of paio_submit() were flush (with no offset or count) and discard (which we are switching to byte-based); furthermore, the similarly named paio_submit_co() is already byte-based. Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1468624988-423-7-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/raw-posix.c | 14 ++++++++------ block/raw-win32.c | 19 +++++++++++-------- block/trace-events | 2 +- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 20f4d7aa8d..b48efb3859 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1214,7 +1214,7 @@ static int paio_submit_co(BlockDriverState *bs, int fd, } static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + int64_t offset, QEMUIOVector *qiov, int count, BlockCompletionFunc *cb, void *opaque, int type) { RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); @@ -1224,8 +1224,8 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, acb->aio_type = type; acb->aio_fildes = fd; - acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE; - acb->aio_offset = sector_num * BDRV_SECTOR_SIZE; + acb->aio_nbytes = count; + acb->aio_offset = offset; if (qiov) { acb->aio_iov = qiov->iov; @@ -1233,7 +1233,7 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, assert(qiov->size == acb->aio_nbytes); } - trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); + trace_paio_submit(acb, opaque, offset, count, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } @@ -1792,7 +1792,8 @@ static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs, { BDRVRawState *s = bs->opaque; - return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit(bs, s->fd, sector_num << BDRV_SECTOR_BITS, NULL, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque, QEMU_AIO_DISCARD); } @@ -2212,7 +2213,8 @@ static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs, if (fd_open(bs) < 0) { return NULL; } - return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit(bs, s->fd, sector_num << BDRV_SECTOR_BITS, NULL, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } diff --git a/block/raw-win32.c b/block/raw-win32.c index 9b813d99ae..56f45fea9e 100644 --- a/block/raw-win32.c +++ b/block/raw-win32.c @@ -142,7 +142,7 @@ static int aio_worker(void *arg) } static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + int64_t offset, QEMUIOVector *qiov, int count, BlockCompletionFunc *cb, void *opaque, int type) { RawWin32AIOData *acb = g_new(RawWin32AIOData, 1); @@ -155,11 +155,12 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, if (qiov) { acb->aio_iov = qiov->iov; acb->aio_niov = qiov->niov; + assert(qiov->size == count); } - acb->aio_nbytes = nb_sectors * 512; - acb->aio_offset = sector_num * 512; + acb->aio_nbytes = count; + acb->aio_offset = offset; - trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); + trace_paio_submit(acb, opaque, offset, count, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } @@ -378,9 +379,10 @@ static BlockAIOCB *raw_aio_readv(BlockDriverState *bs, BDRVRawState *s = bs->opaque; if (s->aio) { return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov, - nb_sectors, cb, opaque, QEMU_AIO_READ); + nb_sectors, cb, opaque, QEMU_AIO_READ); } else { - return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors, + return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque, QEMU_AIO_READ); } } @@ -392,9 +394,10 @@ static BlockAIOCB *raw_aio_writev(BlockDriverState *bs, BDRVRawState *s = bs->opaque; if (s->aio) { return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov, - nb_sectors, cb, opaque, QEMU_AIO_WRITE); + nb_sectors, cb, opaque, QEMU_AIO_WRITE); } else { - return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors, + return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque, QEMU_AIO_WRITE); } } diff --git a/block/trace-events b/block/trace-events index 90d618a107..978ef4f02a 100644 --- a/block/trace-events +++ b/block/trace-events @@ -58,7 +58,7 @@ qmp_block_stream(void *bs, void *job) "bs %p job %p" # block/raw-win32.c # block/raw-posix.c paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d" -paio_submit(void *acb, void *opaque, int64_t sector_num, int nb_sectors, int type) "acb %p opaque %p sector_num %"PRId64" nb_sectors %d type %d" +paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" # block/qcow2.c qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset %" PRIx64 " bytes %d" From 7bbca9e290a9c7c217b5a24fc6094e91e54bd05d Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:22:56 -0600 Subject: [PATCH 13/25] rbd: Switch rbd_start_aio() to byte-based The internal function converts to byte-based before calling into RBD code; hoist the conversion to the callers so that callers can then be switched to byte-based themselves. Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1468624988-423-8-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/rbd.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index 0a5840d08b..01cbb636e0 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -649,9 +649,9 @@ static int rbd_aio_flush_wrapper(rbd_image_t image, } static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, - int64_t sector_num, + int64_t off, QEMUIOVector *qiov, - int nb_sectors, + int64_t size, BlockCompletionFunc *cb, void *opaque, RBDAIOCmd cmd) @@ -659,7 +659,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, RBDAIOCB *acb; RADOSCB *rcb = NULL; rbd_completion_t c; - int64_t off, size; char *buf; int r; @@ -668,6 +667,7 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque); acb->cmd = cmd; acb->qiov = qiov; + assert(!qiov || qiov->size == size); if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) { acb->bounce = NULL; } else { @@ -687,9 +687,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, buf = acb->bounce; - off = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * BDRV_SECTOR_SIZE; - rcb = g_new(RADOSCB, 1); rcb->acb = acb; rcb->buf = buf; @@ -739,7 +736,8 @@ static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque) { - return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque, + return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque, RBD_AIO_READ); } @@ -750,7 +748,8 @@ static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque) { - return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque, + return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque, RBD_AIO_WRITE); } @@ -937,7 +936,8 @@ static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque) { - return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque, + return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, NULL, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque, RBD_AIO_DISCARD); } #endif From 4da444a0bb5b6a7563a785f027402e5af4bd53aa Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:22:57 -0600 Subject: [PATCH 14/25] block: Convert .bdrv_aio_discard() to byte-based Another step towards byte-based interfaces everywhere. Replace the sector-based driver callback .bdrv_aio_discard() with a new byte-based .bdrv_aio_pdiscard(). Only raw-posix and RBD drivers are affected, so it was not worth splitting into multiple patches. Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1468624988-423-9-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/io.c | 7 +++---- block/raw-posix.c | 18 ++++++++---------- block/rbd.c | 15 +++++++-------- include/block/block_int.h | 4 ++-- 4 files changed, 20 insertions(+), 24 deletions(-) diff --git a/block/io.c b/block/io.c index 58fdac50b2..5c759fae47 100644 --- a/block/io.c +++ b/block/io.c @@ -2423,7 +2423,7 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, return 0; } - if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { + if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_pdiscard) { return 0; } @@ -2464,9 +2464,8 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, .coroutine = qemu_coroutine_self(), }; - acb = bs->drv->bdrv_aio_discard(bs, offset >> BDRV_SECTOR_BITS, - num >> BDRV_SECTOR_BITS, - bdrv_co_io_em_complete, &co); + acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, + bdrv_co_io_em_complete, &co); if (acb == NULL) { ret = -EIO; goto out; diff --git a/block/raw-posix.c b/block/raw-posix.c index b48efb3859..6ed7547392 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1786,14 +1786,13 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, return ret | BDRV_BLOCK_OFFSET_VALID | start; } -static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, +static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs, + int64_t offset, int count, BlockCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; - return paio_submit(bs, s->fd, sector_num << BDRV_SECTOR_BITS, NULL, - nb_sectors << BDRV_SECTOR_BITS, + return paio_submit(bs, s->fd, offset, NULL, count, cb, opaque, QEMU_AIO_DISCARD); } @@ -1865,7 +1864,7 @@ BlockDriver bdrv_file = { .bdrv_co_preadv = raw_co_preadv, .bdrv_co_pwritev = raw_co_pwritev, .bdrv_aio_flush = raw_aio_flush, - .bdrv_aio_discard = raw_aio_discard, + .bdrv_aio_pdiscard = raw_aio_pdiscard, .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, @@ -2204,8 +2203,8 @@ static int fd_open(BlockDriverState *bs) return -EIO; } -static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, +static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs, + int64_t offset, int count, BlockCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; @@ -2213,8 +2212,7 @@ static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs, if (fd_open(bs) < 0) { return NULL; } - return paio_submit(bs, s->fd, sector_num << BDRV_SECTOR_BITS, NULL, - nb_sectors << BDRV_SECTOR_BITS, + return paio_submit(bs, s->fd, offset, NULL, count, cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } @@ -2309,7 +2307,7 @@ static BlockDriver bdrv_host_device = { .bdrv_co_preadv = raw_co_preadv, .bdrv_co_pwritev = raw_co_pwritev, .bdrv_aio_flush = raw_aio_flush, - .bdrv_aio_discard = hdev_aio_discard, + .bdrv_aio_pdiscard = hdev_aio_pdiscard, .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, diff --git a/block/rbd.c b/block/rbd.c index 01cbb636e0..0106fea45f 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -930,14 +930,13 @@ static int qemu_rbd_snap_list(BlockDriverState *bs, } #ifdef LIBRBD_SUPPORTS_DISCARD -static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) +static BlockAIOCB *qemu_rbd_aio_pdiscard(BlockDriverState *bs, + int64_t offset, + int count, + BlockCompletionFunc *cb, + void *opaque) { - return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, NULL, - nb_sectors << BDRV_SECTOR_BITS, cb, opaque, + return rbd_start_aio(bs, offset, NULL, count, cb, opaque, RBD_AIO_DISCARD); } #endif @@ -1001,7 +1000,7 @@ static BlockDriver bdrv_rbd = { #endif #ifdef LIBRBD_SUPPORTS_DISCARD - .bdrv_aio_discard = qemu_rbd_aio_discard, + .bdrv_aio_pdiscard = qemu_rbd_aio_pdiscard, #endif .bdrv_snapshot_create = qemu_rbd_snap_create, diff --git a/include/block/block_int.h b/include/block/block_int.h index a6b13adb45..9bf9aed83d 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -142,8 +142,8 @@ struct BlockDriver { BlockCompletionFunc *cb, void *opaque); BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque); - BlockAIOCB *(*bdrv_aio_discard)(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, + BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs, + int64_t offset, int count, BlockCompletionFunc *cb, void *opaque); int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs, From 47a5486d598e0cfef6c87c5f44b85955a5f2c4ff Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:22:58 -0600 Subject: [PATCH 15/25] block: Add .bdrv_co_pdiscard() driver callback There's enough drivers with a sector-based callback that it will be easier to switch one at a time. This patch adds a byte-based callback, and then after all drivers are swapped, we'll drop the sector-based callback. [checkpatch doesn't like the space after coroutine_fn in block_int.h, but it's consistent with the rest of the file] Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1468624988-423-10-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/io.c | 7 +++++-- include/block/block_int.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/block/io.c b/block/io.c index 5c759fae47..5ba0195a69 100644 --- a/block/io.c +++ b/block/io.c @@ -2423,7 +2423,8 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, return 0; } - if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_pdiscard) { + if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_co_pdiscard && + !bs->drv->bdrv_aio_pdiscard) { return 0; } @@ -2455,7 +2456,9 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, int ret; int num = MIN(count, max_pdiscard); - if (bs->drv->bdrv_co_discard) { + if (bs->drv->bdrv_co_pdiscard) { + ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); + } else if (bs->drv->bdrv_co_discard) { ret = bs->drv->bdrv_co_discard(bs, offset >> BDRV_SECTOR_BITS, num >> BDRV_SECTOR_BITS); } else { diff --git a/include/block/block_int.h b/include/block/block_int.h index 9bf9aed83d..8f16d1652e 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -167,6 +167,8 @@ struct BlockDriver { int64_t offset, int count, BdrvRequestFlags flags); int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs, int64_t sector_num, int nb_sectors); + int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs, + int64_t offset, int count); int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file); From aba76e2f0313178487e471bb55d022a89056c954 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:22:59 -0600 Subject: [PATCH 16/25] blkreplay: Switch .bdrv_co_discard() to byte-based Another step towards killing off sector-based block APIs. Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1468624988-423-11-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/blkreplay.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/block/blkreplay.c b/block/blkreplay.c index c69e5a57a0..30f9d5ff6c 100755 --- a/block/blkreplay.c +++ b/block/blkreplay.c @@ -114,12 +114,11 @@ static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs, return ret; } -static int coroutine_fn blkreplay_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs, + int64_t offset, int count) { uint64_t reqid = request_id++; - int ret = bdrv_co_pdiscard(bs->file->bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS); + int ret = bdrv_co_pdiscard(bs->file->bs, offset, count); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); @@ -149,7 +148,7 @@ static BlockDriver bdrv_blkreplay = { .bdrv_co_pwritev = blkreplay_co_pwritev, .bdrv_co_pwrite_zeroes = blkreplay_co_pwrite_zeroes, - .bdrv_co_discard = blkreplay_co_discard, + .bdrv_co_pdiscard = blkreplay_co_pdiscard, .bdrv_co_flush = blkreplay_co_flush, }; From 1014170b825c4d9252fe531d13658f104447f31c Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:23:00 -0600 Subject: [PATCH 17/25] gluster: Switch .bdrv_co_discard() to byte-based Another step towards killing off sector-based block APIs. Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1468624988-423-12-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/gluster.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/block/gluster.c b/block/gluster.c index 406c1e6357..ef3b0de280 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -724,14 +724,12 @@ error: } #ifdef CONFIG_GLUSTERFS_DISCARD -static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static coroutine_fn int qemu_gluster_co_pdiscard(BlockDriverState *bs, + int64_t offset, int size) { int ret; GlusterAIOCB acb; BDRVGlusterState *s = bs->opaque; - size_t size = nb_sectors * BDRV_SECTOR_SIZE; - off_t offset = sector_num * BDRV_SECTOR_SIZE; acb.size = 0; acb.ret = 0; @@ -976,7 +974,7 @@ static BlockDriver bdrv_gluster = { .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, + .bdrv_co_pdiscard = qemu_gluster_co_pdiscard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, @@ -1004,7 +1002,7 @@ static BlockDriver bdrv_gluster_tcp = { .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, + .bdrv_co_pdiscard = qemu_gluster_co_pdiscard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, @@ -1032,7 +1030,7 @@ static BlockDriver bdrv_gluster_unix = { .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, + .bdrv_co_pdiscard = qemu_gluster_co_pdiscard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, @@ -1060,7 +1058,7 @@ static BlockDriver bdrv_gluster_rdma = { .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, + .bdrv_co_pdiscard = qemu_gluster_co_pdiscard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, From 97c7e85cfea4f4edc1536c50ca2f3c1627d8e443 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:23:01 -0600 Subject: [PATCH 18/25] iscsi: Switch .bdrv_co_discard() to byte-based Another step towards killing off sector-based block APIs. Unlike write_zeroes, where we can be handed unaligned requests and must fail gracefully with -ENOTSUP for a fallback, we are guaranteed that discard requests are always aligned because the block layer already ignored unaligned head/tail. Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1468624988-423-13-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/iscsi.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index 5602abdc6a..95ce9e139e 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -1042,29 +1042,26 @@ iscsi_getlength(BlockDriverState *bs) } static int -coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) +coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) { IscsiLun *iscsilun = bs->opaque; struct IscsiTask iTask; struct unmap_list list; - if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return -EINVAL; - } + assert(is_byte_request_lun_aligned(offset, count, iscsilun)); if (!iscsilun->lbp.lbpu) { /* UNMAP is not supported by the target */ return 0; } - list.lba = sector_qemu2lun(sector_num, iscsilun); - list.num = sector_qemu2lun(nb_sectors, iscsilun); + list.lba = offset / iscsilun->block_size; + list.num = count / iscsilun->block_size; iscsi_co_init_iscsitask(iscsilun, &iTask); retry: if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1, - iscsi_co_generic_cb, &iTask) == NULL) { + iscsi_co_generic_cb, &iTask) == NULL) { return -ENOMEM; } @@ -1094,7 +1091,8 @@ retry: return iTask.err_code; } - iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors); + iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS, + count >> BDRV_SECTOR_BITS); return 0; } @@ -1998,7 +1996,7 @@ static BlockDriver bdrv_iscsi = { .bdrv_refresh_limits = iscsi_refresh_limits, .bdrv_co_get_block_status = iscsi_co_get_block_status, - .bdrv_co_discard = iscsi_co_discard, + .bdrv_co_pdiscard = iscsi_co_pdiscard, .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes, .bdrv_co_readv = iscsi_co_readv, .bdrv_co_writev_flags = iscsi_co_writev_flags, From 447e57c3b056c310463238268d514798b23c376f Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:23:02 -0600 Subject: [PATCH 19/25] nbd: Switch .bdrv_co_discard() to byte-based Another step towards killing off sector-based block APIs. While at it, call directly into nbd-client.c instead of having a pointless trivial wrapper in nbd.c. Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1468624988-423-14-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/nbd-client.c | 11 ++++++----- block/nbd-client.h | 3 +-- block/nbd.c | 12 +++--------- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index f184844e40..d22feea078 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -295,19 +295,20 @@ int nbd_client_co_flush(BlockDriverState *bs) return -reply.error; } -int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) +int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) { NbdClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { .type = NBD_CMD_TRIM }; + struct nbd_request request = { + .type = NBD_CMD_TRIM, + .from = offset, + .len = count, + }; struct nbd_reply reply; ssize_t ret; if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) { return 0; } - request.from = sector_num * 512; - request.len = nb_sectors * 512; nbd_coroutine_start(client, &request); ret = nbd_co_send_request(bs, &request, NULL); diff --git a/block/nbd-client.h b/block/nbd-client.h index c618dadc39..62dec33118 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -44,8 +44,7 @@ int nbd_client_init(BlockDriverState *bs, Error **errp); void nbd_client_close(BlockDriverState *bs); -int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors); +int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count); int nbd_client_co_flush(BlockDriverState *bs); int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags); diff --git a/block/nbd.c b/block/nbd.c index 8a130787c5..42cae0e1fd 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -360,12 +360,6 @@ static void nbd_refresh_limits(BlockDriverState *bs, Error **errp) bs->bl.max_transfer = NBD_MAX_BUFFER_SIZE; } -static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - return nbd_client_co_discard(bs, sector_num, nb_sectors); -} - static void nbd_close(BlockDriverState *bs) { nbd_client_close(bs); @@ -448,7 +442,7 @@ static BlockDriver bdrv_nbd = { .bdrv_co_writev_flags = nbd_client_co_writev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, - .bdrv_co_discard = nbd_co_discard, + .bdrv_co_pdiscard = nbd_client_co_pdiscard, .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_detach_aio_context, @@ -466,7 +460,7 @@ static BlockDriver bdrv_nbd_tcp = { .bdrv_co_writev_flags = nbd_client_co_writev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, - .bdrv_co_discard = nbd_co_discard, + .bdrv_co_pdiscard = nbd_client_co_pdiscard, .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_detach_aio_context, @@ -484,7 +478,7 @@ static BlockDriver bdrv_nbd_unix = { .bdrv_co_writev_flags = nbd_client_co_writev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, - .bdrv_co_discard = nbd_co_discard, + .bdrv_co_pdiscard = nbd_client_co_pdiscard, .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_detach_aio_context, From 82e8a7888b7840e6b1770e131970d031ad64bc5b Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:23:03 -0600 Subject: [PATCH 20/25] qcow2: Switch .bdrv_co_discard() to byte-based Another step towards killing off sector-based block APIs. Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1468624988-423-15-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/qcow2.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index a6bca735e5..d620d0a85b 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2479,15 +2479,15 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs, return ret; } -static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs, + int64_t offset, int count) { int ret; BDRVQcow2State *s = bs->opaque; qemu_co_mutex_lock(&s->lock); - ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors, QCOW2_DISCARD_REQUEST, false); + ret = qcow2_discard_clusters(bs, offset, count >> BDRV_SECTOR_BITS, + QCOW2_DISCARD_REQUEST, false); qemu_co_mutex_unlock(&s->lock); return ret; } @@ -3410,7 +3410,7 @@ BlockDriver bdrv_qcow2 = { .bdrv_co_flush_to_os = qcow2_co_flush_to_os, .bdrv_co_pwrite_zeroes = qcow2_co_pwrite_zeroes, - .bdrv_co_discard = qcow2_co_discard, + .bdrv_co_pdiscard = qcow2_co_pdiscard, .bdrv_truncate = qcow2_truncate, .bdrv_write_compressed = qcow2_write_compressed, .bdrv_make_empty = qcow2_make_empty, From 5f61ad079af198aa6ba46e4461715ed834f72214 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:23:04 -0600 Subject: [PATCH 21/25] raw_bsd: Switch .bdrv_co_discard() to byte-based Another step towards killing off sector-based block APIs. Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1468624988-423-16-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/raw_bsd.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/block/raw_bsd.c b/block/raw_bsd.c index 68f0a91a9a..961aa1370a 100644 --- a/block/raw_bsd.c +++ b/block/raw_bsd.c @@ -134,11 +134,10 @@ static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs, return bdrv_co_pwrite_zeroes(bs->file, offset, count, flags); } -static int coroutine_fn raw_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs, + int64_t offset, int count) { - return bdrv_co_pdiscard(bs->file->bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS); + return bdrv_co_pdiscard(bs->file->bs, offset, count); } static int64_t raw_getlength(BlockDriverState *bs) @@ -244,7 +243,7 @@ BlockDriver bdrv_raw = { .bdrv_co_readv = &raw_co_readv, .bdrv_co_writev_flags = &raw_co_writev_flags, .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes, - .bdrv_co_discard = &raw_co_discard, + .bdrv_co_pdiscard = &raw_co_pdiscard, .bdrv_co_get_block_status = &raw_co_get_block_status, .bdrv_truncate = &raw_truncate, .bdrv_getlength = &raw_getlength, From dde475376317ba86e9531b7ebd9e04306e8f9bd4 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:23:05 -0600 Subject: [PATCH 22/25] sheepdog: Switch .bdrv_co_discard() to byte-based Another step towards killing off sector-based block APIs. Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1468624988-423-17-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/sheepdog.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index e739c56f08..66e1cb2b2d 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -2800,8 +2800,8 @@ static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, } -static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) +static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset, + int count) { SheepdogAIOCB *acb; BDRVSheepdogState *s = bs->opaque; @@ -2811,7 +2811,7 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, uint32_t zero = 0; if (!s->discard_supported) { - return 0; + return 0; } memset(&discard_iov, 0, sizeof(discard_iov)); @@ -2820,7 +2820,10 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, iov.iov_len = sizeof(zero); discard_iov.iov = &iov; discard_iov.niov = 1; - acb = sd_aio_setup(bs, &discard_iov, sector_num, nb_sectors); + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((count & (BDRV_SECTOR_SIZE - 1)) == 0); + acb = sd_aio_setup(bs, &discard_iov, offset >> BDRV_SECTOR_BITS, + count >> BDRV_SECTOR_BITS); acb->aiocb_type = AIOCB_DISCARD_OBJ; acb->aio_done_func = sd_finish_aiocb; @@ -2954,7 +2957,7 @@ static BlockDriver bdrv_sheepdog = { .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, - .bdrv_co_discard = sd_co_discard, + .bdrv_co_pdiscard = sd_co_pdiscard, .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, @@ -2990,7 +2993,7 @@ static BlockDriver bdrv_sheepdog_tcp = { .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, - .bdrv_co_discard = sd_co_discard, + .bdrv_co_pdiscard = sd_co_pdiscard, .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, @@ -3026,7 +3029,7 @@ static BlockDriver bdrv_sheepdog_unix = { .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, - .bdrv_co_discard = sd_co_discard, + .bdrv_co_pdiscard = sd_co_pdiscard, .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, From 02aefe43cb437ae437d5df88b7f2951f21d62ead Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:23:06 -0600 Subject: [PATCH 23/25] block: Kill .bdrv_co_discard() Now that all drivers have a byte-based .bdrv_co_pdiscard(), we no longer need to worry about the sector-based version. We can also relax our minimum alignment to 1 for drivers that support it. Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1468624988-423-18-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/io.c | 9 ++------- include/block/block_int.h | 2 -- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/block/io.c b/block/io.c index 5ba0195a69..7323f0fb7b 100644 --- a/block/io.c +++ b/block/io.c @@ -2423,14 +2423,12 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, return 0; } - if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_co_pdiscard && - !bs->drv->bdrv_aio_pdiscard) { + if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { return 0; } /* Discard is advisory, so ignore any unaligned head or tail */ - align = MAX(BDRV_SECTOR_SIZE, - MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment)); + align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); assert(is_power_of_2(align)); head = MIN(count, -offset & (align - 1)); if (head) { @@ -2458,9 +2456,6 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, if (bs->drv->bdrv_co_pdiscard) { ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); - } else if (bs->drv->bdrv_co_discard) { - ret = bs->drv->bdrv_co_discard(bs, offset >> BDRV_SECTOR_BITS, - num >> BDRV_SECTOR_BITS); } else { BlockAIOCB *acb; CoroutineIOCompletion co = { diff --git a/include/block/block_int.h b/include/block/block_int.h index 8f16d1652e..a069f97b37 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -165,8 +165,6 @@ struct BlockDriver { */ int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs, int64_t offset, int count, BdrvRequestFlags flags); - int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs, - int64_t sector_num, int nb_sectors); int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs, int64_t offset, int count); int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs, From 70c4fb2648f31fcfd69296dd4f2b1b664572270f Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:23:07 -0600 Subject: [PATCH 24/25] nbd: Convert to byte-based interface The NBD protocol doesn't have any notion of sectors, so it is a fairly easy conversion to use byte-based read and write. Signed-off-by: Eric Blake Acked-by: Paolo Bonzini Message-id: 1468624988-423-19-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/nbd-client.c | 30 +++++++++++++++++------------- block/nbd-client.h | 8 ++++---- block/nbd.c | 12 ++++++------ include/block/nbd.h | 1 - 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index d22feea078..2cf3237ef3 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -218,17 +218,20 @@ static void nbd_coroutine_end(NbdClientSession *s, } } -int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) +int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags) { NbdClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { .type = NBD_CMD_READ }; + struct nbd_request request = { + .type = NBD_CMD_READ, + .from = offset, + .len = bytes, + }; struct nbd_reply reply; ssize_t ret; - assert(nb_sectors <= NBD_MAX_SECTORS); - request.from = sector_num * 512; - request.len = nb_sectors * 512; + assert(bytes <= NBD_MAX_BUFFER_SIZE); + assert(!flags); nbd_coroutine_start(client, &request); ret = nbd_co_send_request(bs, &request, NULL); @@ -239,14 +242,17 @@ int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, } nbd_coroutine_end(client, &request); return -reply.error; - } -int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, int flags) +int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags) { NbdClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { .type = NBD_CMD_WRITE }; + struct nbd_request request = { + .type = NBD_CMD_WRITE, + .from = offset, + .len = bytes, + }; struct nbd_reply reply; ssize_t ret; @@ -255,9 +261,7 @@ int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, request.type |= NBD_CMD_FLAG_FUA; } - assert(nb_sectors <= NBD_MAX_SECTORS); - request.from = sector_num * 512; - request.len = nb_sectors * 512; + assert(bytes <= NBD_MAX_BUFFER_SIZE); nbd_coroutine_start(client, &request); ret = nbd_co_send_request(bs, &request, qiov); diff --git a/block/nbd-client.h b/block/nbd-client.h index 62dec33118..fa9817b7d7 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -46,10 +46,10 @@ void nbd_client_close(BlockDriverState *bs); int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count); int nbd_client_co_flush(BlockDriverState *bs); -int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, int flags); -int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov); +int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags); +int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags); void nbd_client_detach_aio_context(BlockDriverState *bs); void nbd_client_attach_aio_context(BlockDriverState *bs, diff --git a/block/nbd.c b/block/nbd.c index 42cae0e1fd..8d57220f18 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -438,8 +438,8 @@ static BlockDriver bdrv_nbd = { .instance_size = sizeof(BDRVNBDState), .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_client_co_readv, - .bdrv_co_writev_flags = nbd_client_co_writev, + .bdrv_co_preadv = nbd_client_co_preadv, + .bdrv_co_pwritev = nbd_client_co_pwritev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_pdiscard = nbd_client_co_pdiscard, @@ -456,8 +456,8 @@ static BlockDriver bdrv_nbd_tcp = { .instance_size = sizeof(BDRVNBDState), .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_client_co_readv, - .bdrv_co_writev_flags = nbd_client_co_writev, + .bdrv_co_preadv = nbd_client_co_preadv, + .bdrv_co_pwritev = nbd_client_co_pwritev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_pdiscard = nbd_client_co_pdiscard, @@ -474,8 +474,8 @@ static BlockDriver bdrv_nbd_unix = { .instance_size = sizeof(BDRVNBDState), .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_client_co_readv, - .bdrv_co_writev_flags = nbd_client_co_writev, + .bdrv_co_preadv = nbd_client_co_preadv, + .bdrv_co_pwritev = nbd_client_co_pwritev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_pdiscard = nbd_client_co_pdiscard, diff --git a/include/block/nbd.h b/include/block/nbd.h index 503f514e85..cb91820f38 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -77,7 +77,6 @@ enum { /* Maximum size of a single READ/WRITE data buffer */ #define NBD_MAX_BUFFER_SIZE (32 * 1024 * 1024) -#define NBD_MAX_SECTORS (NBD_MAX_BUFFER_SIZE / BDRV_SECTOR_SIZE) /* Maximum size of an export name. The NBD spec requires 256 and * suggests that servers support up to 4096, but we stick to only the From decaeed7734bddc95e2c81858fbbec3e923310a1 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 15 Jul 2016 17:23:08 -0600 Subject: [PATCH 25/25] raw_bsd: Convert to byte-based interface Since the raw format driver is just passing things through, we can do byte-based read and write if the underlying protocol does likewise. There's one tricky part - if we probed the image format, we document that we restrict operations on the initial sector. It's easiest to keep this guarantee by enforcing read-modify-write on sub-sector operations (yes, this partially reverts commit ad82be2f). Signed-off-by: Eric Blake Message-id: 1468624988-423-20-git-send-email-eblake@redhat.com Signed-off-by: Stefan Hajnoczi --- block/raw_bsd.c | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/block/raw_bsd.c b/block/raw_bsd.c index 961aa1370a..588d4080f9 100644 --- a/block/raw_bsd.c +++ b/block/raw_bsd.c @@ -50,33 +50,30 @@ static int raw_reopen_prepare(BDRVReopenState *reopen_state, return 0; } -static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) +static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - return bdrv_co_readv(bs->file, sector_num, nb_sectors, qiov); + return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); } -static int coroutine_fn -raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov, int flags) +static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { void *buf = NULL; BlockDriver *drv; QEMUIOVector local_qiov; int ret; - if (bs->probed && sector_num == 0) { - /* As long as these conditions are true, we can't get partial writes to - * the probe buffer and can just directly check the request. */ + if (bs->probed && offset < BLOCK_PROBE_BUF_SIZE && bytes) { + /* Handling partial writes would be a pain - so we just + * require that guests have 512-byte request alignment if + * probing occurred */ QEMU_BUILD_BUG_ON(BLOCK_PROBE_BUF_SIZE != 512); QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != 512); - - if (nb_sectors == 0) { - /* qemu_iovec_to_buf() would fail, but we want to return success - * instead of -EINVAL in this case. */ - return 0; - } + assert(offset == 0 && bytes >= BLOCK_PROBE_BUF_SIZE); buf = qemu_try_blockalign(bs->file->bs, 512); if (!buf) { @@ -105,8 +102,7 @@ raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, } BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); - ret = bdrv_co_pwritev(bs->file, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE, qiov, flags); + ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); fail: if (qiov == &local_qiov) { @@ -150,6 +146,16 @@ static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) return bdrv_get_info(bs->file->bs, bdi); } +static void raw_refresh_limits(BlockDriverState *bs, Error **errp) +{ + if (bs->probed) { + /* To make it easier to protect the first sector, any probed + * image is restricted to read-modify-write on sub-sector + * operations. */ + bs->bl.request_alignment = BDRV_SECTOR_SIZE; + } +} + static int raw_truncate(BlockDriverState *bs, int64_t offset) { return bdrv_truncate(bs->file->bs, offset); @@ -240,8 +246,8 @@ BlockDriver bdrv_raw = { .bdrv_open = &raw_open, .bdrv_close = &raw_close, .bdrv_create = &raw_create, - .bdrv_co_readv = &raw_co_readv, - .bdrv_co_writev_flags = &raw_co_writev_flags, + .bdrv_co_preadv = &raw_co_preadv, + .bdrv_co_pwritev = &raw_co_pwritev, .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes, .bdrv_co_pdiscard = &raw_co_pdiscard, .bdrv_co_get_block_status = &raw_co_get_block_status, @@ -249,6 +255,7 @@ BlockDriver bdrv_raw = { .bdrv_getlength = &raw_getlength, .has_variable_length = true, .bdrv_get_info = &raw_get_info, + .bdrv_refresh_limits = &raw_refresh_limits, .bdrv_probe_blocksizes = &raw_probe_blocksizes, .bdrv_probe_geometry = &raw_probe_geometry, .bdrv_media_changed = &raw_media_changed,