diff --git a/MAINTAINERS b/MAINTAINERS index ed884b2f19..1444b26dc0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1718,9 +1718,9 @@ L: qemu-block@nongnu.org S: Supported F: block/linux-aio.c F: include/block/raw-aio.h -F: block/raw-posix.c -F: block/raw-win32.c -F: block/raw_bsd.c +F: block/raw-format.c +F: block/file-posix.c +F: block/file-win32.c F: block/win32-aio.c qcow2 diff --git a/block/Makefile.objs b/block/Makefile.objs index 67a036a1df..0b8fd06f27 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -1,4 +1,4 @@ -block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o dmg.o +block-obj-y += raw-format.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o dmg.o block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o @@ -6,8 +6,8 @@ block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o block-obj-y += quorum.o block-obj-y += parallels.o blkdebug.o blkverify.o blkreplay.o block-obj-y += block-backend.o snapshot.o qapi.o -block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o -block-obj-$(CONFIG_POSIX) += raw-posix.o +block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o +block-obj-$(CONFIG_POSIX) += file-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o block-obj-y += null.o mirror.o commit.o io.o block-obj-y += throttle-groups.o diff --git a/block/blkdebug.c b/block/blkdebug.c index 4127571454..acccf85666 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -58,10 +58,6 @@ typedef struct BlkdebugSuspendedReq { QLIST_ENTRY(BlkdebugSuspendedReq) next; } BlkdebugSuspendedReq; -static const AIOCBInfo blkdebug_aiocb_info = { - .aiocb_size = sizeof(BlkdebugAIOCB), -}; - enum { ACTION_INJECT_ERROR, ACTION_SET_STATE, @@ -77,7 +73,7 @@ typedef struct BlkdebugRule { int error; int immediately; int once; - int64_t sector; + int64_t offset; } inject; struct { int new_state; @@ -174,6 +170,7 @@ static int add_rule(void 
*opaque, QemuOpts *opts, Error **errp) const char* event_name; BlkdebugEvent event; struct BlkdebugRule *rule; + int64_t sector; /* Find the right event for the rule */ event_name = qemu_opt_get(opts, "event"); @@ -200,7 +197,9 @@ static int add_rule(void *opaque, QemuOpts *opts, Error **errp) rule->options.inject.once = qemu_opt_get_bool(opts, "once", 0); rule->options.inject.immediately = qemu_opt_get_bool(opts, "immediately", 0); - rule->options.inject.sector = qemu_opt_get_number(opts, "sector", -1); + sector = qemu_opt_get_number(opts, "sector", -1); + rule->options.inject.offset = + sector == -1 ? -1 : sector * BDRV_SECTOR_SIZE; break; case ACTION_SET_STATE: @@ -408,17 +407,14 @@ out: static void error_callback_bh(void *opaque) { - struct BlkdebugAIOCB *acb = opaque; - acb->common.cb(acb->common.opaque, acb->ret); - qemu_aio_unref(acb); + Coroutine *co = opaque; + qemu_coroutine_enter(co); } -static BlockAIOCB *inject_error(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque, BlkdebugRule *rule) +static int inject_error(BlockDriverState *bs, BlkdebugRule *rule) { BDRVBlkdebugState *s = bs->opaque; int error = rule->options.inject.error; - struct BlkdebugAIOCB *acb; bool immediately = rule->options.inject.immediately; if (rule->options.inject.once) { @@ -426,81 +422,79 @@ static BlockAIOCB *inject_error(BlockDriverState *bs, remove_rule(rule); } - if (immediately) { - return NULL; + if (!immediately) { + aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), error_callback_bh, + qemu_coroutine_self()); + qemu_coroutine_yield(); } - acb = qemu_aio_get(&blkdebug_aiocb_info, bs, cb, opaque); - acb->ret = -error; - - aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), error_callback_bh, acb); - - return &acb->common; + return -error; } -static BlockAIOCB *blkdebug_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +static int coroutine_fn +blkdebug_co_preadv(BlockDriverState *bs, 
uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVBlkdebugState *s = bs->opaque; BlkdebugRule *rule = NULL; QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { - if (rule->options.inject.sector == -1 || - (rule->options.inject.sector >= sector_num && - rule->options.inject.sector < sector_num + nb_sectors)) { + uint64_t inject_offset = rule->options.inject.offset; + + if (inject_offset == -1 || + (inject_offset >= offset && inject_offset < offset + bytes)) + { break; } } if (rule && rule->options.inject.error) { - return inject_error(bs, cb, opaque, rule); + return inject_error(bs, rule); } - return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, - cb, opaque); + return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); } -static BlockAIOCB *blkdebug_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +static int coroutine_fn +blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVBlkdebugState *s = bs->opaque; BlkdebugRule *rule = NULL; QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { - if (rule->options.inject.sector == -1 || - (rule->options.inject.sector >= sector_num && - rule->options.inject.sector < sector_num + nb_sectors)) { + uint64_t inject_offset = rule->options.inject.offset; + + if (inject_offset == -1 || + (inject_offset >= offset && inject_offset < offset + bytes)) + { break; } } if (rule && rule->options.inject.error) { - return inject_error(bs, cb, opaque, rule); + return inject_error(bs, rule); } - return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, - cb, opaque); + return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); } -static BlockAIOCB *blkdebug_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) +static int blkdebug_co_flush(BlockDriverState *bs) { BDRVBlkdebugState *s = bs->opaque; BlkdebugRule *rule = NULL; 
QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { - if (rule->options.inject.sector == -1) { + if (rule->options.inject.offset == -1) { break; } } if (rule && rule->options.inject.error) { - return inject_error(bs, cb, opaque, rule); + return inject_error(bs, rule); } - return bdrv_aio_flush(bs->file->bs, cb, opaque); + return bdrv_co_flush(bs->file->bs); } @@ -752,9 +746,9 @@ static BlockDriver bdrv_blkdebug = { .bdrv_refresh_filename = blkdebug_refresh_filename, .bdrv_refresh_limits = blkdebug_refresh_limits, - .bdrv_aio_readv = blkdebug_aio_readv, - .bdrv_aio_writev = blkdebug_aio_writev, - .bdrv_aio_flush = blkdebug_aio_flush, + .bdrv_co_preadv = blkdebug_co_preadv, + .bdrv_co_pwritev = blkdebug_co_pwritev, + .bdrv_co_flush_to_disk = blkdebug_co_flush, .bdrv_debug_event = blkdebug_debug_event, .bdrv_debug_breakpoint = blkdebug_debug_breakpoint, diff --git a/block/blkverify.c b/block/blkverify.c index 28f9af6dba..43a940c2f5 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -19,38 +19,36 @@ typedef struct { BdrvChild *test_file; } BDRVBlkverifyState; -typedef struct BlkverifyAIOCB BlkverifyAIOCB; -struct BlkverifyAIOCB { - BlockAIOCB common; +typedef struct BlkverifyRequest { + Coroutine *co; + BlockDriverState *bs; /* Request metadata */ bool is_write; - int64_t sector_num; - int nb_sectors; + uint64_t offset; + uint64_t bytes; + int flags; + + int (*request_fn)(BdrvChild *, int64_t, unsigned int, QEMUIOVector *, + BdrvRequestFlags); + + int ret; /* test image result */ + int raw_ret; /* raw image result */ - int ret; /* first completed request's result */ unsigned int done; /* completion counter */ QEMUIOVector *qiov; /* user I/O vector */ - QEMUIOVector raw_qiov; /* cloned I/O vector for raw file */ - void *buf; /* buffer for raw file I/O */ + QEMUIOVector *raw_qiov; /* cloned I/O vector for raw file */ +} BlkverifyRequest; - void (*verify)(BlkverifyAIOCB *acb); -}; - -static const AIOCBInfo blkverify_aiocb_info = { - .aiocb_size = 
sizeof(BlkverifyAIOCB), -}; - -static void GCC_FMT_ATTR(2, 3) blkverify_err(BlkverifyAIOCB *acb, +static void GCC_FMT_ATTR(2, 3) blkverify_err(BlkverifyRequest *r, const char *fmt, ...) { va_list ap; va_start(ap, fmt); - fprintf(stderr, "blkverify: %s sector_num=%" PRId64 " nb_sectors=%d ", - acb->is_write ? "write" : "read", acb->sector_num, - acb->nb_sectors); + fprintf(stderr, "blkverify: %s offset=%" PRId64 " bytes=%" PRId64 " ", + r->is_write ? "write" : "read", r->offset, r->bytes); vfprintf(stderr, fmt, ap); fprintf(stderr, "\n"); va_end(ap); @@ -166,113 +164,106 @@ static int64_t blkverify_getlength(BlockDriverState *bs) return bdrv_getlength(s->test_file->bs); } -static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write, - int64_t sector_num, QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) +static void coroutine_fn blkverify_do_test_req(void *opaque) { - BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aiocb_info, bs, cb, opaque); + BlkverifyRequest *r = opaque; + BDRVBlkverifyState *s = r->bs->opaque; - acb->is_write = is_write; - acb->sector_num = sector_num; - acb->nb_sectors = nb_sectors; - acb->ret = -EINPROGRESS; - acb->done = 0; - acb->qiov = qiov; - acb->buf = NULL; - acb->verify = NULL; - return acb; + r->ret = r->request_fn(s->test_file, r->offset, r->bytes, r->qiov, + r->flags); + r->done++; + qemu_coroutine_enter_if_inactive(r->co); } -static void blkverify_aio_bh(void *opaque) +static void coroutine_fn blkverify_do_raw_req(void *opaque) { - BlkverifyAIOCB *acb = opaque; + BlkverifyRequest *r = opaque; - if (acb->buf) { - qemu_iovec_destroy(&acb->raw_qiov); - qemu_vfree(acb->buf); + r->raw_ret = r->request_fn(r->bs->file, r->offset, r->bytes, r->raw_qiov, + r->flags); + r->done++; + qemu_coroutine_enter_if_inactive(r->co); +} + +static int coroutine_fn +blkverify_co_prwv(BlockDriverState *bs, BlkverifyRequest *r, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, QEMUIOVector *raw_qiov, + int 
flags, bool is_write) +{ + Coroutine *co_a, *co_b; + + *r = (BlkverifyRequest) { + .co = qemu_coroutine_self(), + .bs = bs, + .offset = offset, + .bytes = bytes, + .qiov = qiov, + .raw_qiov = raw_qiov, + .flags = flags, + .is_write = is_write, + .request_fn = is_write ? bdrv_co_pwritev : bdrv_co_preadv, + }; + + co_a = qemu_coroutine_create(blkverify_do_test_req, r); + co_b = qemu_coroutine_create(blkverify_do_raw_req, r); + + qemu_coroutine_enter(co_a); + qemu_coroutine_enter(co_b); + + while (r->done < 2) { + qemu_coroutine_yield(); } - acb->common.cb(acb->common.opaque, acb->ret); - qemu_aio_unref(acb); -} -static void blkverify_aio_cb(void *opaque, int ret) -{ - BlkverifyAIOCB *acb = opaque; - - switch (++acb->done) { - case 1: - acb->ret = ret; - break; - - case 2: - if (acb->ret != ret) { - blkverify_err(acb, "return value mismatch %d != %d", acb->ret, ret); - } - - if (acb->verify) { - acb->verify(acb); - } - - aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs), - blkverify_aio_bh, acb); - break; + if (r->ret != r->raw_ret) { + blkverify_err(r, "return value mismatch %d != %d", r->ret, r->raw_ret); } + + return r->ret; } -static void blkverify_verify_readv(BlkverifyAIOCB *acb) +static int coroutine_fn +blkverify_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { - ssize_t offset = qemu_iovec_compare(acb->qiov, &acb->raw_qiov); - if (offset != -1) { - blkverify_err(acb, "contents mismatch in sector %" PRId64, - acb->sector_num + (int64_t)(offset / BDRV_SECTOR_SIZE)); + BlkverifyRequest r; + QEMUIOVector raw_qiov; + void *buf; + ssize_t cmp_offset; + int ret; + + buf = qemu_blockalign(bs->file->bs, qiov->size); + qemu_iovec_init(&raw_qiov, qiov->niov); + qemu_iovec_clone(&raw_qiov, qiov, buf); + + ret = blkverify_co_prwv(bs, &r, offset, bytes, qiov, &raw_qiov, flags, + false); + + cmp_offset = qemu_iovec_compare(qiov, &raw_qiov); + if (cmp_offset != -1) { + blkverify_err(&r, "contents mismatch at 
offset %" PRId64, + offset + cmp_offset); } + + qemu_iovec_destroy(&raw_qiov); + qemu_vfree(buf); + + return ret; } -static BlockAIOCB *blkverify_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +static int coroutine_fn +blkverify_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { - BDRVBlkverifyState *s = bs->opaque; - BlkverifyAIOCB *acb = blkverify_aio_get(bs, false, sector_num, qiov, - nb_sectors, cb, opaque); - - acb->verify = blkverify_verify_readv; - acb->buf = qemu_blockalign(bs->file->bs, qiov->size); - qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov); - qemu_iovec_clone(&acb->raw_qiov, qiov, acb->buf); - - bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors, - blkverify_aio_cb, acb); - bdrv_aio_readv(bs->file, sector_num, &acb->raw_qiov, nb_sectors, - blkverify_aio_cb, acb); - return &acb->common; + BlkverifyRequest r; + return blkverify_co_prwv(bs, &r, offset, bytes, qiov, qiov, flags, true); } -static BlockAIOCB *blkverify_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - BDRVBlkverifyState *s = bs->opaque; - BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov, - nb_sectors, cb, opaque); - - bdrv_aio_writev(s->test_file, sector_num, qiov, nb_sectors, - blkverify_aio_cb, acb); - bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, - blkverify_aio_cb, acb); - return &acb->common; -} - -static BlockAIOCB *blkverify_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, - void *opaque) +static int blkverify_co_flush(BlockDriverState *bs) { BDRVBlkverifyState *s = bs->opaque; /* Only flush test file, the raw file is not important */ - return bdrv_aio_flush(s->test_file->bs, cb, opaque); + return bdrv_co_flush(s->test_file->bs); } static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs, @@ -332,9 
+323,9 @@ static BlockDriver bdrv_blkverify = { .bdrv_getlength = blkverify_getlength, .bdrv_refresh_filename = blkverify_refresh_filename, - .bdrv_aio_readv = blkverify_aio_readv, - .bdrv_aio_writev = blkverify_aio_writev, - .bdrv_aio_flush = blkverify_aio_flush, + .bdrv_co_preadv = blkverify_co_preadv, + .bdrv_co_pwritev = blkverify_co_pwritev, + .bdrv_co_flush = blkverify_co_flush, .is_filter = true, .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter, diff --git a/block/raw-posix.c b/block/file-posix.c similarity index 100% rename from block/raw-posix.c rename to block/file-posix.c diff --git a/block/raw-win32.c b/block/file-win32.c similarity index 100% rename from block/raw-win32.c rename to block/file-win32.c diff --git a/block/gluster.c b/block/gluster.c index a0a74e49fd..1a22f2982d 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -1253,7 +1253,7 @@ static int qemu_gluster_has_zero_init(BlockDriverState *bs) * If @start is in a trailing hole or beyond EOF, return -ENXIO. * If we can't find out, return a negative errno other than -ENXIO. * - * (Shamefully copied from raw-posix.c, only miniscule adaptions.) + * (Shamefully copied from file-posix.c, only miniscule adaptions.) */ static int find_allocation(BlockDriverState *bs, off_t start, off_t *data, off_t *hole) @@ -1349,7 +1349,7 @@ exit: * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes * beyond the end of the disk image it will be clamped. * - * (Based on raw_co_get_block_status() from raw-posix.c.) + * (Based on raw_co_get_block_status() from file-posix.c.) */ static int64_t coroutine_fn qemu_gluster_co_get_block_status( BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum, diff --git a/block/quorum.c b/block/quorum.c index d122299352..86e2072dce 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -97,7 +97,7 @@ typedef struct QuorumAIOCB QuorumAIOCB; * $children_count QuorumChildRequest. 
*/ typedef struct QuorumChildRequest { - BlockAIOCB *aiocb; + BlockDriverState *bs; QEMUIOVector qiov; uint8_t *buf; int ret; @@ -110,11 +110,12 @@ typedef struct QuorumChildRequest { * used to do operations on each children and track overall progress. */ struct QuorumAIOCB { - BlockAIOCB common; + BlockDriverState *bs; + Coroutine *co; /* Request metadata */ - uint64_t sector_num; - int nb_sectors; + uint64_t offset; + uint64_t bytes; QEMUIOVector *qiov; /* calling IOV */ @@ -133,32 +134,15 @@ struct QuorumAIOCB { int children_read; /* how many children have been read from */ }; -static bool quorum_vote(QuorumAIOCB *acb); - -static void quorum_aio_cancel(BlockAIOCB *blockacb) -{ - QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common); - BDRVQuorumState *s = acb->common.bs->opaque; - int i; - - /* cancel all callbacks */ - for (i = 0; i < s->num_children; i++) { - if (acb->qcrs[i].aiocb) { - bdrv_aio_cancel_async(acb->qcrs[i].aiocb); - } - } -} - -static AIOCBInfo quorum_aiocb_info = { - .aiocb_size = sizeof(QuorumAIOCB), - .cancel_async = quorum_aio_cancel, -}; +typedef struct QuorumCo { + QuorumAIOCB *acb; + int idx; +} QuorumCo; static void quorum_aio_finalize(QuorumAIOCB *acb) { - acb->common.cb(acb->common.opaque, acb->vote_ret); g_free(acb->qcrs); - qemu_aio_unref(acb); + g_free(acb); } static bool quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b) @@ -171,30 +155,26 @@ static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b) return a->l == b->l; } -static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, - BlockDriverState *bs, +static QuorumAIOCB *quorum_aio_get(BlockDriverState *bs, QEMUIOVector *qiov, - uint64_t sector_num, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) + uint64_t offset, + uint64_t bytes) { - QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque); + BDRVQuorumState *s = bs->opaque; + QuorumAIOCB *acb = g_new(QuorumAIOCB, 1); int i; - acb->common.bs->opaque = s; - 
acb->sector_num = sector_num; - acb->nb_sectors = nb_sectors; - acb->qiov = qiov; - acb->qcrs = g_new0(QuorumChildRequest, s->num_children); - acb->count = 0; - acb->success_count = 0; - acb->rewrite_count = 0; - acb->votes.compare = quorum_sha256_compare; - QLIST_INIT(&acb->votes.vote_list); - acb->is_read = false; - acb->vote_ret = 0; + *acb = (QuorumAIOCB) { + .co = qemu_coroutine_self(), + .bs = bs, + .offset = offset, + .bytes = bytes, + .qiov = qiov, + .votes.compare = quorum_sha256_compare, + .votes.vote_list = QLIST_HEAD_INITIALIZER(acb.votes.vote_list), + }; + acb->qcrs = g_new0(QuorumChildRequest, s->num_children); for (i = 0; i < s->num_children; i++) { acb->qcrs[i].buf = NULL; acb->qcrs[i].ret = 0; @@ -204,30 +184,37 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, return acb; } -static void quorum_report_bad(QuorumOpType type, uint64_t sector_num, - int nb_sectors, char *node_name, int ret) +static void quorum_report_bad(QuorumOpType type, uint64_t offset, + uint64_t bytes, char *node_name, int ret) { const char *msg = NULL; + int64_t start_sector = offset / BDRV_SECTOR_SIZE; + int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); + if (ret < 0) { msg = strerror(-ret); } - qapi_event_send_quorum_report_bad(type, !!msg, msg, node_name, - sector_num, nb_sectors, &error_abort); + qapi_event_send_quorum_report_bad(type, !!msg, msg, node_name, start_sector, + end_sector - start_sector, &error_abort); } static void quorum_report_failure(QuorumAIOCB *acb) { - const char *reference = bdrv_get_device_or_node_name(acb->common.bs); - qapi_event_send_quorum_failure(reference, acb->sector_num, - acb->nb_sectors, &error_abort); + const char *reference = bdrv_get_device_or_node_name(acb->bs); + int64_t start_sector = acb->offset / BDRV_SECTOR_SIZE; + int64_t end_sector = DIV_ROUND_UP(acb->offset + acb->bytes, + BDRV_SECTOR_SIZE); + + qapi_event_send_quorum_failure(reference, start_sector, + end_sector - start_sector, &error_abort); } static 
int quorum_vote_error(QuorumAIOCB *acb); static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb) { - BDRVQuorumState *s = acb->common.bs->opaque; + BDRVQuorumState *s = acb->bs->opaque; if (acb->success_count < s->threshold) { acb->vote_ret = quorum_vote_error(acb); @@ -238,22 +225,7 @@ static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb) return false; } -static void quorum_rewrite_aio_cb(void *opaque, int ret) -{ - QuorumAIOCB *acb = opaque; - - /* one less rewrite to do */ - acb->rewrite_count--; - - /* wait until all rewrite callbacks have completed */ - if (acb->rewrite_count) { - return; - } - - quorum_aio_finalize(acb); -} - -static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb); +static int read_fifo_child(QuorumAIOCB *acb); static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) { @@ -272,70 +244,7 @@ static void quorum_report_bad_acb(QuorumChildRequest *sacb, int ret) { QuorumAIOCB *acb = sacb->parent; QuorumOpType type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE; - quorum_report_bad(type, acb->sector_num, acb->nb_sectors, - sacb->aiocb->bs->node_name, ret); -} - -static void quorum_fifo_aio_cb(void *opaque, int ret) -{ - QuorumChildRequest *sacb = opaque; - QuorumAIOCB *acb = sacb->parent; - BDRVQuorumState *s = acb->common.bs->opaque; - - assert(acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO); - - if (ret < 0) { - quorum_report_bad_acb(sacb, ret); - - /* We try to read next child in FIFO order if we fail to read */ - if (acb->children_read < s->num_children) { - read_fifo_child(acb); - return; - } - } - - acb->vote_ret = ret; - - /* FIXME: rewrite failed children if acb->children_read > 1? 
*/ - quorum_aio_finalize(acb); -} - -static void quorum_aio_cb(void *opaque, int ret) -{ - QuorumChildRequest *sacb = opaque; - QuorumAIOCB *acb = sacb->parent; - BDRVQuorumState *s = acb->common.bs->opaque; - bool rewrite = false; - int i; - - sacb->ret = ret; - if (ret == 0) { - acb->success_count++; - } else { - quorum_report_bad_acb(sacb, ret); - } - acb->count++; - assert(acb->count <= s->num_children); - assert(acb->success_count <= s->num_children); - if (acb->count < s->num_children) { - return; - } - - /* Do the vote on read */ - if (acb->is_read) { - rewrite = quorum_vote(acb); - for (i = 0; i < s->num_children; i++) { - qemu_vfree(acb->qcrs[i].buf); - qemu_iovec_destroy(&acb->qcrs[i].qiov); - } - } else { - quorum_has_too_much_io_failed(acb); - } - - /* if no rewrite is done the code will finish right away */ - if (!rewrite) { - quorum_aio_finalize(acb); - } + quorum_report_bad(type, acb->offset, acb->bytes, sacb->bs->node_name, ret); } static void quorum_report_bad_versions(BDRVQuorumState *s, @@ -350,14 +259,31 @@ static void quorum_report_bad_versions(BDRVQuorumState *s, continue; } QLIST_FOREACH(item, &version->items, next) { - quorum_report_bad(QUORUM_OP_TYPE_READ, acb->sector_num, - acb->nb_sectors, + quorum_report_bad(QUORUM_OP_TYPE_READ, acb->offset, acb->bytes, s->children[item->index]->bs->node_name, 0); } } } -static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, +static void quorum_rewrite_entry(void *opaque) +{ + QuorumCo *co = opaque; + QuorumAIOCB *acb = co->acb; + BDRVQuorumState *s = acb->bs->opaque; + + /* Ignore any errors, it's just a correction attempt for already + * corrupted data. 
*/ + bdrv_co_pwritev(s->children[co->idx], acb->offset, acb->bytes, + acb->qiov, 0); + + /* Wake up the caller after the last rewrite */ + acb->rewrite_count--; + if (!acb->rewrite_count) { + qemu_coroutine_enter_if_inactive(acb->co); + } +} + +static bool quorum_rewrite_bad_versions(QuorumAIOCB *acb, QuorumVoteValue *value) { QuorumVoteVersion *version; @@ -376,7 +302,7 @@ static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, } } - /* quorum_rewrite_aio_cb will count down this to zero */ + /* quorum_rewrite_entry will count down this to zero */ acb->rewrite_count = count; /* now fire the correcting rewrites */ @@ -385,9 +311,14 @@ static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, continue; } QLIST_FOREACH(item, &version->items, next) { - bdrv_aio_writev(s->children[item->index], acb->sector_num, - acb->qiov, acb->nb_sectors, quorum_rewrite_aio_cb, - acb); + Coroutine *co; + QuorumCo data = { + .acb = acb, + .idx = item->index, + }; + + co = qemu_coroutine_create(quorum_rewrite_entry, &data); + qemu_coroutine_enter(co); } } @@ -507,8 +438,8 @@ static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb, va_list ap; va_start(ap, fmt); - fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ", - acb->sector_num, acb->nb_sectors); + fprintf(stderr, "quorum: offset=%" PRIu64 " bytes=%" PRIu64 " ", + acb->offset, acb->bytes); vfprintf(stderr, fmt, ap); fprintf(stderr, "\n"); va_end(ap); @@ -519,16 +450,15 @@ static bool quorum_compare(QuorumAIOCB *acb, QEMUIOVector *a, QEMUIOVector *b) { - BDRVQuorumState *s = acb->common.bs->opaque; + BDRVQuorumState *s = acb->bs->opaque; ssize_t offset; /* This driver will replace blkverify in this particular case */ if (s->is_blkverify) { offset = qemu_iovec_compare(a, b); if (offset != -1) { - quorum_err(acb, "contents mismatch in sector %" PRId64, - acb->sector_num + - (uint64_t)(offset / BDRV_SECTOR_SIZE)); + quorum_err(acb, "contents mismatch at offset %" PRIu64, + 
acb->offset + offset); } return true; } @@ -539,7 +469,7 @@ static bool quorum_compare(QuorumAIOCB *acb, /* Do a vote to get the error code */ static int quorum_vote_error(QuorumAIOCB *acb) { - BDRVQuorumState *s = acb->common.bs->opaque; + BDRVQuorumState *s = acb->bs->opaque; QuorumVoteVersion *winner = NULL; QuorumVotes error_votes; QuorumVoteValue result_value; @@ -568,17 +498,16 @@ static int quorum_vote_error(QuorumAIOCB *acb) return ret; } -static bool quorum_vote(QuorumAIOCB *acb) +static void quorum_vote(QuorumAIOCB *acb) { bool quorum = true; - bool rewrite = false; int i, j, ret; QuorumVoteValue hash; - BDRVQuorumState *s = acb->common.bs->opaque; + BDRVQuorumState *s = acb->bs->opaque; QuorumVoteVersion *winner; if (quorum_has_too_much_io_failed(acb)) { - return false; + return; } /* get the index of the first successful read */ @@ -606,7 +535,7 @@ static bool quorum_vote(QuorumAIOCB *acb) /* Every successful read agrees */ if (quorum) { quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); - return false; + return; } /* compute hashes for each successful read, also store indexes */ @@ -641,19 +570,46 @@ static bool quorum_vote(QuorumAIOCB *acb) /* corruption correction is enabled */ if (s->rewrite_corrupted) { - rewrite = quorum_rewrite_bad_versions(s, acb, &winner->value); + quorum_rewrite_bad_versions(acb, &winner->value); } free_exit: /* free lists */ quorum_free_vote_list(&acb->votes); - return rewrite; } -static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb) +static void read_quorum_children_entry(void *opaque) { - BDRVQuorumState *s = acb->common.bs->opaque; - int i; + QuorumCo *co = opaque; + QuorumAIOCB *acb = co->acb; + BDRVQuorumState *s = acb->bs->opaque; + int i = co->idx; + QuorumChildRequest *sacb = &acb->qcrs[i]; + + sacb->bs = s->children[i]->bs; + sacb->ret = bdrv_co_preadv(s->children[i], acb->offset, acb->bytes, + &acb->qcrs[i].qiov, 0); + + if (sacb->ret == 0) { + acb->success_count++; + } else { + quorum_report_bad_acb(sacb, 
sacb->ret); + } + + acb->count++; + assert(acb->count <= s->num_children); + assert(acb->success_count <= s->num_children); + + /* Wake up the caller after the last read */ + if (acb->count == s->num_children) { + qemu_coroutine_enter_if_inactive(acb->co); + } +} + +static int read_quorum_children(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->bs->opaque; + int i, ret; acb->children_read = s->num_children; for (i = 0; i < s->num_children; i++) { @@ -663,65 +619,131 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb) } for (i = 0; i < s->num_children; i++) { - acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i], acb->sector_num, - &acb->qcrs[i].qiov, acb->nb_sectors, - quorum_aio_cb, &acb->qcrs[i]); + Coroutine *co; + QuorumCo data = { + .acb = acb, + .idx = i, + }; + + co = qemu_coroutine_create(read_quorum_children_entry, &data); + qemu_coroutine_enter(co); } - return &acb->common; + while (acb->count < s->num_children) { + qemu_coroutine_yield(); + } + + /* Do the vote on read */ + quorum_vote(acb); + for (i = 0; i < s->num_children; i++) { + qemu_vfree(acb->qcrs[i].buf); + qemu_iovec_destroy(&acb->qcrs[i].qiov); + } + + while (acb->rewrite_count) { + qemu_coroutine_yield(); + } + + ret = acb->vote_ret; + + return ret; } -static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb) +static int read_fifo_child(QuorumAIOCB *acb) { - BDRVQuorumState *s = acb->common.bs->opaque; - int n = acb->children_read++; + BDRVQuorumState *s = acb->bs->opaque; + int n, ret; - acb->qcrs[n].aiocb = bdrv_aio_readv(s->children[n], acb->sector_num, - acb->qiov, acb->nb_sectors, - quorum_fifo_aio_cb, &acb->qcrs[n]); + /* We try to read the next child in FIFO order if we failed to read */ + do { + n = acb->children_read++; + acb->qcrs[n].bs = s->children[n]->bs; + ret = bdrv_co_preadv(s->children[n], acb->offset, acb->bytes, + acb->qiov, 0); + if (ret < 0) { + quorum_report_bad_acb(&acb->qcrs[n], ret); + } + } while (ret < 0 && acb->children_read < s->num_children); - return 
&acb->common; + /* FIXME: rewrite failed children if acb->children_read > 1? */ + + return ret; } -static BlockAIOCB *quorum_aio_readv(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) +static int quorum_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags) { BDRVQuorumState *s = bs->opaque; - QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, - nb_sectors, cb, opaque); + QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes); + int ret; + acb->is_read = true; acb->children_read = 0; if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) { - return read_quorum_children(acb); + ret = read_quorum_children(acb); + } else { + ret = read_fifo_child(acb); } + quorum_aio_finalize(acb); - return read_fifo_child(acb); + return ret; } -static BlockAIOCB *quorum_aio_writev(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) +static void write_quorum_entry(void *opaque) +{ + QuorumCo *co = opaque; + QuorumAIOCB *acb = co->acb; + BDRVQuorumState *s = acb->bs->opaque; + int i = co->idx; + QuorumChildRequest *sacb = &acb->qcrs[i]; + + sacb->bs = s->children[i]->bs; + sacb->ret = bdrv_co_pwritev(s->children[i], acb->offset, acb->bytes, + acb->qiov, 0); + if (sacb->ret == 0) { + acb->success_count++; + } else { + quorum_report_bad_acb(sacb, sacb->ret); + } + acb->count++; + assert(acb->count <= s->num_children); + assert(acb->success_count <= s->num_children); + + /* Wake up the caller after the last write */ + if (acb->count == s->num_children) { + qemu_coroutine_enter_if_inactive(acb->co); + } +} + +static int quorum_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags) { BDRVQuorumState *s = bs->opaque; - QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, nb_sectors, - cb, opaque); - int i; + QuorumAIOCB *acb = 
quorum_aio_get(bs, qiov, offset, bytes); + int i, ret; for (i = 0; i < s->num_children; i++) { - acb->qcrs[i].aiocb = bdrv_aio_writev(s->children[i], sector_num, - qiov, nb_sectors, &quorum_aio_cb, - &acb->qcrs[i]); + Coroutine *co; + QuorumCo data = { + .acb = acb, + .idx = i, + }; + + co = qemu_coroutine_create(write_quorum_entry, &data); + qemu_coroutine_enter(co); } - return &acb->common; + while (acb->count < s->num_children) { + qemu_coroutine_yield(); + } + + quorum_has_too_much_io_failed(acb); + + ret = acb->vote_ret; + quorum_aio_finalize(acb); + + return ret; } static int64_t quorum_getlength(BlockDriverState *bs) @@ -765,7 +787,7 @@ static coroutine_fn int quorum_co_flush(BlockDriverState *bs) result = bdrv_co_flush(s->children[i]->bs); if (result) { quorum_report_bad(QUORUM_OP_TYPE_FLUSH, 0, - bdrv_nb_sectors(s->children[i]->bs), + bdrv_getlength(s->children[i]->bs), s->children[i]->bs->node_name, result); result_value.l = result; quorum_count_vote(&error_votes, &result_value, i); @@ -1098,8 +1120,8 @@ static BlockDriver bdrv_quorum = { .bdrv_getlength = quorum_getlength, - .bdrv_aio_readv = quorum_aio_readv, - .bdrv_aio_writev = quorum_aio_writev, + .bdrv_co_preadv = quorum_co_preadv, + .bdrv_co_pwritev = quorum_co_pwritev, .bdrv_add_child = quorum_add_child, .bdrv_del_child = quorum_del_child, diff --git a/block/raw_bsd.c b/block/raw-format.c similarity index 99% rename from block/raw_bsd.c rename to block/raw-format.c index 8a5b9b0424..8404a82e0c 100644 --- a/block/raw_bsd.c +++ b/block/raw-format.c @@ -1,4 +1,4 @@ -/* BlockDriver implementation for "raw" +/* BlockDriver implementation for "raw" format driver * * Copyright (C) 2010-2016 Red Hat, Inc. 
* Copyright (C) 2010, Blue Swirl diff --git a/block/trace-events b/block/trace-events index cfc05f2478..671a6a851c 100644 --- a/block/trace-events +++ b/block/trace-events @@ -53,8 +53,8 @@ qmp_block_job_resume(void *job) "job %p" qmp_block_job_complete(void *job) "job %p" qmp_block_stream(void *bs, void *job) "bs %p job %p" -# block/raw-win32.c -# block/raw-posix.c +# block/file-win32.c +# block/file-posix.c paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d" paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" diff --git a/configure b/configure index 218df87d21..86f5214dd0 100755 --- a/configure +++ b/configure @@ -2750,7 +2750,7 @@ if compile_prog "" "" ; then fi ########################################## -# xfsctl() probe, used for raw-posix +# xfsctl() probe, used for file-posix.c if test "$xfs" != "no" ; then cat > $TMPC << EOF #include <stddef.h> /* NULL */ diff --git a/include/block/block_int.h b/include/block/block_int.h index 83a423c580..4e4562d444 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -184,7 +184,7 @@ struct BlockDriver { /* * Flushes all data that was already written to the OS all the way down to - * the disk (for example raw-posix calls fsync()). + * the disk (for example file-posix.c calls fsync()). */ int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs); diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index e6a60d55fd..12584ed1b7 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -70,6 +70,12 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque); */ void qemu_coroutine_enter(Coroutine *coroutine); +/** + * Transfer control to a coroutine if it's not active (i.e. part of the call + * stack of the running coroutine). Otherwise, do nothing.
+ */ +void qemu_coroutine_enter_if_inactive(Coroutine *co); + /** * Transfer control back to a coroutine's caller * diff --git a/qemu-img.c b/qemu-img.c index 6949b73ca5..5df66fe661 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -3559,20 +3559,23 @@ static void bench_cb(void *opaque, int ret) } while (b->n > b->in_flight && b->in_flight < b->nrreq) { + int64_t offset = b->offset; + /* blk_aio_* might look for completed I/Os and kick bench_cb + * again, so make sure this operation is counted by in_flight + * and b->offset is ready for the next submission. + */ + b->in_flight++; + b->offset += b->step; + b->offset %= b->image_size; if (b->write) { - acb = blk_aio_pwritev(b->blk, b->offset, b->qiov, 0, - bench_cb, b); + acb = blk_aio_pwritev(b->blk, offset, b->qiov, 0, bench_cb, b); } else { - acb = blk_aio_preadv(b->blk, b->offset, b->qiov, 0, - bench_cb, b); + acb = blk_aio_preadv(b->blk, offset, b->qiov, 0, bench_cb, b); } if (!acb) { error_report("Failed to issue request"); exit(EXIT_FAILURE); } - b->in_flight++; - b->offset += b->step; - b->offset %= b->image_size; } } diff --git a/tests/qemu-iotests/071.out b/tests/qemu-iotests/071.out index 8ff423f56b..dd879f1212 100644 --- a/tests/qemu-iotests/071.out +++ b/tests/qemu-iotests/071.out @@ -12,7 +12,7 @@ read 512/512 bytes at offset 229376 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) wrote 512/512 bytes at offset 0 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -blkverify: read sector_num=0 nb_sectors=1 contents mismatch in sector 0 +blkverify: read offset=0 bytes=512 contents mismatch at offset 0 === Testing blkverify through file blockref === @@ -26,7 +26,7 @@ read 512/512 bytes at offset 229376 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) wrote 512/512 bytes at offset 0 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -blkverify: read sector_num=0 nb_sectors=1 contents mismatch in sector 0 +blkverify: read offset=0 bytes=512 contents mismatch at offset 0 === 
Testing blkdebug through filename === @@ -56,7 +56,7 @@ QMP_VERSION {"return": {}} {"return": {}} {"return": {}} -blkverify: read sector_num=0 nb_sectors=1 contents mismatch in sector 0 +blkverify: read offset=0 bytes=512 contents mismatch at offset 0 === Testing blkverify on existing raw block device === @@ -66,7 +66,7 @@ QMP_VERSION {"return": {}} {"return": {}} {"return": {}} -blkverify: read sector_num=0 nb_sectors=1 contents mismatch in sector 0 +blkverify: read offset=0 bytes=512 contents mismatch at offset 0 === Testing blkdebug's set-state through QMP === diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c index 737bffa984..a5d2f6c0c3 100644 --- a/util/qemu-coroutine.c +++ b/util/qemu-coroutine.c @@ -131,6 +131,13 @@ void qemu_coroutine_enter(Coroutine *co) } } +void qemu_coroutine_enter_if_inactive(Coroutine *co) +{ + if (!qemu_coroutine_entered(co)) { + qemu_coroutine_enter(co); + } +} + void coroutine_fn qemu_coroutine_yield(void) { Coroutine *self = qemu_coroutine_self();