From d37d0e365afb6825a90d8356fc6adcc1f58f40f3 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Mar 2020 17:08:06 +0000 Subject: [PATCH] aio-posix: remove idle poll handlers to improve scalability When there are many poll handlers it's likely that some of them are idle most of the time. Remove handlers that haven't had activity recently so that the polling loop scales better for guests with a large number of devices. This feature only takes effect for the Linux io_uring fd monitoring implementation because it is capable of combining fd monitoring with userspace polling. The other implementations can't do that and risk starving fds in favor of poll handlers, so don't try this optimization when they are in use. IOPS improves from 10k to 105k when the guest has 100 virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1 device for rw=randread,iodepth=1,bs=4k,ioengine=libaio on NVMe. [Clarified aio_poll_handlers locking discipline explanation in comment after discussion with Paolo Bonzini . --Stefan] Signed-off-by: Stefan Hajnoczi Link: https://lore.kernel.org/r/20200305170806.1313245-8-stefanha@redhat.com Message-Id: <20200305170806.1313245-8-stefanha@redhat.com> --- include/block/aio.h | 8 ++++ util/aio-posix.c | 93 +++++++++++++++++++++++++++++++++++++++++---- util/aio-posix.h | 2 + util/trace-events | 2 + 4 files changed, 98 insertions(+), 7 deletions(-) diff --git a/include/block/aio.h b/include/block/aio.h index f07ebb76b8..cb1989105a 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -227,6 +227,14 @@ struct AioContext { int64_t poll_grow; /* polling time growth factor */ int64_t poll_shrink; /* polling time shrink factor */ + /* + * List of handlers participating in userspace polling. Protected by + * ctx->list_lock. Iterated and modified mostly by the event loop thread + * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler() + * only touches the list to delete nodes if ctx->list_lock's count is zero. + */ + AioHandlerList poll_aio_handlers; + /* Are we in polling mode or monitoring file descriptors? */ bool poll_started; diff --git a/util/aio-posix.c b/util/aio-posix.c index 759989b45b..cd6cf0a4a9 100644 --- a/util/aio-posix.c +++ b/util/aio-posix.c @@ -22,6 +22,9 @@ #include "trace.h" #include "aio-posix.h" +/* Stop userspace polling on a handler if it isn't active for some time */ +#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND) + bool aio_poll_disabled(AioContext *ctx) { return atomic_read(&ctx->poll_disable_cnt); @@ -78,6 +81,7 @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node) * deleted because deleted nodes are only cleaned up while * no one is walking the handlers list. */ + QLIST_SAFE_REMOVE(node, node_poll); QLIST_REMOVE(node, node); return true; } @@ -205,7 +209,7 @@ static bool poll_set_started(AioContext *ctx, bool started) ctx->poll_started = started; qemu_lockcnt_inc(&ctx->list_lock); - QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { + QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) { IOHandler *fn; if (QLIST_IS_INSERTED(node, node_deleted)) { @@ -286,6 +290,7 @@ static void aio_free_deleted_handlers(AioContext *ctx) while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) { QLIST_REMOVE(node, node); QLIST_REMOVE(node, node_deleted); + QLIST_SAFE_REMOVE(node, node_poll); g_free(node); } @@ -300,6 +305,22 @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node) revents = node->pfd.revents & node->pfd.events; node->pfd.revents = 0; + /* + * Start polling AioHandlers when they become ready because activity is + * likely to continue. Note that starvation is theoretically possible when + * fdmon_supports_polling(), but only until the fd fires for the first + * time. + */ + if (!QLIST_IS_INSERTED(node, node_deleted) && + !QLIST_IS_INSERTED(node, node_poll) && + node->io_poll) { + trace_poll_add(ctx, node, node->pfd.fd, revents); + if (ctx->poll_started && node->io_poll_begin) { + node->io_poll_begin(node->opaque); + } + QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll); + } + if (!QLIST_IS_INSERTED(node, node_deleted) && (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) && aio_node_check(ctx, node->is_external) && @@ -364,15 +385,19 @@ void aio_dispatch(AioContext *ctx) timerlistgroup_run_timers(&ctx->tlg); } -static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) +static bool run_poll_handlers_once(AioContext *ctx, + int64_t now, + int64_t *timeout) { bool progress = false; AioHandler *node; + AioHandler *tmp; - QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { - if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll && - aio_node_check(ctx, node->is_external) && + QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) { + if (aio_node_check(ctx, node->is_external) && node->io_poll(node->opaque)) { + node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS; + /* * Polling was successful, exit try_poll_mode immediately * to adjust the next polling time. @@ -389,6 +414,50 @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) return progress; } +static bool fdmon_supports_polling(AioContext *ctx) +{ + return ctx->fdmon_ops->need_wait != aio_poll_disabled; +} + +static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now) +{ + AioHandler *node; + AioHandler *tmp; + bool progress = false; + + /* + * File descriptor monitoring implementations without userspace polling + * support suffer from starvation when a subset of handlers is polled + * because fds will not be processed in a timely fashion. Don't remove + * idle poll handlers. + */ + if (!fdmon_supports_polling(ctx)) { + return false; + } + + QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) { + if (node->poll_idle_timeout == 0LL) { + node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS; + } else if (now >= node->poll_idle_timeout) { + trace_poll_remove(ctx, node, node->pfd.fd); + node->poll_idle_timeout = 0LL; + QLIST_SAFE_REMOVE(node, node_poll); + if (ctx->poll_started && node->io_poll_end) { + node->io_poll_end(node->opaque); + + /* + * Final poll in case ->io_poll_end() races with an event. + * Nevermind about re-adding the handler in the rare case where + * this causes progress. + */ + progress = node->io_poll(node->opaque) || progress; + } + } + } + + return progress; +} + /* run_poll_handlers: * @ctx: the AioContext * @max_ns: maximum time to poll for, in nanoseconds @@ -424,12 +493,17 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); do { - progress = run_poll_handlers_once(ctx, timeout); + progress = run_poll_handlers_once(ctx, start_time, timeout); elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time; max_ns = qemu_soonest_timeout(*timeout, max_ns); assert(!(max_ns && progress)); } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx)); + if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) { + *timeout = 0; + progress = true; + } + /* If time has passed with no successful polling, adjust *timeout to * keep the same ending time. */ @@ -454,8 +528,13 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) */ static bool try_poll_mode(AioContext *ctx, int64_t *timeout) { - int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); + int64_t max_ns; + if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) { + return false; + } + + max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) { poll_set_started(ctx, true); diff --git a/util/aio-posix.h b/util/aio-posix.h index 55fc771327..c80c04506a 100644 --- a/util/aio-posix.h +++ b/util/aio-posix.h @@ -30,10 +30,12 @@ struct AioHandler { QLIST_ENTRY(AioHandler) node; QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */ QLIST_ENTRY(AioHandler) node_deleted; + QLIST_ENTRY(AioHandler) node_poll; #ifdef CONFIG_LINUX_IO_URING QSLIST_ENTRY(AioHandler) node_submitted; unsigned flags; /* see fdmon-io_uring.c */ #endif + int64_t poll_idle_timeout; /* when to stop userspace polling */ bool is_external; }; diff --git a/util/trace-events b/util/trace-events index 83b6639018..0ce42822eb 100644 --- a/util/trace-events +++ b/util/trace-events @@ -5,6 +5,8 @@ run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_ run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64 poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 +poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x" +poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d" # async.c aio_co_schedule(void *ctx, void *co) "ctx %p co %p"