2009-08-20 16:58:35 +02:00
|
|
|
/*
|
|
|
|
* Linux native AIO support.
|
|
|
|
*
|
|
|
|
* Copyright (C) 2009 IBM, Corp.
|
|
|
|
* Copyright (C) 2009 Red Hat, Inc.
|
|
|
|
*
|
|
|
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
|
|
|
* See the COPYING file in the top-level directory.
|
|
|
|
*/
|
2016-01-18 19:01:42 +01:00
|
|
|
#include "qemu/osdep.h"
|
2012-12-17 18:19:44 +01:00
|
|
|
#include "block/aio.h"
|
2012-12-17 18:20:00 +01:00
|
|
|
#include "qemu/queue.h"
|
2014-08-06 17:18:07 +02:00
|
|
|
#include "block/block.h"
|
2012-06-09 10:57:37 +02:00
|
|
|
#include "block/raw-aio.h"
|
2012-12-17 18:20:00 +01:00
|
|
|
#include "qemu/event_notifier.h"
|
2014-08-06 17:18:07 +02:00
|
|
|
#include "qemu/coroutine.h"
|
linux-aio: properly bubble up errors from initialization
laio_init() can fail for a couple of reasons, which will lead to a NULL
pointer dereference in laio_attach_aio_context().
To solve this, add a aio_setup_linux_aio() function which is called
early in raw_open_common. If this fails, propagate the error up. The
signature of aio_get_linux_aio() was not modified, because it seems
preferable to return the actual errno from the possible failing
initialization calls.
Additionally, when the AioContext changes, we need to associate a
LinuxAioState with the new AioContext. Use the bdrv_attach_aio_context
callback and call the new aio_setup_linux_aio(), which will allocate a
new AioContext if needed, and return errors on failures. If it fails for
any reason, fallback to threaded AIO with an error message, as the
device is already in-use by the guest.
Add an assert that aio_get_linux_aio() cannot return NULL.
Signed-off-by: Nishanth Aravamudan <naravamudan@digitalocean.com>
Message-id: 20180622193700.6523-1-naravamudan@digitalocean.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2018-06-22 21:37:00 +02:00
|
|
|
#include "qapi/error.h"
|
2009-08-20 16:58:35 +02:00
|
|
|
|
|
|
|
#include <libaio.h>
|
|
|
|
|
|
|
|
/*
 * Queue size (per-device).
 *
 * XXX: eventually we need to communicate this to the guest and/or make it
 *      tunable by the guest.  If more requests than this are outstanding at
 *      a time, ioq_submit() stops submitting and leaves the excess requests
 *      on the pending queue (marking the queue as blocked) so that they can
 *      be submitted later from the completion path.  An -EAGAIN return from
 *      io_submit() is handled the same way.
 */
|
2020-01-07 07:01:01 +01:00
|
|
|
#define MAX_EVENTS 1024
|
2009-08-20 16:58:35 +02:00
|
|
|
|
2021-07-21 11:42:11 +02:00
|
|
|
/* Maximum number of requests in a batch. (default value) */
|
|
|
|
#define DEFAULT_MAX_BATCH 32
|
|
|
|
|
2009-08-20 16:58:35 +02:00
|
|
|
struct qemu_laiocb {
|
2014-08-06 17:18:07 +02:00
|
|
|
Coroutine *co;
|
2016-04-07 18:33:35 +02:00
|
|
|
LinuxAioState *ctx;
|
2009-08-20 16:58:35 +02:00
|
|
|
struct iocb iocb;
|
|
|
|
ssize_t ret;
|
|
|
|
size_t nbytes;
|
2011-10-13 15:42:52 +02:00
|
|
|
QEMUIOVector *qiov;
|
|
|
|
bool is_read;
|
2014-12-11 14:52:26 +01:00
|
|
|
QSIMPLEQ_ENTRY(qemu_laiocb) next;
|
2009-08-20 16:58:35 +02:00
|
|
|
};
|
|
|
|
|
2014-07-04 12:04:34 +02:00
|
|
|
typedef struct {
|
|
|
|
int plugged;
|
2016-07-13 15:03:24 +02:00
|
|
|
unsigned int in_queue;
|
|
|
|
unsigned int in_flight;
|
2014-12-11 14:52:27 +01:00
|
|
|
bool blocked;
|
2014-12-11 14:52:26 +01:00
|
|
|
QSIMPLEQ_HEAD(, qemu_laiocb) pending;
|
2014-07-04 12:04:34 +02:00
|
|
|
} LaioQueue;
|
|
|
|
|
2016-04-07 18:33:35 +02:00
|
|
|
struct LinuxAioState {
|
2016-07-04 18:33:20 +02:00
|
|
|
AioContext *aio_context;
|
|
|
|
|
2009-08-20 16:58:35 +02:00
|
|
|
io_context_t ctx;
|
2012-02-24 08:39:02 +01:00
|
|
|
EventNotifier e;
|
2014-07-04 12:04:34 +02:00
|
|
|
|
2017-02-13 14:52:31 +01:00
|
|
|
/* io queue for submit at batch. Protected by AioContext lock. */
|
2014-07-04 12:04:34 +02:00
|
|
|
LaioQueue io_q;
|
2014-08-04 17:56:33 +02:00
|
|
|
|
2017-02-13 14:52:31 +01:00
|
|
|
/* I/O completion processing. Only runs in I/O thread. */
|
2014-08-04 17:56:33 +02:00
|
|
|
QEMUBH *completion_bh;
|
|
|
|
int event_idx;
|
|
|
|
int event_max;
|
2009-08-20 16:58:35 +02:00
|
|
|
};
|
|
|
|
|
2016-04-07 18:33:35 +02:00
|
|
|
static void ioq_submit(LinuxAioState *s);
|
2014-12-11 14:52:26 +01:00
|
|
|
|
2009-08-20 16:58:35 +02:00
|
|
|
static inline ssize_t io_event_ret(struct io_event *ev)
|
|
|
|
{
|
|
|
|
return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
|
|
|
|
}
|
|
|
|
|
2009-10-22 17:54:41 +02:00
|
|
|
/*
|
2019-06-02 22:17:09 +02:00
|
|
|
* Completes an AIO request.
|
2009-10-22 17:54:41 +02:00
|
|
|
*/
|
2016-04-07 18:33:35 +02:00
|
|
|
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
|
2009-10-22 17:54:41 +02:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = laiocb->ret;
|
|
|
|
if (ret != -ECANCELED) {
|
2011-10-13 15:42:52 +02:00
|
|
|
if (ret == laiocb->nbytes) {
|
2009-10-22 17:54:41 +02:00
|
|
|
ret = 0;
|
2011-10-13 15:42:52 +02:00
|
|
|
} else if (ret >= 0) {
|
|
|
|
/* Short reads mean EOF, pad with zeros. */
|
|
|
|
if (laiocb->is_read) {
|
consolidate qemu_iovec_memset{,_skip}() into single function and use existing iov_memset()
This patch combines two functions into one, and replaces
the implementation with already existing iov_memset() from
iov.c.
The new prototype of qemu_iovec_memset():
size_t qemu_iovec_memset(qiov, size_t offset, int fillc, size_t bytes)
It is different from former qemu_iovec_memset_skip(), and
I want to make other functions to be consistent with it
too: first how much to skip, second what, and 3rd how many
of it. It also returns actual number of bytes filled in,
which may be less than the requested `bytes' if qiov is
smaller than offset+bytes, in the same way iov_memset()
does.
While at it, use utility function iov_memset() from
iov.h in posix-aio-compat.c, where qiov was used.
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2012-03-10 13:54:23 +01:00
|
|
|
qemu_iovec_memset(laiocb->qiov, ret, 0,
|
|
|
|
laiocb->qiov->size - ret);
|
2011-10-13 15:42:52 +02:00
|
|
|
} else {
|
2016-06-23 13:37:16 +02:00
|
|
|
ret = -ENOSPC;
|
2011-10-13 15:42:52 +02:00
|
|
|
}
|
|
|
|
}
|
2009-10-22 17:54:41 +02:00
|
|
|
}
|
|
|
|
|
2014-08-06 17:18:07 +02:00
|
|
|
laiocb->ret = ret;
|
2019-06-02 22:17:09 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the coroutine is already entered it must be in ioq_submit() and
|
|
|
|
* will notice laio->ret has been filled in when it eventually runs
|
|
|
|
* later. Coroutines cannot be entered recursively so avoid doing
|
|
|
|
* that!
|
|
|
|
*/
|
|
|
|
if (!qemu_coroutine_entered(laiocb->co)) {
|
|
|
|
aio_co_wake(laiocb->co);
|
2014-08-06 17:18:07 +02:00
|
|
|
}
|
2009-10-22 17:54:41 +02:00
|
|
|
}
|
|
|
|
|
2016-07-19 14:27:41 +02:00
|
|
|
/**
|
|
|
|
* aio_ring buffer which is shared between userspace and kernel.
|
|
|
|
*
|
|
|
|
* This copied from linux/fs/aio.c, common header does not exist
|
|
|
|
* but AIO exists for ages so we assume ABI is stable.
|
|
|
|
*/
|
|
|
|
struct aio_ring {
|
|
|
|
unsigned id; /* kernel internal index number */
|
|
|
|
unsigned nr; /* number of io_events */
|
|
|
|
unsigned head; /* Written to by userland or by kernel. */
|
|
|
|
unsigned tail;
|
|
|
|
|
|
|
|
unsigned magic;
|
|
|
|
unsigned compat_features;
|
|
|
|
unsigned incompat_features;
|
|
|
|
unsigned header_length; /* size of aio_ring */
|
|
|
|
|
misc: Replace zero-length arrays with flexible array member (automatic)
Description copied from Linux kernel commit from Gustavo A. R. Silva
(see [3]):
--v-- description start --v--
The current codebase makes use of the zero-length array language
extension to the C90 standard, but the preferred mechanism to
declare variable-length types such as these ones is a flexible
array member [1], introduced in C99:
struct foo {
int stuff;
struct boo array[];
};
By making use of the mechanism above, we will get a compiler
warning in case the flexible array does not occur last in the
structure, which will help us prevent some kind of undefined
behavior bugs from being unadvertenly introduced [2] to the
Linux codebase from now on.
--^-- description end --^--
Do the similar housekeeping in the QEMU codebase (which uses
C99 since commit 7be41675f7cb).
All these instances of code were found with the help of the
following Coccinelle script:
@@
identifier s, m, a;
type t, T;
@@
struct s {
...
t m;
- T a[0];
+ T a[];
};
@@
identifier s, m, a;
type t, T;
@@
struct s {
...
t m;
- T a[0];
+ T a[];
} QEMU_PACKED;
[1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=76497732932f
[3] https://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux.git/commit/?id=17642a2fbd2c1
Inspired-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2020-03-04 16:38:15 +01:00
|
|
|
struct io_event io_events[];
|
2016-07-19 14:27:41 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
/**
|
|
|
|
* io_getevents_peek:
|
|
|
|
* @ctx: AIO context
|
|
|
|
* @events: pointer on events array, output value
|
|
|
|
|
|
|
|
* Returns the number of completed events and sets a pointer
|
|
|
|
* on events array. This function does not update the internal
|
|
|
|
* ring buffer, only reads head and tail. When @events has been
|
|
|
|
* processed io_getevents_commit() must be called.
|
|
|
|
*/
|
|
|
|
static inline unsigned int io_getevents_peek(io_context_t ctx,
|
|
|
|
struct io_event **events)
|
|
|
|
{
|
|
|
|
struct aio_ring *ring = (struct aio_ring *)ctx;
|
|
|
|
unsigned int head = ring->head, tail = ring->tail;
|
|
|
|
unsigned int nr;
|
|
|
|
|
|
|
|
nr = tail >= head ? tail - head : ring->nr - head;
|
|
|
|
*events = ring->io_events + head;
|
|
|
|
/* To avoid speculative loads of s->events[i] before observing tail.
|
|
|
|
Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
|
|
|
|
smp_rmb();
|
|
|
|
|
|
|
|
return nr;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* io_getevents_commit:
|
|
|
|
* @ctx: AIO context
|
|
|
|
* @nr: the number of events on which head should be advanced
|
|
|
|
*
|
|
|
|
* Advances head of a ring buffer.
|
|
|
|
*/
|
|
|
|
static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
|
|
|
|
{
|
|
|
|
struct aio_ring *ring = (struct aio_ring *)ctx;
|
|
|
|
|
|
|
|
if (nr) {
|
|
|
|
ring->head = (ring->head + nr) % ring->nr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* io_getevents_advance_and_peek:
|
|
|
|
* @ctx: AIO context
|
|
|
|
* @events: pointer on events array, output value
|
|
|
|
* @nr: the number of events on which head should be advanced
|
|
|
|
*
|
|
|
|
* Advances head of a ring buffer and returns number of elements left.
|
|
|
|
*/
|
|
|
|
static inline unsigned int
|
|
|
|
io_getevents_advance_and_peek(io_context_t ctx,
|
|
|
|
struct io_event **events,
|
|
|
|
unsigned int nr)
|
|
|
|
{
|
|
|
|
io_getevents_commit(ctx, nr);
|
|
|
|
return io_getevents_peek(ctx, events);
|
|
|
|
}
|
|
|
|
|
2016-07-19 14:27:42 +02:00
|
|
|
/**
|
|
|
|
* qemu_laio_process_completions:
|
|
|
|
* @s: AIO state
|
|
|
|
*
|
|
|
|
* Fetches completed I/O requests and invokes their callbacks.
|
2014-08-04 17:56:33 +02:00
|
|
|
*
|
|
|
|
* The function is somewhat tricky because it supports nested event loops, for
|
|
|
|
* example when a request callback invokes aio_poll(). In order to do this,
|
2016-07-19 14:27:42 +02:00
|
|
|
* indices are kept in LinuxAioState. Function schedules BH completion so it
|
|
|
|
* can be called again in a nested event loop. When there are no events left
|
|
|
|
* to complete the BH is being canceled.
|
2014-08-04 17:56:33 +02:00
|
|
|
*/
|
2016-07-19 14:27:42 +02:00
|
|
|
static void qemu_laio_process_completions(LinuxAioState *s)
|
2009-08-20 16:58:35 +02:00
|
|
|
{
|
2016-07-19 14:27:41 +02:00
|
|
|
struct io_event *events;
|
2009-08-20 16:58:35 +02:00
|
|
|
|
2014-08-04 17:56:33 +02:00
|
|
|
/* Reschedule so nested event loops see currently pending completions */
|
|
|
|
qemu_bh_schedule(s->completion_bh);
|
2009-08-20 16:58:35 +02:00
|
|
|
|
2016-07-19 14:27:41 +02:00
|
|
|
while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
|
|
|
|
s->event_idx))) {
|
|
|
|
for (s->event_idx = 0; s->event_idx < s->event_max; ) {
|
|
|
|
struct iocb *iocb = events[s->event_idx].obj;
|
|
|
|
struct qemu_laiocb *laiocb =
|
2014-08-04 17:56:33 +02:00
|
|
|
container_of(iocb, struct qemu_laiocb, iocb);
|
|
|
|
|
2016-07-19 14:27:41 +02:00
|
|
|
laiocb->ret = io_event_ret(&events[s->event_idx]);
|
2014-08-04 17:56:33 +02:00
|
|
|
|
2016-07-19 14:27:41 +02:00
|
|
|
/* Change counters one-by-one because we can be nested. */
|
|
|
|
s->io_q.in_flight--;
|
|
|
|
s->event_idx++;
|
|
|
|
qemu_laio_process_completion(laiocb);
|
|
|
|
}
|
2014-08-04 17:56:33 +02:00
|
|
|
}
|
2014-12-11 14:52:26 +01:00
|
|
|
|
2016-07-19 14:27:41 +02:00
|
|
|
qemu_bh_cancel(s->completion_bh);
|
|
|
|
|
|
|
|
/* If we are nested we have to notify the level above that we are done
|
|
|
|
* by setting event_max to zero, upper level will then jump out of it's
|
|
|
|
* own `for` loop. If we are the last all counters droped to zero. */
|
|
|
|
s->event_max = 0;
|
|
|
|
s->event_idx = 0;
|
2016-07-19 14:27:42 +02:00
|
|
|
}
|
2016-07-19 14:27:41 +02:00
|
|
|
|
2016-07-19 14:27:42 +02:00
|
|
|
/*
 * Process all available completions and then, unless the queue is plugged
 * or has nothing pending, submit queued requests.  The whole sequence
 * runs with the AioContext acquired.
 */
static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
{
    bool want_submit;

    aio_context_acquire(s->aio_context);

    qemu_laio_process_completions(s);

    /* Evaluated only after completions ran, as they may change the queue. */
    want_submit = !s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending);
    if (want_submit) {
        ioq_submit(s);
    }

    aio_context_release(s->aio_context);
}
|
|
|
|
|
2016-07-19 14:27:42 +02:00
|
|
|
static void qemu_laio_completion_bh(void *opaque)
|
|
|
|
{
|
|
|
|
LinuxAioState *s = opaque;
|
|
|
|
|
|
|
|
qemu_laio_process_completions_and_submit(s);
|
|
|
|
}
|
|
|
|
|
2014-08-04 17:56:33 +02:00
|
|
|
/*
 * Event notifier read handler: when the notifier embedded in the
 * LinuxAioState was set, clear it and process completions.
 */
static void qemu_laio_completion_cb(EventNotifier *e)
{
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    if (!event_notifier_test_and_clear(&s->e)) {
        return;
    }

    qemu_laio_process_completions_and_submit(s);
}
|
|
|
|
|
2016-12-01 20:26:44 +01:00
|
|
|
static bool qemu_laio_poll_cb(void *opaque)
|
|
|
|
{
|
|
|
|
EventNotifier *e = opaque;
|
|
|
|
LinuxAioState *s = container_of(e, LinuxAioState, e);
|
|
|
|
struct io_event *events;
|
|
|
|
|
aio-posix: split poll check from ready handler
Adaptive polling measures the execution time of the polling check plus
handlers called when a polled event becomes ready. Handlers can take a
significant amount of time, making it look like polling was running for
a long time when in fact the event handler was running for a long time.
For example, on Linux the io_submit(2) syscall invoked when a virtio-blk
device's virtqueue becomes ready can take 10s of microseconds. This
can exceed the default polling interval (32 microseconds) and cause
adaptive polling to stop polling.
By excluding the handler's execution time from the polling check we make
the adaptive polling calculation more accurate. As a result, the event
loop now stays in polling mode where previously it would have fallen
back to file descriptor monitoring.
The following data was collected with virtio-blk num-queues=2
event_idx=off using an IOThread. Before:
168k IOPS, IOThread syscalls:
9837.115 ( 0.020 ms): IO iothread1/620155 io_submit(ctx_id: 140512552468480, nr: 16, iocbpp: 0x7fcb9f937db0) = 16
9837.158 ( 0.002 ms): IO iothread1/620155 write(fd: 103, buf: 0x556a2ef71b88, count: 8) = 8
9837.161 ( 0.001 ms): IO iothread1/620155 write(fd: 104, buf: 0x556a2ef71b88, count: 8) = 8
9837.163 ( 0.001 ms): IO iothread1/620155 ppoll(ufds: 0x7fcb90002800, nfds: 4, tsp: 0x7fcb9f1342d0, sigsetsize: 8) = 3
9837.164 ( 0.001 ms): IO iothread1/620155 read(fd: 107, buf: 0x7fcb9f939cc0, count: 512) = 8
9837.174 ( 0.001 ms): IO iothread1/620155 read(fd: 105, buf: 0x7fcb9f939cc0, count: 512) = 8
9837.176 ( 0.001 ms): IO iothread1/620155 read(fd: 106, buf: 0x7fcb9f939cc0, count: 512) = 8
9837.209 ( 0.035 ms): IO iothread1/620155 io_submit(ctx_id: 140512552468480, nr: 32, iocbpp: 0x7fca7d0cebe0) = 32
174k IOPS (+3.6%), IOThread syscalls:
9809.566 ( 0.036 ms): IO iothread1/623061 io_submit(ctx_id: 140539805028352, nr: 32, iocbpp: 0x7fd0cdd62be0) = 32
9809.625 ( 0.001 ms): IO iothread1/623061 write(fd: 103, buf: 0x5647cfba5f58, count: 8) = 8
9809.627 ( 0.002 ms): IO iothread1/623061 write(fd: 104, buf: 0x5647cfba5f58, count: 8) = 8
9809.663 ( 0.036 ms): IO iothread1/623061 io_submit(ctx_id: 140539805028352, nr: 32, iocbpp: 0x7fd0d0388b50) = 32
Notice that ppoll(2) and eventfd read(2) syscalls are eliminated because
the IOThread stays in polling mode instead of falling back to file
descriptor monitoring.
As usual, polling is not implemented on Windows so this patch ignores
the new io_poll_read() callback in aio-win32.c.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Message-id: 20211207132336.36627-2-stefanha@redhat.com
[Fixed up aio_set_event_notifier() calls in
tests/unit/test-fdmon-epoll.c added after this series was queued.
--Stefan]
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-12-07 14:23:31 +01:00
|
|
|
return io_getevents_peek(s->ctx, &events);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Polling ready handler: @opaque is the EventNotifier embedded in a
 * LinuxAioState; process that state's completions and submit queued
 * requests.
 */
static void qemu_laio_poll_ready(EventNotifier *opaque)
{
    LinuxAioState *s = container_of(opaque, LinuxAioState, e);

    qemu_laio_process_completions_and_submit(s);
}
|
|
|
|
|
2014-07-04 12:04:34 +02:00
|
|
|
/*
 * Reset @io_q to the empty state: unplugged, not blocked, nothing queued
 * and nothing in flight.
 */
static void ioq_init(LaioQueue *io_q)
{
    io_q->plugged = 0;
    io_q->blocked = false;
    io_q->in_queue = 0;
    io_q->in_flight = 0;
    QSIMPLEQ_INIT(&io_q->pending);
}
|
|
|
|
|
2016-04-07 18:33:35 +02:00
|
|
|
/*
 * Submit as many pending requests as possible to the kernel.
 *
 * Requests are taken from the head of s->io_q.pending in batches so that
 * in_flight never exceeds MAX_EVENTS.  Afterwards io_q.blocked records
 * whether requests were left on the queue, and completions are harvested
 * right away if anything is in flight.
 */
static void ioq_submit(LinuxAioState *s)
{
    int ret, len;
    struct qemu_laiocb *aiocb;
    struct iocb *iocbs[MAX_EVENTS];
    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    for (;;) {
        if (s->io_q.in_flight >= MAX_EVENTS) {
            break;
        }

        /* Collect a batch that fits in the remaining event slots. */
        len = 0;
        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
            iocbs[len++] = &aiocb->iocb;
            if (s->io_q.in_flight + len >= MAX_EVENTS) {
                break;
            }
        }

        ret = io_submit(s->ctx, len, iocbs);
        if (ret == -EAGAIN) {
            break;
        }
        if (ret < 0) {
            /*
             * Fail the first request.  The rest stays on the pending
             * queue to be retried by a later call.
             *
             * NOTE(review): this leaves the loop, exactly as the
             * previous "continue" inside do { } while (ret == len && ...)
             * did (ret < 0 can never equal len).
             */
            aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
            QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
            s->io_q.in_queue--;
            aiocb->ret = ret;
            qemu_laio_process_completion(aiocb);
            break;
        }

        /* The first @ret requests were accepted: move them off the queue. */
        s->io_q.in_flight += ret;
        s->io_q.in_queue  -= ret;
        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);

        /* Go around again only after a full batch with more work queued. */
        if (ret != len || QSIMPLEQ_EMPTY(&s->io_q.pending)) {
            break;
        }
    }
    s->io_q.blocked = (s->io_q.in_queue > 0);

    if (s->io_q.in_flight) {
        /*
         * We can try to complete something just right away if there are
         * still requests in-flight.
         */
        qemu_laio_process_completions(s);
        /*
         * Even if we have completed everything (in_flight == 0), the queue
         * can still have pending requests (in_queue > 0).  We do not
         * attempt to repeat submission to avoid IO hang.  The reason is
         * simple: s->e is still set and the completion callback will be
         * called shortly and all pending requests will be submitted from
         * there.
         */
    }
}
|
|
|
|
|
2021-10-26 18:23:45 +02:00
|
|
|
/*
 * Compute the batch size limit that applies to a request from a device
 * whose own limit is @dev_max_batch (0 meaning "no device limit").
 *
 * The result is the smallest non-zero value among the AioContext's
 * aio_max_batch (or DEFAULT_MAX_BATCH when that is 0), @dev_max_batch and
 * the number of event slots still free.
 */
static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
{
    uint64_t limit = s->aio_context->aio_max_batch;

    if (!limit) {
        limit = DEFAULT_MAX_BATCH;
    }

    /*
     * AIO context can be shared between multiple block devices, so
     * `dev_max_batch` allows reducing the batch size for latency-sensitive
     * devices.
     */
    limit = MIN_NON_ZERO(dev_max_batch, limit);

    /* limit the batch with the number of available events */
    return MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, limit);
}
|
|
|
|
|
2016-04-07 18:33:35 +02:00
|
|
|
/*
 * Start (or nest) a plugged section on @s by raising the plug counter.
 * @bs is not used here.
 */
void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
{
    s->io_q.plugged += 1;
}
|
|
|
|
|
2021-10-26 18:23:46 +02:00
|
|
|
/*
 * End one level of plugging on @s and submit queued requests if needed.
 *
 * @bs:            not used here
 * @s:             Linux AIO state; must currently be plugged
 * @dev_max_batch: per-device batch limit (0 for none), see laio_max_batch()
 *
 * The plug counter is always decremented exactly once per call.  It used
 * to be decremented on the right-hand side of the "||" below, so it was
 * skipped whenever the batch-size check was already true, leaving the
 * counter unbalanced against laio_io_plug().
 */
void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s,
                    uint64_t dev_max_batch)
{
    assert(s->io_q.plugged);
    s->io_q.plugged--;

    /*
     * Submit when the queue has reached this device's batch limit (it may
     * have been filled under a different limit), or when the last plug
     * was dropped and there is unblocked work pending.
     */
    if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
        (!s->io_q.plugged &&
         !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) {
        ioq_submit(s);
    }
}
|
|
|
|
|
2014-08-06 17:18:07 +02:00
|
|
|
static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
|
2021-10-26 18:23:45 +02:00
|
|
|
int type, uint64_t dev_max_batch)
|
2009-08-20 16:58:35 +02:00
|
|
|
{
|
2014-08-06 17:18:07 +02:00
|
|
|
LinuxAioState *s = laiocb->ctx;
|
|
|
|
struct iocb *iocbs = &laiocb->iocb;
|
|
|
|
QEMUIOVector *qiov = laiocb->qiov;
|
2009-08-20 16:58:35 +02:00
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case QEMU_AIO_WRITE:
|
|
|
|
io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
|
2018-12-13 23:37:37 +01:00
|
|
|
break;
|
2009-08-20 16:58:35 +02:00
|
|
|
case QEMU_AIO_READ:
|
|
|
|
io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
|
2018-12-13 23:37:37 +01:00
|
|
|
break;
|
2011-08-30 09:46:11 +02:00
|
|
|
/* Currently Linux kernel does not support other operations */
|
2009-08-20 16:58:35 +02:00
|
|
|
default:
|
|
|
|
fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
|
|
|
|
__func__, type);
|
2014-08-06 17:18:07 +02:00
|
|
|
return -EIO;
|
2009-08-20 16:58:35 +02:00
|
|
|
}
|
2012-02-24 08:39:02 +01:00
|
|
|
io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
|
2009-08-20 16:58:35 +02:00
|
|
|
|
2014-12-11 14:52:26 +01:00
|
|
|
QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
|
2016-07-13 15:03:24 +02:00
|
|
|
s->io_q.in_queue++;
|
2014-12-11 14:52:27 +01:00
|
|
|
if (!s->io_q.blocked &&
|
2016-07-13 15:03:24 +02:00
|
|
|
(!s->io_q.plugged ||
|
2021-10-26 18:23:45 +02:00
|
|
|
s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) {
|
2014-12-11 14:52:26 +01:00
|
|
|
ioq_submit(s);
|
2014-07-04 12:04:34 +02:00
|
|
|
}
|
2009-08-20 16:58:35 +02:00
|
|
|
|
2014-08-06 17:18:07 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Issue one read or write through Linux AIO from coroutine context and
 * return its result.
 *
 * @bs:            unused
 * @s:             Linux AIO state the request is queued on
 * @fd:            file descriptor the I/O is issued against
 * @offset:        byte offset of the transfer
 * @qiov:          I/O vector describing the buffers
 * @type:          QEMU_AIO_READ or QEMU_AIO_WRITE
 * @dev_max_batch: per-device batch limit
 *
 * Returns the negative error from laio_do_submit() if queuing fails,
 * otherwise the final value of the request's ret field.
 */
int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
                                uint64_t offset, QEMUIOVector *qiov, int type,
                                uint64_t dev_max_batch)
{
    /* The request descriptor lives on this coroutine's stack */
    struct qemu_laiocb req = {
        .ctx        = s,
        .co         = qemu_coroutine_self(),
        .qiov       = qiov,
        .nbytes     = qiov->size,
        .is_read    = (type == QEMU_AIO_READ),
        .ret        = -EINPROGRESS,
    };
    int rc = laio_do_submit(fd, &req, offset, type, dev_max_batch);

    if (rc < 0) {
        return rc;
    }

    /*
     * Only yield while ret still holds the -EINPROGRESS marker; if it has
     * already been overwritten by the time submission returns, there is
     * nothing to wait for.
     */
    if (req.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    return req.ret;
}
|
|
|
|
|
2016-04-07 18:33:35 +02:00
|
|
|
/*
 * Disconnect @s from @old_context.
 *
 * The event notifier is re-registered on @old_context with all three
 * callbacks set to NULL (presumably removing the handlers -- confirm against
 * aio_set_event_notifier()), the completion bottom half is deleted, and the
 * state's AioContext pointer is cleared.
 */
void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
{
    aio_set_event_notifier(old_context, &s->e, false,
                           NULL,    /* read handler */
                           NULL,    /* poll check */
                           NULL);   /* poll-ready handler */
    qemu_bh_delete(s->completion_bh);

    /* No AioContext is associated with this state any more */
    s->aio_context = NULL;
}
|
|
|
|
|
2016-04-07 18:33:35 +02:00
|
|
|
/*
 * Bind @s to @new_context.
 *
 * Records the context in the state, creates the completion bottom half in
 * that context, and registers the event notifier there together with the
 * completion, poll and poll-ready callbacks.
 */
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
{
    s->aio_context = new_context;

    /* Bottom half that runs qemu_laio_completion_bh() with @s as argument */
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);

    aio_set_event_notifier(new_context, &s->e, false,
                           qemu_laio_completion_cb, qemu_laio_poll_cb,
                           qemu_laio_poll_ready);
}
|
|
|
|
|
linux-aio: properly bubble up errors from initialization
laio_init() can fail for a couple of reasons, which will lead to a NULL
pointer dereference in laio_attach_aio_context().
To solve this, add an aio_setup_linux_aio() function which is called
early in raw_open_common. If this fails, propagate the error up. The
signature of aio_get_linux_aio() was not modified, because it seems
preferable to return the actual errno from the possible failing
initialization calls.
Additionally, when the AioContext changes, we need to associate a
LinuxAioState with the new AioContext. Use the bdrv_attach_aio_context
callback and call the new aio_setup_linux_aio(), which will allocate a
new AioContext if needed, and return errors on failures. If it fails for
any reason, fallback to threaded AIO with an error message, as the
device is already in-use by the guest.
Add an assert that aio_get_linux_aio() cannot return NULL.
Signed-off-by: Nishanth Aravamudan <naravamudan@digitalocean.com>
Message-id: 20180622193700.6523-1-naravamudan@digitalocean.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2018-06-22 21:37:00 +02:00
|
|
|
/*
 * Allocate and initialise a Linux AIO state.
 *
 * @errp: set to a description of the failure when NULL is returned
 *
 * Creates the event notifier, sets up a kernel AIO context with room for
 * MAX_EVENTS events and initialises the request queue.
 *
 * Returns the new state, or NULL on failure with @errp set.  On failure
 * everything acquired so far is released again.
 */
LinuxAioState *laio_init(Error **errp)
{
    int rc;
    LinuxAioState *s;

    s = g_malloc0(sizeof(*s));

    rc = event_notifier_init(&s->e, false);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to initialize event notifier");
        goto out_free_state;
    }

    rc = io_setup(MAX_EVENTS, &s->ctx);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to create linux AIO context");
        goto out_close_efd;
    }

    ioq_init(&s->io_q);

    return s;

    /* Error unwinding: release resources in reverse order of acquisition */
out_close_efd:
    event_notifier_cleanup(&s->e);
out_free_state:
    g_free(s);
    return NULL;
}
|
2014-05-08 16:34:48 +02:00
|
|
|
|
2016-04-07 18:33:35 +02:00
|
|
|
/*
 * Tear down a state created by laio_init() and free it.
 *
 * The event notifier is cleaned up, the kernel AIO context is destroyed and
 * @s is freed.  A failing io_destroy() is only reported on stderr; @s is
 * freed regardless.
 */
void laio_cleanup(LinuxAioState *s)
{
    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
        /* %p requires a void * argument, hence the explicit cast */
        fprintf(stderr, "%s: destroy AIO context %p failed\n",
                __func__, (void *)&s->ctx);
    }
    g_free(s);
}
|