2011-01-17 17:08:14 +01:00
|
|
|
/*
|
|
|
|
* QEMU coroutines
|
|
|
|
*
|
|
|
|
* Copyright IBM, Corp. 2011
|
|
|
|
*
|
|
|
|
* Authors:
|
|
|
|
* Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
|
|
|
|
* Kevin Wolf <kwolf@redhat.com>
|
|
|
|
*
|
|
|
|
* This work is licensed under the terms of the GNU LGPL, version 2 or later.
|
|
|
|
* See the COPYING.LIB file in the top-level directory.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2016-01-29 18:49:55 +01:00
|
|
|
#include "qemu/osdep.h"
|
2011-01-17 17:08:14 +01:00
|
|
|
#include "trace.h"
|
|
|
|
#include "qemu-common.h"
|
2013-05-17 15:51:25 +02:00
|
|
|
#include "qemu/thread.h"
|
coroutine: rewrite pool to avoid mutex
This patch removes the mutex by using fancy lock-free manipulation of
the pool. Lock-free stacks and queues are not hard, but they can suffer
from the ABA problem so they are better avoided unless you have some
deferred reclamation scheme like RCU. Otherwise you have to stick
with adding to a list, and emptying it completely. This is what this
patch does, by coupling a lock-free global list of available coroutines
with per-CPU lists that are actually used on coroutine creation.
Whenever the destruction pool is big enough, the next thread that runs
out of coroutines will steal the whole destruction pool. This is positive
in two ways:
1) the allocation does not have to do any atomic operation in the fast
path, it's entirely using thread-local storage. Once every POOL_BATCH_SIZE
allocations it will do a single atomic_xchg. Release does an atomic_cmpxchg
loop, that hopefully doesn't cause any starvation, and an atomic_inc.
A later patch will also remove atomic operations from the release path,
and try to avoid the atomic_xchg altogether---succeeding in doing so if
all devices either use ioeventfd or are not submitting requests actively.
2) in theory this should be completely adaptive. The number of coroutines
around should be a little more than POOL_BATCH_SIZE * number of allocating
threads; so this also empties qemu_coroutine_adjust_pool_size. (The previous
pool size was POOL_BATCH_SIZE * number of block backends, so it was a bit
more generous. But if you actually have many high-iodepth disks, it's better
to put them in different iothreads, which will also use separate thread
pools and aio=native file descriptors).
This speeds up perf/cost (in tests/test-coroutine) by a factor of ~1.33.
No matter if we end with some kind of coroutine bypass scheme or not,
it cannot hurt to optimize hot code.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 1417518350-6167-6-git-send-email-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-12-02 12:05:48 +01:00
|
|
|
#include "qemu/atomic.h"
|
2015-09-01 15:48:02 +02:00
|
|
|
#include "qemu/coroutine.h"
|
|
|
|
#include "qemu/coroutine_int.h"
|
2017-02-13 14:52:19 +01:00
|
|
|
#include "block/aio.h"
|
2011-01-17 17:08:14 +01:00
|
|
|
|
2013-02-19 11:59:09 +01:00
|
|
|
enum {
|
coroutine: rewrite pool to avoid mutex
This patch removes the mutex by using fancy lock-free manipulation of
the pool. Lock-free stacks and queues are not hard, but they can suffer
from the ABA problem so they are better avoided unless you have some
deferred reclamation scheme like RCU. Otherwise you have to stick
with adding to a list, and emptying it completely. This is what this
patch does, by coupling a lock-free global list of available coroutines
with per-CPU lists that are actually used on coroutine creation.
Whenever the destruction pool is big enough, the next thread that runs
out of coroutines will steal the whole destruction pool. This is positive
in two ways:
1) the allocation does not have to do any atomic operation in the fast
path, it's entirely using thread-local storage. Once every POOL_BATCH_SIZE
allocations it will do a single atomic_xchg. Release does an atomic_cmpxchg
loop, that hopefully doesn't cause any starvation, and an atomic_inc.
A later patch will also remove atomic operations from the release path,
and try to avoid the atomic_xchg altogether---succeeding in doing so if
all devices either use ioeventfd or are not submitting requests actively.
2) in theory this should be completely adaptive. The number of coroutines
around should be a little more than POOL_BATCH_SIZE * number of allocating
threads; so this also empties qemu_coroutine_adjust_pool_size. (The previous
pool size was POOL_BATCH_SIZE * number of block backends, so it was a bit
more generous. But if you actually have many high-iodepth disks, it's better
to put them in different iothreads, which will also use separate thread
pools and aio=native file descriptors).
This speeds up perf/cost (in tests/test-coroutine) by a factor of ~1.33.
No matter if we end with some kind of coroutine bypass scheme or not,
it cannot hurt to optimize hot code.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 1417518350-6167-6-git-send-email-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-12-02 12:05:48 +01:00
|
|
|
POOL_BATCH_SIZE = 64,
|
2013-02-19 11:59:09 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
/** Free list to speed up creation */
|
coroutine: rewrite pool to avoid mutex
This patch removes the mutex by using fancy lock-free manipulation of
the pool. Lock-free stacks and queues are not hard, but they can suffer
from the ABA problem so they are better avoided unless you have some
deferred reclamation scheme like RCU. Otherwise you have to stick
with adding to a list, and emptying it completely. This is what this
patch does, by coupling a lock-free global list of available coroutines
with per-CPU lists that are actually used on coroutine creation.
Whenever the destruction pool is big enough, the next thread that runs
out of coroutines will steal the whole destruction pool. This is positive
in two ways:
1) the allocation does not have to do any atomic operation in the fast
path, it's entirely using thread-local storage. Once every POOL_BATCH_SIZE
allocations it will do a single atomic_xchg. Release does an atomic_cmpxchg
loop, that hopefully doesn't cause any starvation, and an atomic_inc.
A later patch will also remove atomic operations from the release path,
and try to avoid the atomic_xchg altogether---succeeding in doing so if
all devices either use ioeventfd or are not submitting requests actively.
2) in theory this should be completely adaptive. The number of coroutines
around should be a little more than POOL_BATCH_SIZE * number of allocating
threads; so this also empties qemu_coroutine_adjust_pool_size. (The previous
pool size was POOL_BATCH_SIZE * number of block backends, so it was a bit
more generous. But if you actually have many high-iodepth disks, it's better
to put them in different iothreads, which will also use separate thread
pools and aio=native file descriptors).
This speeds up perf/cost (in tests/test-coroutine) by a factor of ~1.33.
No matter if we end with some kind of coroutine bypass scheme or not,
it cannot hurt to optimize hot code.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 1417518350-6167-6-git-send-email-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-12-02 12:05:48 +01:00
|
|
|
static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
|
|
|
|
static unsigned int release_pool_size;
|
|
|
|
static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
|
2014-12-02 12:05:50 +01:00
|
|
|
static __thread unsigned int alloc_pool_size;
|
coroutine: rewrite pool to avoid mutex
This patch removes the mutex by using fancy lock-free manipulation of
the pool. Lock-free stacks and queues are not hard, but they can suffer
from the ABA problem so they are better avoided unless you have some
deferred reclamation scheme like RCU. Otherwise you have to stick
with adding to a list, and emptying it completely. This is what this
patch does, by coupling a lock-free global list of available coroutines
with per-CPU lists that are actually used on coroutine creation.
Whenever the destruction pool is big enough, the next thread that runs
out of coroutines will steal the whole destruction pool. This is positive
in two ways:
1) the allocation does not have to do any atomic operation in the fast
path, it's entirely using thread-local storage. Once every POOL_BATCH_SIZE
allocations it will do a single atomic_xchg. Release does an atomic_cmpxchg
loop, that hopefully doesn't cause any starvation, and an atomic_inc.
A later patch will also remove atomic operations from the release path,
and try to avoid the atomic_xchg altogether---succeeding in doing so if
all devices either use ioeventfd or are not submitting requests actively.
2) in theory this should be completely adaptive. The number of coroutines
around should be a little more than POOL_BATCH_SIZE * number of allocating
threads; so this also empties qemu_coroutine_adjust_pool_size. (The previous
pool size was POOL_BATCH_SIZE * number of block backends, so it was a bit
more generous. But if you actually have many high-iodepth disks, it's better
to put them in different iothreads, which will also use separate thread
pools and aio=native file descriptors).
This speeds up perf/cost (in tests/test-coroutine) by a factor of ~1.33.
No matter if we end with some kind of coroutine bypass scheme or not,
it cannot hurt to optimize hot code.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 1417518350-6167-6-git-send-email-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-12-02 12:05:48 +01:00
|
|
|
static __thread Notifier coroutine_pool_cleanup_notifier;
|
|
|
|
|
|
|
|
static void coroutine_pool_cleanup(Notifier *n, void *value)
|
|
|
|
{
|
|
|
|
Coroutine *co;
|
|
|
|
Coroutine *tmp;
|
|
|
|
|
|
|
|
QSLIST_FOREACH_SAFE(co, &alloc_pool, pool_next, tmp) {
|
|
|
|
QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
|
|
|
|
qemu_coroutine_delete(co);
|
|
|
|
}
|
|
|
|
}
|
2013-02-19 11:59:09 +01:00
|
|
|
|
coroutine: move entry argument to qemu_coroutine_create
In practice the entry argument is always known at creation time, and
it is confusing that sometimes qemu_coroutine_enter is used with a
non-NULL argument to re-enter a coroutine (this happens in
block/sheepdog.c and tests/test-coroutine.c). So pass the opaque value
at creation time, for consistency with e.g. aio_bh_new.
Mostly done with the following semantic patch:
@ entry1 @
expression entry, arg, co;
@@
- co = qemu_coroutine_create(entry);
+ co = qemu_coroutine_create(entry, arg);
...
- qemu_coroutine_enter(co, arg);
+ qemu_coroutine_enter(co);
@ entry2 @
expression entry, arg;
identifier co;
@@
- Coroutine *co = qemu_coroutine_create(entry);
+ Coroutine *co = qemu_coroutine_create(entry, arg);
...
- qemu_coroutine_enter(co, arg);
+ qemu_coroutine_enter(co);
@ entry3 @
expression entry, arg;
@@
- qemu_coroutine_enter(qemu_coroutine_create(entry), arg);
+ qemu_coroutine_enter(qemu_coroutine_create(entry, arg));
@ reentry @
expression co;
@@
- qemu_coroutine_enter(co, NULL);
+ qemu_coroutine_enter(co);
except for the aforementioned few places where the semantic patch
stumbled (as expected) and for test_co_queue, which would otherwise
produce an uninitialized variable warning.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-07-04 19:10:01 +02:00
|
|
|
Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
|
2011-01-17 17:08:14 +01:00
|
|
|
{
|
2013-09-11 16:42:35 +02:00
|
|
|
Coroutine *co = NULL;
|
2013-02-19 11:59:09 +01:00
|
|
|
|
2013-09-11 16:42:35 +02:00
|
|
|
if (CONFIG_COROUTINE_POOL) {
|
coroutine: rewrite pool to avoid mutex
This patch removes the mutex by using fancy lock-free manipulation of
the pool. Lock-free stacks and queues are not hard, but they can suffer
from the ABA problem so they are better avoided unless you have some
deferred reclamation scheme like RCU. Otherwise you have to stick
with adding to a list, and emptying it completely. This is what this
patch does, by coupling a lock-free global list of available coroutines
with per-CPU lists that are actually used on coroutine creation.
Whenever the destruction pool is big enough, the next thread that runs
out of coroutines will steal the whole destruction pool. This is positive
in two ways:
1) the allocation does not have to do any atomic operation in the fast
path, it's entirely using thread-local storage. Once every POOL_BATCH_SIZE
allocations it will do a single atomic_xchg. Release does an atomic_cmpxchg
loop, that hopefully doesn't cause any starvation, and an atomic_inc.
A later patch will also remove atomic operations from the release path,
and try to avoid the atomic_xchg altogether---succeeding in doing so if
all devices either use ioeventfd or are not submitting requests actively.
2) in theory this should be completely adaptive. The number of coroutines
around should be a little more than POOL_BATCH_SIZE * number of allocating
threads; so this also empties qemu_coroutine_adjust_pool_size. (The previous
pool size was POOL_BATCH_SIZE * number of block backends, so it was a bit
more generous. But if you actually have many high-iodepth disks, it's better
to put them in different iothreads, which will also use separate thread
pools and aio=native file descriptors).
This speeds up perf/cost (in tests/test-coroutine) by a factor of ~1.33.
No matter if we end with some kind of coroutine bypass scheme or not,
it cannot hurt to optimize hot code.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 1417518350-6167-6-git-send-email-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-12-02 12:05:48 +01:00
|
|
|
co = QSLIST_FIRST(&alloc_pool);
|
|
|
|
if (!co) {
|
|
|
|
if (release_pool_size > POOL_BATCH_SIZE) {
|
|
|
|
/* Slow path; a good place to register the destructor, too. */
|
|
|
|
if (!coroutine_pool_cleanup_notifier.notify) {
|
|
|
|
coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup;
|
|
|
|
qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This is not exact; there could be a little skew between
|
|
|
|
* release_pool_size and the actual size of release_pool. But
|
|
|
|
* it is just a heuristic, it does not need to be perfect.
|
|
|
|
*/
|
2014-12-02 12:05:50 +01:00
|
|
|
alloc_pool_size = atomic_xchg(&release_pool_size, 0);
|
coroutine: rewrite pool to avoid mutex
This patch removes the mutex by using fancy lock-free manipulation of
the pool. Lock-free stacks and queues are not hard, but they can suffer
from the ABA problem so they are better avoided unless you have some
deferred reclamation scheme like RCU. Otherwise you have to stick
with adding to a list, and emptying it completely. This is what this
patch does, by coupling a lock-free global list of available coroutines
with per-CPU lists that are actually used on coroutine creation.
Whenever the destruction pool is big enough, the next thread that runs
out of coroutines will steal the whole destruction pool. This is positive
in two ways:
1) the allocation does not have to do any atomic operation in the fast
path, it's entirely using thread-local storage. Once every POOL_BATCH_SIZE
allocations it will do a single atomic_xchg. Release does an atomic_cmpxchg
loop, that hopefully doesn't cause any starvation, and an atomic_inc.
A later patch will also remove atomic operations from the release path,
and try to avoid the atomic_xchg altogether---succeeding in doing so if
all devices either use ioeventfd or are not submitting requests actively.
2) in theory this should be completely adaptive. The number of coroutines
around should be a little more than POOL_BATCH_SIZE * number of allocating
threads; so this also empties qemu_coroutine_adjust_pool_size. (The previous
pool size was POOL_BATCH_SIZE * number of block backends, so it was a bit
more generous. But if you actually have many high-iodepth disks, it's better
to put them in different iothreads, which will also use separate thread
pools and aio=native file descriptors).
This speeds up perf/cost (in tests/test-coroutine) by a factor of ~1.33.
No matter if we end with some kind of coroutine bypass scheme or not,
it cannot hurt to optimize hot code.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 1417518350-6167-6-git-send-email-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-12-02 12:05:48 +01:00
|
|
|
QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
|
|
|
|
co = QSLIST_FIRST(&alloc_pool);
|
|
|
|
}
|
|
|
|
}
|
2013-09-11 16:42:35 +02:00
|
|
|
if (co) {
|
coroutine: rewrite pool to avoid mutex
This patch removes the mutex by using fancy lock-free manipulation of
the pool. Lock-free stacks and queues are not hard, but they can suffer
from the ABA problem so they are better avoided unless you have some
deferred reclamation scheme like RCU. Otherwise you have to stick
with adding to a list, and emptying it completely. This is what this
patch does, by coupling a lock-free global list of available coroutines
with per-CPU lists that are actually used on coroutine creation.
Whenever the destruction pool is big enough, the next thread that runs
out of coroutines will steal the whole destruction pool. This is positive
in two ways:
1) the allocation does not have to do any atomic operation in the fast
path, it's entirely using thread-local storage. Once every POOL_BATCH_SIZE
allocations it will do a single atomic_xchg. Release does an atomic_cmpxchg
loop, that hopefully doesn't cause any starvation, and an atomic_inc.
A later patch will also remove atomic operations from the release path,
and try to avoid the atomic_xchg altogether---succeeding in doing so if
all devices either use ioeventfd or are not submitting requests actively.
2) in theory this should be completely adaptive. The number of coroutines
around should be a little more than POOL_BATCH_SIZE * number of allocating
threads; so this also empties qemu_coroutine_adjust_pool_size. (The previous
pool size was POOL_BATCH_SIZE * number of block backends, so it was a bit
more generous. But if you actually have many high-iodepth disks, it's better
to put them in different iothreads, which will also use separate thread
pools and aio=native file descriptors).
This speeds up perf/cost (in tests/test-coroutine) by a factor of ~1.33.
No matter if we end with some kind of coroutine bypass scheme or not,
it cannot hurt to optimize hot code.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 1417518350-6167-6-git-send-email-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-12-02 12:05:48 +01:00
|
|
|
QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
|
2014-12-02 12:05:50 +01:00
|
|
|
alloc_pool_size--;
|
2013-09-11 16:42:35 +02:00
|
|
|
}
|
2013-05-17 15:51:25 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!co) {
|
2013-02-19 11:59:09 +01:00
|
|
|
co = qemu_coroutine_new();
|
|
|
|
}
|
|
|
|
|
2011-01-17 17:08:14 +01:00
|
|
|
co->entry = entry;
|
coroutine: move entry argument to qemu_coroutine_create
In practice the entry argument is always known at creation time, and
it is confusing that sometimes qemu_coroutine_enter is used with a
non-NULL argument to re-enter a coroutine (this happens in
block/sheepdog.c and tests/test-coroutine.c). So pass the opaque value
at creation time, for consistency with e.g. aio_bh_new.
Mostly done with the following semantic patch:
@ entry1 @
expression entry, arg, co;
@@
- co = qemu_coroutine_create(entry);
+ co = qemu_coroutine_create(entry, arg);
...
- qemu_coroutine_enter(co, arg);
+ qemu_coroutine_enter(co);
@ entry2 @
expression entry, arg;
identifier co;
@@
- Coroutine *co = qemu_coroutine_create(entry);
+ Coroutine *co = qemu_coroutine_create(entry, arg);
...
- qemu_coroutine_enter(co, arg);
+ qemu_coroutine_enter(co);
@ entry3 @
expression entry, arg;
@@
- qemu_coroutine_enter(qemu_coroutine_create(entry), arg);
+ qemu_coroutine_enter(qemu_coroutine_create(entry, arg));
@ reentry @
expression co;
@@
- qemu_coroutine_enter(co, NULL);
+ qemu_coroutine_enter(co);
except for the aforementioned few places where the semantic patch
stumbled (as expected) and for test_co_queue, which would otherwise
produce an uninitialized variable warning.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-07-04 19:10:01 +02:00
|
|
|
co->entry_arg = opaque;
|
2016-07-04 19:09:59 +02:00
|
|
|
QSIMPLEQ_INIT(&co->co_queue_wakeup);
|
2011-01-17 17:08:14 +01:00
|
|
|
return co;
|
|
|
|
}
|
|
|
|
|
2013-02-19 11:59:09 +01:00
|
|
|
static void coroutine_delete(Coroutine *co)
|
|
|
|
{
|
coroutine: rewrite pool to avoid mutex
This patch removes the mutex by using fancy lock-free manipulation of
the pool. Lock-free stacks and queues are not hard, but they can suffer
from the ABA problem so they are better avoided unless you have some
deferred reclamation scheme like RCU. Otherwise you have to stick
with adding to a list, and emptying it completely. This is what this
patch does, by coupling a lock-free global list of available coroutines
with per-CPU lists that are actually used on coroutine creation.
Whenever the destruction pool is big enough, the next thread that runs
out of coroutines will steal the whole destruction pool. This is positive
in two ways:
1) the allocation does not have to do any atomic operation in the fast
path, it's entirely using thread-local storage. Once every POOL_BATCH_SIZE
allocations it will do a single atomic_xchg. Release does an atomic_cmpxchg
loop, that hopefully doesn't cause any starvation, and an atomic_inc.
A later patch will also remove atomic operations from the release path,
and try to avoid the atomic_xchg altogether---succeeding in doing so if
all devices either use ioeventfd or are not submitting requests actively.
2) in theory this should be completely adaptive. The number of coroutines
around should be a little more than POOL_BATCH_SIZE * number of allocating
threads; so this also empties qemu_coroutine_adjust_pool_size. (The previous
pool size was POOL_BATCH_SIZE * number of block backends, so it was a bit
more generous. But if you actually have many high-iodepth disks, it's better
to put them in different iothreads, which will also use separate thread
pools and aio=native file descriptors).
This speeds up perf/cost (in tests/test-coroutine) by a factor of ~1.33.
No matter if we end with some kind of coroutine bypass scheme or not,
it cannot hurt to optimize hot code.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 1417518350-6167-6-git-send-email-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-12-02 12:05:48 +01:00
|
|
|
co->caller = NULL;
|
|
|
|
|
2013-09-11 16:42:35 +02:00
|
|
|
if (CONFIG_COROUTINE_POOL) {
|
coroutine: rewrite pool to avoid mutex
This patch removes the mutex by using fancy lock-free manipulation of
the pool. Lock-free stacks and queues are not hard, but they can suffer
from the ABA problem so they are better avoided unless you have some
deferred reclamation scheme like RCU. Otherwise you have to stick
with adding to a list, and emptying it completely. This is what this
patch does, by coupling a lock-free global list of available coroutines
with per-CPU lists that are actually used on coroutine creation.
Whenever the destruction pool is big enough, the next thread that runs
out of coroutines will steal the whole destruction pool. This is positive
in two ways:
1) the allocation does not have to do any atomic operation in the fast
path, it's entirely using thread-local storage. Once every POOL_BATCH_SIZE
allocations it will do a single atomic_xchg. Release does an atomic_cmpxchg
loop, that hopefully doesn't cause any starvation, and an atomic_inc.
A later patch will also remove atomic operations from the release path,
and try to avoid the atomic_xchg altogether---succeeding in doing so if
all devices either use ioeventfd or are not submitting requests actively.
2) in theory this should be completely adaptive. The number of coroutines
around should be a little more than POOL_BATCH_SIZE * number of allocating
threads; so this also empties qemu_coroutine_adjust_pool_size. (The previous
pool size was POOL_BATCH_SIZE * number of block backends, so it was a bit
more generous. But if you actually have many high-iodepth disks, it's better
to put them in different iothreads, which will also use separate thread
pools and aio=native file descriptors).
This speeds up perf/cost (in tests/test-coroutine) by a factor of ~1.33.
No matter if we end with some kind of coroutine bypass scheme or not,
it cannot hurt to optimize hot code.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 1417518350-6167-6-git-send-email-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-12-02 12:05:48 +01:00
|
|
|
if (release_pool_size < POOL_BATCH_SIZE * 2) {
|
|
|
|
QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
|
|
|
|
atomic_inc(&release_pool_size);
|
2013-09-11 16:42:35 +02:00
|
|
|
return;
|
|
|
|
}
|
2014-12-02 12:05:50 +01:00
|
|
|
if (alloc_pool_size < POOL_BATCH_SIZE) {
|
|
|
|
QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
|
|
|
|
alloc_pool_size++;
|
|
|
|
return;
|
|
|
|
}
|
2013-02-19 11:59:09 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
qemu_coroutine_delete(co);
|
|
|
|
}
|
|
|
|
|
2017-04-10 14:06:12 +02:00
|
|
|
void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co)
|
2011-01-17 17:08:14 +01:00
|
|
|
{
|
|
|
|
Coroutine *self = qemu_coroutine_self();
|
2015-02-10 11:31:52 +01:00
|
|
|
CoroutineAction ret;
|
2011-01-17 17:08:14 +01:00
|
|
|
|
2017-04-10 14:06:12 +02:00
|
|
|
trace_qemu_aio_coroutine_enter(ctx, self, co, co->entry_arg);
|
2011-01-17 17:08:14 +01:00
|
|
|
|
|
|
|
if (co->caller) {
|
|
|
|
fprintf(stderr, "Co-routine re-entered recursively\n");
|
|
|
|
abort();
|
|
|
|
}
|
|
|
|
|
|
|
|
co->caller = self;
|
2017-04-10 14:06:12 +02:00
|
|
|
co->ctx = ctx;
|
2017-02-13 14:52:19 +01:00
|
|
|
|
|
|
|
/* Store co->ctx before anything that stores co. Matches
|
2017-02-13 19:12:40 +01:00
|
|
|
* barrier in aio_co_wake and qemu_co_mutex_wake.
|
2017-02-13 14:52:19 +01:00
|
|
|
*/
|
|
|
|
smp_wmb();
|
|
|
|
|
2015-02-10 11:31:52 +01:00
|
|
|
ret = qemu_coroutine_switch(self, co, COROUTINE_ENTER);
|
|
|
|
|
|
|
|
qemu_co_queue_run_restart(co);
|
|
|
|
|
|
|
|
switch (ret) {
|
|
|
|
case COROUTINE_YIELD:
|
|
|
|
return;
|
|
|
|
case COROUTINE_TERMINATE:
|
2016-08-11 17:51:59 +02:00
|
|
|
assert(!co->locks_held);
|
2015-02-10 11:31:52 +01:00
|
|
|
trace_qemu_coroutine_terminate(co);
|
|
|
|
coroutine_delete(co);
|
|
|
|
return;
|
|
|
|
default:
|
|
|
|
abort();
|
|
|
|
}
|
2011-01-17 17:08:14 +01:00
|
|
|
}
|
|
|
|
|
2017-04-10 14:06:12 +02:00
|
|
|
void qemu_coroutine_enter(Coroutine *co)
|
|
|
|
{
|
|
|
|
qemu_aio_coroutine_enter(qemu_get_current_aio_context(), co);
|
|
|
|
}
|
|
|
|
|
2016-11-07 16:34:35 +01:00
|
|
|
void qemu_coroutine_enter_if_inactive(Coroutine *co)
|
|
|
|
{
|
|
|
|
if (!qemu_coroutine_entered(co)) {
|
|
|
|
qemu_coroutine_enter(co);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-01-17 17:08:14 +01:00
|
|
|
void coroutine_fn qemu_coroutine_yield(void)
|
|
|
|
{
|
|
|
|
Coroutine *self = qemu_coroutine_self();
|
|
|
|
Coroutine *to = self->caller;
|
|
|
|
|
|
|
|
trace_qemu_coroutine_yield(self, to);
|
|
|
|
|
|
|
|
if (!to) {
|
|
|
|
fprintf(stderr, "Co-routine is yielding to no one\n");
|
|
|
|
abort();
|
|
|
|
}
|
|
|
|
|
|
|
|
self->caller = NULL;
|
2015-02-10 11:17:53 +01:00
|
|
|
qemu_coroutine_switch(self, to, COROUTINE_YIELD);
|
2011-01-17 17:08:14 +01:00
|
|
|
}
|
2016-09-27 17:18:34 +02:00
|
|
|
|
|
|
|
bool qemu_coroutine_entered(Coroutine *co)
|
|
|
|
{
|
|
|
|
return co->caller;
|
|
|
|
}
|