chardev: fix race with client connections in tcp_chr_wait_connected

When the 'reconnect' option is given for a client connection, the
qmp_chardev_open_socket_client method will run an asynchronous
connection attempt. The QIOChannel socket executes this is a single use
background thread, so the connection will succeed immediately (assuming
the server is listening). The chardev, however, won't get the result
from this background thread until the main loop starts running and
processes idle callbacks.

Thus when tcp_chr_wait_connected is run s->ioc will be NULL, but the
state will be TCP_CHARDEV_STATE_CONNECTING, and there may already
be an established connection that will be associated with the chardev
by the pending idle callback. tcp_chr_wait_connected doesn't check the
state, only s->ioc, so attempts to establish another connection
synchronously.

If the server allows multiple connections this is unhelpful but not a
fatal problem as the duplicate connection will get ignored by the
tcp_chr_new_client method when it sees the state is already connected.

If the server only supports a single connection, however, the
tcp_chr_wait_connected method will hang forever because the server will
not accept its synchronous connection attempt until the first connection
is closed.

To deal with this tcp_chr_wait_connected needs to synchronize with the
completion of the background connection task. To do this it needs to
create the QIOTask directly and use the qio_task_wait_thread method.
This will cancel the pending idle callback and directly dispatch the
task completion callback, allowing the connection to be associated
with the chardev. If the background connection failed, it can still
attempt a new synchronous connection.

Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Message-Id: <20190211182442.8542-15-berrange@redhat.com>
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
This commit is contained in:
Daniel P. Berrangé 2019-02-11 18:24:40 +00:00 committed by Marc-André Lureau
parent d1885e549d
commit 4b47373a0d

View File

@ -80,6 +80,8 @@ typedef struct {
GSource *reconnect_timer;
int64_t reconnect_time;
bool connect_err_reported;
QIOTask *connect_task;
} SocketChardev;
#define SOCKET_CHARDEV(obj) \
@ -118,6 +120,7 @@ static void qemu_chr_socket_restart_timer(Chardev *chr)
char *name;
assert(s->state == TCP_CHARDEV_STATE_DISCONNECTED);
assert(!s->reconnect_timer);
name = g_strdup_printf("chardev-socket-reconnect-%s", chr->label);
s->reconnect_timer = qemu_chr_timeout_add_ms(chr,
s->reconnect_time * 1000,
@ -965,7 +968,56 @@ static int tcp_chr_wait_connected(Chardev *chr, Error **errp)
}
}
while (!s->ioc) {
tcp_chr_reconn_timer_cancel(s);
/*
* We expect states to be as follows:
*
* - server
* - wait -> CONNECTED
* - nowait -> DISCONNECTED
* - client
* - reconnect == 0 -> CONNECTED
* - reconnect != 0 -> CONNECTING
*
*/
if (s->state == TCP_CHARDEV_STATE_CONNECTING) {
if (!s->connect_task) {
error_setg(errp,
"Unexpected 'connecting' state without connect task "
"while waiting for connection completion");
return -1;
}
/*
* tcp_chr_wait_connected should only ever be run from the
* main loop thread associated with chr->gcontext, otherwise
* qio_task_wait_thread has a dangerous race condition with
* free'ing of the s->connect_task object.
*
* Acquiring the main context doesn't 100% prove we're in
* the main loop thread, but it does at least guarantee
* that the main loop won't be executed by another thread
* avoiding the race condition with the task idle callback.
*/
g_main_context_acquire(chr->gcontext);
qio_task_wait_thread(s->connect_task);
g_main_context_release(chr->gcontext);
/*
* The completion callback (qemu_chr_socket_connected) for
* s->connect_task should have set this to NULL by the time
* qio_task_wait_thread has returned.
*/
assert(!s->connect_task);
/*
* NB we are *not* guaranteed to have "s->state == ..CONNECTED"
* at this point as this first connect may be failed, so
* allow the next loop to run regardless.
*/
}
while (s->state != TCP_CHARDEV_STATE_CONNECTED) {
if (s->is_listen) {
tcp_chr_accept_server_sync(chr);
} else {
@ -1014,6 +1066,8 @@ static void qemu_chr_socket_connected(QIOTask *task, void *opaque)
SocketChardev *s = SOCKET_CHARDEV(chr);
Error *err = NULL;
s->connect_task = NULL;
if (qio_task_propagate_error(task, &err)) {
tcp_chr_change_state(s, TCP_CHARDEV_STATE_DISCONNECTED);
check_report_connect_error(chr, err);
@ -1028,6 +1082,20 @@ cleanup:
object_unref(OBJECT(sioc));
}
static void tcp_chr_connect_client_task(QIOTask *task,
gpointer opaque)
{
QIOChannelSocket *ioc = QIO_CHANNEL_SOCKET(qio_task_get_source(task));
SocketAddress *addr = opaque;
Error *err = NULL;
qio_channel_socket_connect_sync(ioc, addr, &err);
qio_task_set_error(task, err);
}
static void tcp_chr_connect_client_async(Chardev *chr)
{
SocketChardev *s = SOCKET_CHARDEV(chr);
@ -1036,9 +1104,23 @@ static void tcp_chr_connect_client_async(Chardev *chr)
tcp_chr_change_state(s, TCP_CHARDEV_STATE_CONNECTING);
sioc = qio_channel_socket_new();
tcp_chr_set_client_ioc_name(chr, sioc);
qio_channel_socket_connect_async(sioc, s->addr,
qemu_chr_socket_connected,
chr, NULL, chr->gcontext);
/*
* Normally code would use the qio_channel_socket_connect_async
* method which uses a QIOTask + qio_task_set_error internally
* to avoid blocking. The tcp_chr_wait_connected method, however,
* needs a way to synchronize with completion of the background
* connect task which can't be done with the QIOChannelSocket
* async APIs. Thus we must use QIOTask directly to implement
* the non-blocking concept locally.
*/
s->connect_task = qio_task_new(OBJECT(sioc),
qemu_chr_socket_connected,
chr, NULL);
qio_task_run_in_thread(s->connect_task,
tcp_chr_connect_client_task,
s->addr,
NULL,
chr->gcontext);
}
static gboolean socket_reconnect_timeout(gpointer opaque)