2013-12-01 22:23:41 +01:00
|
|
|
/*
|
|
|
|
* QEMU Block driver for NBD
|
|
|
|
*
|
2016-10-14 20:33:04 +02:00
|
|
|
* Copyright (C) 2016 Red Hat, Inc.
|
2013-12-01 22:23:41 +01:00
|
|
|
* Copyright (C) 2008 Bull S.A.S.
|
|
|
|
* Author: Laurent Vivier <Laurent.Vivier@bull.net>
|
|
|
|
*
|
|
|
|
* Some parts:
|
|
|
|
* Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
|
|
|
|
*
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
|
|
* in the Software without restriction, including without limitation the rights
|
|
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
|
|
* furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice shall be included in
|
|
|
|
* all copies or substantial portions of the Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
* THE SOFTWARE.
|
|
|
|
*/
|
|
|
|
|
2016-01-18 19:01:42 +01:00
|
|
|
#include "qemu/osdep.h"
|
block/nbd-client: use traces instead of noisy error_report_err
Reduce extra noise of nbd-client, change 083 correspondingly.
In various commits (be41c100 in 2.10, f140e300 in 2.11, 78a33ab
in 2.12), we added spots where qemu as an NBD client would report
problems communicating with the server to stderr, because there
was no where else to send the error to. However, this is racy,
particularly since the most common source of these errors is when
either the client or the server abruptly hangs up, leaving one
coroutine to report the error only if it wins (or loses) the
race in attempting the read from the server before another
thread completes its cleanup of a protocol error that caused the
disconnect in the first place. The race is also apparent in the
fact that differences in the flush behavior of the server can
alter the frequency of encountering the race in the client (see
commit 6d39db96).
Rather than polluting stderr, it's better to just trace these
situations, for use by developers debugging a flaky connection,
particularly since the real error that either triggers the abrupt
disconnection in the first place, or that results from the EIO
when a request can't receive a reply, DOES make it back to the
user in the normal Error propagation channels.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20181102151152.288399-4-vsementsov@virtuozzo.com>
[eblake: drop depedence on error hint, enhance commit message]
Signed-off-by: Eric Blake <eblake@redhat.com>
2018-11-02 16:11:52 +01:00
|
|
|
|
|
|
|
#include "trace.h"
|
2017-05-26 13:09:13 +02:00
|
|
|
#include "qapi/error.h"
|
2013-12-01 22:23:41 +01:00
|
|
|
#include "nbd-client.h"
|
|
|
|
|
2017-09-18 23:46:49 +02:00
|
|
|
#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ (uint64_t)(intptr_t)(bs))
|
|
|
|
#define INDEX_TO_HANDLE(bs, index) ((index) ^ (uint64_t)(intptr_t)(bs))
|
2013-12-01 22:23:41 +01:00
|
|
|
|
2017-08-04 17:14:31 +02:00
|
|
|
static void nbd_recv_coroutines_wake_all(NBDClientSession *s)
|
2013-12-01 22:23:45 +01:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < MAX_NBD_REQUESTS; i++) {
|
2017-08-22 14:51:13 +02:00
|
|
|
NBDClientRequest *req = &s->requests[i];
|
|
|
|
|
|
|
|
if (req->coroutine && req->receiving) {
|
|
|
|
aio_co_wake(req->coroutine);
|
2013-12-01 22:23:45 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-02-06 22:06:16 +01:00
|
|
|
static void nbd_teardown_connection(BlockDriverState *bs)
|
2014-02-26 15:30:18 +01:00
|
|
|
{
|
2016-10-14 20:33:06 +02:00
|
|
|
NBDClientSession *client = nbd_get_client_session(bs);
|
2015-02-06 22:06:16 +01:00
|
|
|
|
2019-02-01 14:01:37 +01:00
|
|
|
assert(client->ioc);
|
2016-02-10 19:41:01 +01:00
|
|
|
|
2014-02-26 15:30:18 +01:00
|
|
|
/* finish any pending coroutines */
|
2016-02-10 19:41:01 +01:00
|
|
|
qio_channel_shutdown(client->ioc,
|
|
|
|
QIO_CHANNEL_SHUTDOWN_BOTH,
|
|
|
|
NULL);
|
2019-02-01 14:01:38 +01:00
|
|
|
BDRV_POLL_WHILE(bs, client->connection_co);
|
2014-02-26 15:30:18 +01:00
|
|
|
|
2015-02-06 22:06:16 +01:00
|
|
|
nbd_client_detach_aio_context(bs);
|
2016-02-10 19:41:01 +01:00
|
|
|
object_unref(OBJECT(client->sioc));
|
|
|
|
client->sioc = NULL;
|
|
|
|
object_unref(OBJECT(client->ioc));
|
|
|
|
client->ioc = NULL;
|
2014-02-26 15:30:18 +01:00
|
|
|
}
|
|
|
|
|
2019-02-01 14:01:38 +01:00
|
|
|
static coroutine_fn void nbd_connection_entry(void *opaque)
|
2013-12-01 22:23:41 +01:00
|
|
|
{
|
2017-02-13 14:52:24 +01:00
|
|
|
NBDClientSession *s = opaque;
|
2013-12-01 22:23:41 +01:00
|
|
|
uint64_t i;
|
2017-08-17 18:14:13 +02:00
|
|
|
int ret = 0;
|
2017-05-26 13:09:13 +02:00
|
|
|
Error *local_err = NULL;
|
2013-12-01 22:23:41 +01:00
|
|
|
|
nbd-client: Fix regression when server sends garbage
When we switched NBD to use coroutines for qemu 2.9 (in particular,
commit a12a712a), we introduced a regression: if a server sends us
garbage (such as a corrupted magic number), we quit the read loop
but do not stop sending further queued commands, resulting in the
client hanging when it never reads the response to those additional
commands. In qemu 2.8, we properly detected that the server is no
longer reliable, and cancelled all existing pending commands with
EIO, then tore down the socket so that all further command attempts
get EPIPE.
Restore the proper behavior of quitting (almost) all communication
with a broken server: Once we know we are out of sync or otherwise
can't trust the server, we must assume that any further incoming
data is unreliable and therefore end all pending commands with EIO,
and quit trying to send any further commands. As an exception, we
still (try to) send NBD_CMD_DISC to let the server know we are going
away (in part, because it is easier to do that than to further
refactor nbd_teardown_connection, and in part because it is the
only command where we do not have to wait for a reply).
Based on a patch by Vladimir Sementsov-Ogievskiy.
A malicious server can be created with the following hack,
followed by setting NBD_SERVER_DEBUG to a non-zero value in the
environment when running qemu-nbd:
| --- a/nbd/server.c
| +++ b/nbd/server.c
| @@ -919,6 +919,17 @@ static int nbd_send_reply(QIOChannel *ioc, NBDReply *reply, Error **errp)
| stl_be_p(buf + 4, reply->error);
| stq_be_p(buf + 8, reply->handle);
|
| + static int debug;
| + static int count;
| + if (!count++) {
| + const char *str = getenv("NBD_SERVER_DEBUG");
| + if (str) {
| + debug = atoi(str);
| + }
| + }
| + if (debug && !(count % debug)) {
| + buf[0] = 0;
| + }
| return nbd_write(ioc, buf, sizeof(buf), errp);
| }
Reported-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170814213426.24681-1-eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-08-14 23:34:26 +02:00
|
|
|
while (!s->quit) {
|
2019-02-15 16:53:51 +01:00
|
|
|
/*
|
|
|
|
* The NBD client can only really be considered idle when it has
|
|
|
|
* yielded from qio_channel_readv_all_eof(), waiting for data. This is
|
|
|
|
* the point where the additional scheduled coroutine entry happens
|
|
|
|
* after nbd_client_attach_aio_context().
|
|
|
|
*
|
|
|
|
* Therefore we keep an additional in_flight reference all the time and
|
|
|
|
* only drop it temporarily here.
|
|
|
|
*/
|
2017-02-13 14:52:24 +01:00
|
|
|
assert(s->reply.handle == 0);
|
2019-02-18 14:56:01 +01:00
|
|
|
ret = nbd_receive_reply(s->bs, s->ioc, &s->reply, &local_err);
|
2019-02-15 16:53:51 +01:00
|
|
|
|
2017-11-12 02:39:36 +01:00
|
|
|
if (local_err) {
|
block/nbd-client: use traces instead of noisy error_report_err
Reduce extra noise of nbd-client, change 083 correspondingly.
In various commits (be41c100 in 2.10, f140e300 in 2.11, 78a33ab
in 2.12), we added spots where qemu as an NBD client would report
problems communicating with the server to stderr, because there
was no where else to send the error to. However, this is racy,
particularly since the most common source of these errors is when
either the client or the server abruptly hangs up, leaving one
coroutine to report the error only if it wins (or loses) the
race in attempting the read from the server before another
thread completes its cleanup of a protocol error that caused the
disconnect in the first place. The race is also apparent in the
fact that differences in the flush behavior of the server can
alter the frequency of encountering the race in the client (see
commit 6d39db96).
Rather than polluting stderr, it's better to just trace these
situations, for use by developers debugging a flaky connection,
particularly since the real error that either triggers the abrupt
disconnection in the first place, or that results from the EIO
when a request can't receive a reply, DOES make it back to the
user in the normal Error propagation channels.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20181102151152.288399-4-vsementsov@virtuozzo.com>
[eblake: drop depedence on error hint, enhance commit message]
Signed-off-by: Eric Blake <eblake@redhat.com>
2018-11-02 16:11:52 +01:00
|
|
|
trace_nbd_read_reply_entry_fail(ret, error_get_pretty(local_err));
|
|
|
|
error_free(local_err);
|
2017-05-26 13:09:13 +02:00
|
|
|
}
|
2017-03-14 12:11:56 +01:00
|
|
|
if (ret <= 0) {
|
2017-02-13 14:52:24 +01:00
|
|
|
break;
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
|
|
|
|
2017-02-13 14:52:24 +01:00
|
|
|
/* There's no need for a mutex on the receive side, because the
|
|
|
|
* handler acts as a synchronization point and ensures that only
|
|
|
|
* one coroutine is called until the reply finishes.
|
|
|
|
*/
|
|
|
|
i = HANDLE_TO_INDEX(s, s->reply.handle);
|
2017-08-22 14:51:13 +02:00
|
|
|
if (i >= MAX_NBD_REQUESTS ||
|
|
|
|
!s->requests[i].coroutine ||
|
2017-10-27 12:40:35 +02:00
|
|
|
!s->requests[i].receiving ||
|
2017-10-27 12:40:37 +02:00
|
|
|
(nbd_reply_is_structured(&s->reply) && !s->info.structured_reply))
|
2017-10-27 12:40:35 +02:00
|
|
|
{
|
2017-02-13 14:52:24 +01:00
|
|
|
break;
|
|
|
|
}
|
2013-12-01 22:23:41 +01:00
|
|
|
|
2017-08-22 14:51:13 +02:00
|
|
|
/* We're woken up again by the request itself. Note that there
|
2019-02-01 14:01:38 +01:00
|
|
|
* is no race between yielding and reentering connection_co. This
|
2017-02-13 14:52:24 +01:00
|
|
|
* is because:
|
|
|
|
*
|
2017-08-22 14:51:13 +02:00
|
|
|
* - if the request runs on the same AioContext, it is only
|
2017-02-13 14:52:24 +01:00
|
|
|
* entered after we yield
|
|
|
|
*
|
2017-08-22 14:51:13 +02:00
|
|
|
* - if the request runs on a different AioContext, reentering
|
2019-02-01 14:01:38 +01:00
|
|
|
* connection_co happens through a bottom half, which can only
|
2017-02-13 14:52:24 +01:00
|
|
|
* run after we yield.
|
|
|
|
*/
|
2017-08-22 14:51:13 +02:00
|
|
|
aio_co_wake(s->requests[i].coroutine);
|
2017-02-13 14:52:24 +01:00
|
|
|
qemu_coroutine_yield();
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
2017-03-14 12:11:56 +01:00
|
|
|
|
2017-08-22 14:51:13 +02:00
|
|
|
s->quit = true;
|
2017-08-04 17:14:31 +02:00
|
|
|
nbd_recv_coroutines_wake_all(s);
|
2019-02-15 16:53:51 +01:00
|
|
|
bdrv_dec_in_flight(s->bs);
|
|
|
|
|
2019-02-01 14:01:38 +01:00
|
|
|
s->connection_co = NULL;
|
block: Fix hangs in synchronous APIs with iothreads
In the block layer, synchronous APIs are often implemented by creating a
coroutine that calls the asynchronous coroutine-based implementation and
then waiting for completion with BDRV_POLL_WHILE().
For this to work with iothreads (more specifically, when the synchronous
API is called in a thread that is not the home thread of the block
device, so that the coroutine will run in a different thread), we must
make sure to call aio_wait_kick() at the end of the operation. Many
places are missing this, so that BDRV_POLL_WHILE() keeps hanging even if
the condition has long become false.
Note that bdrv_dec_in_flight() involves an aio_wait_kick() call. This
corresponds to the BDRV_POLL_WHILE() in the drain functions, but it is
generally not enough for most other operations because they haven't set
the return value in the coroutine entry stub yet. To avoid race
conditions there, we need to kick after setting the return value.
The race window is small enough that the problem doesn't usually surface
in the common path. However, it does surface and causes easily
reproducible hangs if the operation can return early before even calling
bdrv_inc/dec_in_flight, which many of them do (trivial error or no-op
success paths).
The bug in bdrv_truncate(), bdrv_check() and bdrv_invalidate_cache() is
slightly different: These functions even neglected to schedule the
coroutine in the home thread of the node. This avoids the hang, but is
obviously wrong, too. Fix those to schedule the coroutine in the right
AioContext in addition to adding aio_wait_kick() calls.
Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2019-01-07 13:02:48 +01:00
|
|
|
aio_wait_kick();
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
|
|
|
|
2015-02-06 22:06:16 +01:00
|
|
|
static int nbd_co_send_request(BlockDriverState *bs,
|
2016-10-14 20:33:07 +02:00
|
|
|
NBDRequest *request,
|
2016-07-15 20:32:03 +02:00
|
|
|
QEMUIOVector *qiov)
|
2013-12-01 22:23:41 +01:00
|
|
|
{
|
2016-10-14 20:33:06 +02:00
|
|
|
NBDClientSession *s = nbd_get_client_session(bs);
|
2017-09-05 21:11:14 +02:00
|
|
|
int rc, i;
|
2013-12-01 22:23:41 +01:00
|
|
|
|
|
|
|
qemu_co_mutex_lock(&s->send_mutex);
|
2017-06-01 12:44:56 +02:00
|
|
|
while (s->in_flight == MAX_NBD_REQUESTS) {
|
|
|
|
qemu_co_queue_wait(&s->free_sema, &s->send_mutex);
|
|
|
|
}
|
|
|
|
s->in_flight++;
|
nbd: fix the co_queue multi-adding bug
When we tested the VM migartion between different hosts with NBD
devices, we found if we sent a cancel command after the drive_mirror
was just started, a coroutine re-enter error would occur. The stack
was as follow:
(gdb) bt
00) 0x00007fdfc744d885 in raise () from /lib64/libc.so.6
01) 0x00007fdfc744ee61 in abort () from /lib64/libc.so.6
02) 0x00007fdfca467cc5 in qemu_coroutine_enter (co=0x7fdfcaedb400, opaque=0x0)
at qemu-coroutine.c:118
03) 0x00007fdfca467f6c in qemu_co_queue_run_restart (co=0x7fdfcaedb400) at
qemu-coroutine-lock.c:59
04) 0x00007fdfca467be5 in coroutine_swap (from=0x7fdfcaf3c4e8,
to=0x7fdfcaedb400) at qemu-coroutine.c:96
05) 0x00007fdfca467cea in qemu_coroutine_enter (co=0x7fdfcaedb400, opaque=0x0)
at qemu-coroutine.c:123
06) 0x00007fdfca467f6c in qemu_co_queue_run_restart (co=0x7fdfcaedbdc0) at
qemu-coroutine-lock.c:59
07) 0x00007fdfca467be5 in coroutine_swap (from=0x7fdfcaf3c4e8,
to=0x7fdfcaedbdc0) at qemu-coroutine.c:96
08) 0x00007fdfca467cea in qemu_coroutine_enter (co=0x7fdfcaedbdc0, opaque=0x0)
at qemu-coroutine.c:123
09) 0x00007fdfca4a1fa4 in nbd_recv_coroutines_enter_all (s=0x7fdfcaef7dd0) at
block/nbd-client.c:41
10) 0x00007fdfca4a1ff9 in nbd_teardown_connection (client=0x7fdfcaef7dd0) at
block/nbd-client.c:50
11) 0x00007fdfca4a20f0 in nbd_reply_ready (opaque=0x7fdfcaef7dd0) at
block/nbd-client.c:92
12) 0x00007fdfca45ed80 in aio_dispatch (ctx=0x7fdfcae15e90) at aio-posix.c:144
13) 0x00007fdfca45ef1b in aio_poll (ctx=0x7fdfcae15e90, blocking=false) at
aio-posix.c:222
14) 0x00007fdfca448c34 in aio_ctx_dispatch (source=0x7fdfcae15e90, callback=0x0,
user_data=0x0) at async.c:212
15) 0x00007fdfc8f2f69a in g_main_context_dispatch () from
/usr/lib64/libglib-2.0.so.0
16) 0x00007fdfca45c391 in glib_pollfds_poll () at main-loop.c:190
17) 0x00007fdfca45c489 in os_host_main_loop_wait (timeout=1483677098) at
main-loop.c:235
18) 0x00007fdfca45c57b in main_loop_wait (nonblocking=0) at main-loop.c:484
19) 0x00007fdfca25f403 in main_loop () at vl.c:2249
20) 0x00007fdfca266fc2 in main (argc=42, argv=0x7ffff517d638,
envp=0x7ffff517d790) at vl.c:4814
We find the nbd_recv_coroutines_enter_all function (triggered by a cancel
command or a network connection breaking down) will enter a coroutine which
is waiting for the sending lock. If the lock is still held by another coroutine,
the entering coroutine will be added into the co_queue again. Latter, when the
lock is released, a coroutine re-enter error will occur.
This bug can be fixed simply by delaying the setting of recv_coroutine as
suggested by paolo. After applying this patch, we have tested the cancel
operation in mirror phase looply for more than 5 hous and everything is fine.
Without this patch, a coroutine re-enter error will occur in 5 minutes.
Signed-off-by: Bn Wu <wu.wubin@huawei.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 1423552846-3896-1-git-send-email-wu.wubin@huawei.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2015-02-10 08:20:46 +01:00
|
|
|
|
|
|
|
for (i = 0; i < MAX_NBD_REQUESTS; i++) {
|
2017-08-22 14:51:13 +02:00
|
|
|
if (s->requests[i].coroutine == NULL) {
|
nbd: fix the co_queue multi-adding bug
When we tested the VM migartion between different hosts with NBD
devices, we found if we sent a cancel command after the drive_mirror
was just started, a coroutine re-enter error would occur. The stack
was as follow:
(gdb) bt
00) 0x00007fdfc744d885 in raise () from /lib64/libc.so.6
01) 0x00007fdfc744ee61 in abort () from /lib64/libc.so.6
02) 0x00007fdfca467cc5 in qemu_coroutine_enter (co=0x7fdfcaedb400, opaque=0x0)
at qemu-coroutine.c:118
03) 0x00007fdfca467f6c in qemu_co_queue_run_restart (co=0x7fdfcaedb400) at
qemu-coroutine-lock.c:59
04) 0x00007fdfca467be5 in coroutine_swap (from=0x7fdfcaf3c4e8,
to=0x7fdfcaedb400) at qemu-coroutine.c:96
05) 0x00007fdfca467cea in qemu_coroutine_enter (co=0x7fdfcaedb400, opaque=0x0)
at qemu-coroutine.c:123
06) 0x00007fdfca467f6c in qemu_co_queue_run_restart (co=0x7fdfcaedbdc0) at
qemu-coroutine-lock.c:59
07) 0x00007fdfca467be5 in coroutine_swap (from=0x7fdfcaf3c4e8,
to=0x7fdfcaedbdc0) at qemu-coroutine.c:96
08) 0x00007fdfca467cea in qemu_coroutine_enter (co=0x7fdfcaedbdc0, opaque=0x0)
at qemu-coroutine.c:123
09) 0x00007fdfca4a1fa4 in nbd_recv_coroutines_enter_all (s=0x7fdfcaef7dd0) at
block/nbd-client.c:41
10) 0x00007fdfca4a1ff9 in nbd_teardown_connection (client=0x7fdfcaef7dd0) at
block/nbd-client.c:50
11) 0x00007fdfca4a20f0 in nbd_reply_ready (opaque=0x7fdfcaef7dd0) at
block/nbd-client.c:92
12) 0x00007fdfca45ed80 in aio_dispatch (ctx=0x7fdfcae15e90) at aio-posix.c:144
13) 0x00007fdfca45ef1b in aio_poll (ctx=0x7fdfcae15e90, blocking=false) at
aio-posix.c:222
14) 0x00007fdfca448c34 in aio_ctx_dispatch (source=0x7fdfcae15e90, callback=0x0,
user_data=0x0) at async.c:212
15) 0x00007fdfc8f2f69a in g_main_context_dispatch () from
/usr/lib64/libglib-2.0.so.0
16) 0x00007fdfca45c391 in glib_pollfds_poll () at main-loop.c:190
17) 0x00007fdfca45c489 in os_host_main_loop_wait (timeout=1483677098) at
main-loop.c:235
18) 0x00007fdfca45c57b in main_loop_wait (nonblocking=0) at main-loop.c:484
19) 0x00007fdfca25f403 in main_loop () at vl.c:2249
20) 0x00007fdfca266fc2 in main (argc=42, argv=0x7ffff517d638,
envp=0x7ffff517d790) at vl.c:4814
We find the nbd_recv_coroutines_enter_all function (triggered by a cancel
command or a network connection breaking down) will enter a coroutine which
is waiting for the sending lock. If the lock is still held by another coroutine,
the entering coroutine will be added into the co_queue again. Latter, when the
lock is released, a coroutine re-enter error will occur.
This bug can be fixed simply by delaying the setting of recv_coroutine as
suggested by paolo. After applying this patch, we have tested the cancel
operation in mirror phase looply for more than 5 hous and everything is fine.
Without this patch, a coroutine re-enter error will occur in 5 minutes.
Signed-off-by: Bn Wu <wu.wubin@huawei.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 1423552846-3896-1-git-send-email-wu.wubin@huawei.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2015-02-10 08:20:46 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-02-10 19:41:04 +01:00
|
|
|
g_assert(qemu_in_coroutine());
|
nbd: fix the co_queue multi-adding bug
When we tested the VM migartion between different hosts with NBD
devices, we found if we sent a cancel command after the drive_mirror
was just started, a coroutine re-enter error would occur. The stack
was as follow:
(gdb) bt
00) 0x00007fdfc744d885 in raise () from /lib64/libc.so.6
01) 0x00007fdfc744ee61 in abort () from /lib64/libc.so.6
02) 0x00007fdfca467cc5 in qemu_coroutine_enter (co=0x7fdfcaedb400, opaque=0x0)
at qemu-coroutine.c:118
03) 0x00007fdfca467f6c in qemu_co_queue_run_restart (co=0x7fdfcaedb400) at
qemu-coroutine-lock.c:59
04) 0x00007fdfca467be5 in coroutine_swap (from=0x7fdfcaf3c4e8,
to=0x7fdfcaedb400) at qemu-coroutine.c:96
05) 0x00007fdfca467cea in qemu_coroutine_enter (co=0x7fdfcaedb400, opaque=0x0)
at qemu-coroutine.c:123
06) 0x00007fdfca467f6c in qemu_co_queue_run_restart (co=0x7fdfcaedbdc0) at
qemu-coroutine-lock.c:59
07) 0x00007fdfca467be5 in coroutine_swap (from=0x7fdfcaf3c4e8,
to=0x7fdfcaedbdc0) at qemu-coroutine.c:96
08) 0x00007fdfca467cea in qemu_coroutine_enter (co=0x7fdfcaedbdc0, opaque=0x0)
at qemu-coroutine.c:123
09) 0x00007fdfca4a1fa4 in nbd_recv_coroutines_enter_all (s=0x7fdfcaef7dd0) at
block/nbd-client.c:41
10) 0x00007fdfca4a1ff9 in nbd_teardown_connection (client=0x7fdfcaef7dd0) at
block/nbd-client.c:50
11) 0x00007fdfca4a20f0 in nbd_reply_ready (opaque=0x7fdfcaef7dd0) at
block/nbd-client.c:92
12) 0x00007fdfca45ed80 in aio_dispatch (ctx=0x7fdfcae15e90) at aio-posix.c:144
13) 0x00007fdfca45ef1b in aio_poll (ctx=0x7fdfcae15e90, blocking=false) at
aio-posix.c:222
14) 0x00007fdfca448c34 in aio_ctx_dispatch (source=0x7fdfcae15e90, callback=0x0,
user_data=0x0) at async.c:212
15) 0x00007fdfc8f2f69a in g_main_context_dispatch () from
/usr/lib64/libglib-2.0.so.0
16) 0x00007fdfca45c391 in glib_pollfds_poll () at main-loop.c:190
17) 0x00007fdfca45c489 in os_host_main_loop_wait (timeout=1483677098) at
main-loop.c:235
18) 0x00007fdfca45c57b in main_loop_wait (nonblocking=0) at main-loop.c:484
19) 0x00007fdfca25f403 in main_loop () at vl.c:2249
20) 0x00007fdfca266fc2 in main (argc=42, argv=0x7ffff517d638,
envp=0x7ffff517d790) at vl.c:4814
We find the nbd_recv_coroutines_enter_all function (triggered by a cancel
command or a network connection breaking down) will enter a coroutine which
is waiting for the sending lock. If the lock is still held by another coroutine,
the entering coroutine will be added into the co_queue again. Latter, when the
lock is released, a coroutine re-enter error will occur.
This bug can be fixed simply by delaying the setting of recv_coroutine as
suggested by paolo. After applying this patch, we have tested the cancel
operation in mirror phase looply for more than 5 hous and everything is fine.
Without this patch, a coroutine re-enter error will occur in 5 minutes.
Signed-off-by: Bn Wu <wu.wubin@huawei.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 1423552846-3896-1-git-send-email-wu.wubin@huawei.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2015-02-10 08:20:46 +01:00
|
|
|
assert(i < MAX_NBD_REQUESTS);
|
2017-08-22 14:51:13 +02:00
|
|
|
|
|
|
|
s->requests[i].coroutine = qemu_coroutine_self();
|
2017-10-27 12:40:37 +02:00
|
|
|
s->requests[i].offset = request->from;
|
2017-08-22 14:51:13 +02:00
|
|
|
s->requests[i].receiving = false;
|
|
|
|
|
nbd: fix the co_queue multi-adding bug
When we tested the VM migartion between different hosts with NBD
devices, we found if we sent a cancel command after the drive_mirror
was just started, a coroutine re-enter error would occur. The stack
was as follow:
(gdb) bt
00) 0x00007fdfc744d885 in raise () from /lib64/libc.so.6
01) 0x00007fdfc744ee61 in abort () from /lib64/libc.so.6
02) 0x00007fdfca467cc5 in qemu_coroutine_enter (co=0x7fdfcaedb400, opaque=0x0)
at qemu-coroutine.c:118
03) 0x00007fdfca467f6c in qemu_co_queue_run_restart (co=0x7fdfcaedb400) at
qemu-coroutine-lock.c:59
04) 0x00007fdfca467be5 in coroutine_swap (from=0x7fdfcaf3c4e8,
to=0x7fdfcaedb400) at qemu-coroutine.c:96
05) 0x00007fdfca467cea in qemu_coroutine_enter (co=0x7fdfcaedb400, opaque=0x0)
at qemu-coroutine.c:123
06) 0x00007fdfca467f6c in qemu_co_queue_run_restart (co=0x7fdfcaedbdc0) at
qemu-coroutine-lock.c:59
07) 0x00007fdfca467be5 in coroutine_swap (from=0x7fdfcaf3c4e8,
to=0x7fdfcaedbdc0) at qemu-coroutine.c:96
08) 0x00007fdfca467cea in qemu_coroutine_enter (co=0x7fdfcaedbdc0, opaque=0x0)
at qemu-coroutine.c:123
09) 0x00007fdfca4a1fa4 in nbd_recv_coroutines_enter_all (s=0x7fdfcaef7dd0) at
block/nbd-client.c:41
10) 0x00007fdfca4a1ff9 in nbd_teardown_connection (client=0x7fdfcaef7dd0) at
block/nbd-client.c:50
11) 0x00007fdfca4a20f0 in nbd_reply_ready (opaque=0x7fdfcaef7dd0) at
block/nbd-client.c:92
12) 0x00007fdfca45ed80 in aio_dispatch (ctx=0x7fdfcae15e90) at aio-posix.c:144
13) 0x00007fdfca45ef1b in aio_poll (ctx=0x7fdfcae15e90, blocking=false) at
aio-posix.c:222
14) 0x00007fdfca448c34 in aio_ctx_dispatch (source=0x7fdfcae15e90, callback=0x0,
user_data=0x0) at async.c:212
15) 0x00007fdfc8f2f69a in g_main_context_dispatch () from
/usr/lib64/libglib-2.0.so.0
16) 0x00007fdfca45c391 in glib_pollfds_poll () at main-loop.c:190
17) 0x00007fdfca45c489 in os_host_main_loop_wait (timeout=1483677098) at
main-loop.c:235
18) 0x00007fdfca45c57b in main_loop_wait (nonblocking=0) at main-loop.c:484
19) 0x00007fdfca25f403 in main_loop () at vl.c:2249
20) 0x00007fdfca266fc2 in main (argc=42, argv=0x7ffff517d638,
envp=0x7ffff517d790) at vl.c:4814
We find the nbd_recv_coroutines_enter_all function (triggered by a cancel
command or a network connection breaking down) will enter a coroutine which
is waiting for the sending lock. If the lock is still held by another coroutine,
the entering coroutine will be added into the co_queue again. Latter, when the
lock is released, a coroutine re-enter error will occur.
This bug can be fixed simply by delaying the setting of recv_coroutine as
suggested by paolo. After applying this patch, we have tested the cancel
operation in mirror phase looply for more than 5 hous and everything is fine.
Without this patch, a coroutine re-enter error will occur in 5 minutes.
Signed-off-by: Bn Wu <wu.wubin@huawei.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 1423552846-3896-1-git-send-email-wu.wubin@huawei.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2015-02-10 08:20:46 +01:00
|
|
|
request->handle = INDEX_TO_HANDLE(s, i);
|
2016-02-10 19:41:01 +01:00
|
|
|
|
nbd-client: Fix regression when server sends garbage
When we switched NBD to use coroutines for qemu 2.9 (in particular,
commit a12a712a), we introduced a regression: if a server sends us
garbage (such as a corrupted magic number), we quit the read loop
but do not stop sending further queued commands, resulting in the
client hanging when it never reads the response to those additional
commands. In qemu 2.8, we properly detected that the server is no
longer reliable, and cancelled all existing pending commands with
EIO, then tore down the socket so that all further command attempts
get EPIPE.
Restore the proper behavior of quitting (almost) all communication
with a broken server: Once we know we are out of sync or otherwise
can't trust the server, we must assume that any further incoming
data is unreliable and therefore end all pending commands with EIO,
and quit trying to send any further commands. As an exception, we
still (try to) send NBD_CMD_DISC to let the server know we are going
away (in part, because it is easier to do that than to further
refactor nbd_teardown_connection, and in part because it is the
only command where we do not have to wait for a reply).
Based on a patch by Vladimir Sementsov-Ogievskiy.
A malicious server can be created with the following hack,
followed by setting NBD_SERVER_DEBUG to a non-zero value in the
environment when running qemu-nbd:
| --- a/nbd/server.c
| +++ b/nbd/server.c
| @@ -919,6 +919,17 @@ static int nbd_send_reply(QIOChannel *ioc, NBDReply *reply, Error **errp)
| stl_be_p(buf + 4, reply->error);
| stq_be_p(buf + 8, reply->handle);
|
| + static int debug;
| + static int count;
| + if (!count++) {
| + const char *str = getenv("NBD_SERVER_DEBUG");
| + if (str) {
| + debug = atoi(str);
| + }
| + }
| + if (debug && !(count % debug)) {
| + buf[0] = 0;
| + }
| return nbd_write(ioc, buf, sizeof(buf), errp);
| }
Reported-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170814213426.24681-1-eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-08-14 23:34:26 +02:00
|
|
|
if (s->quit) {
|
nbd-client: avoid read_reply_co entry if send failed
The following segfault is encountered if the NBD server closes the UNIX
domain socket immediately after negotiation:
Program terminated with signal SIGSEGV, Segmentation fault.
#0 aio_co_schedule (ctx=0x0, co=0xd3c0ff2ef0) at util/async.c:441
441 QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines,
(gdb) bt
#0 0x000000d3c01a50f8 in aio_co_schedule (ctx=0x0, co=0xd3c0ff2ef0) at util/async.c:441
#1 0x000000d3c012fa90 in nbd_coroutine_end (bs=bs@entry=0xd3c0fec650, request=<optimized out>) at block/nbd-client.c:207
#2 0x000000d3c012fb58 in nbd_client_co_preadv (bs=0xd3c0fec650, offset=0, bytes=<optimized out>, qiov=0x7ffc10a91b20, flags=0) at block/nbd-client.c:237
#3 0x000000d3c0128e63 in bdrv_driver_preadv (bs=bs@entry=0xd3c0fec650, offset=offset@entry=0, bytes=bytes@entry=512, qiov=qiov@entry=0x7ffc10a91b20, flags=0) at block/io.c:836
#4 0x000000d3c012c3e0 in bdrv_aligned_preadv (child=child@entry=0xd3c0ff51d0, req=req@entry=0x7f31885d6e90, offset=offset@entry=0, bytes=bytes@entry=512, align=align@entry=1, qiov=qiov@entry=0x7ffc10a91b20, f
+lags=0) at block/io.c:1086
#5 0x000000d3c012c6b8 in bdrv_co_preadv (child=0xd3c0ff51d0, offset=offset@entry=0, bytes=bytes@entry=512, qiov=qiov@entry=0x7ffc10a91b20, flags=flags@entry=0) at block/io.c:1182
#6 0x000000d3c011cc17 in blk_co_preadv (blk=0xd3c0ff4f80, offset=0, bytes=512, qiov=0x7ffc10a91b20, flags=0) at block/block-backend.c:1032
#7 0x000000d3c011ccec in blk_read_entry (opaque=0x7ffc10a91b40) at block/block-backend.c:1079
#8 0x000000d3c01bbb96 in coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) at util/coroutine-ucontext.c:79
#9 0x00007f3196cb8600 in __start_context () at /lib64/libc.so.6
The problem is that nbd_client_init() uses
nbd_client_attach_aio_context() -> aio_co_schedule(new_context,
client->read_reply_co). Execution of read_reply_co is deferred to a BH
which doesn't run until later.
In the mean time blk_co_preadv() can be called and nbd_coroutine_end()
calls aio_wake() on read_reply_co. At this point in time
read_reply_co's ctx isn't set because it has never been entered yet.
This patch simplifies the nbd_co_send_request() ->
nbd_co_receive_reply() -> nbd_coroutine_end() lifecycle to just
nbd_co_send_request() -> nbd_co_receive_reply(). The request is "ended"
if an error occurs at any point. Callers no longer have to invoke
nbd_coroutine_end().
This cleanup also eliminates the segfault because we don't call
aio_co_schedule() to wake up s->read_reply_co if sending the request
failed. It is only necessary to wake up s->read_reply_co if a reply was
received.
Note this only happens with UNIX domain sockets on Linux. It doesn't
seem possible to reproduce this with TCP sockets.
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20170829122745.14309-2-stefanha@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
2017-08-29 14:27:43 +02:00
|
|
|
rc = -EIO;
|
|
|
|
goto err;
|
nbd-client: Fix regression when server sends garbage
When we switched NBD to use coroutines for qemu 2.9 (in particular,
commit a12a712a), we introduced a regression: if a server sends us
garbage (such as a corrupted magic number), we quit the read loop
but do not stop sending further queued commands, resulting in the
client hanging when it never reads the response to those additional
commands. In qemu 2.8, we properly detected that the server is no
longer reliable, and cancelled all existing pending commands with
EIO, then tore down the socket so that all further command attempts
get EPIPE.
Restore the proper behavior of quitting (almost) all communication
with a broken server: Once we know we are out of sync or otherwise
can't trust the server, we must assume that any further incoming
data is unreliable and therefore end all pending commands with EIO,
and quit trying to send any further commands. As an exception, we
still (try to) send NBD_CMD_DISC to let the server know we are going
away (in part, because it is easier to do that than to further
refactor nbd_teardown_connection, and in part because it is the
only command where we do not have to wait for a reply).
Based on a patch by Vladimir Sementsov-Ogievskiy.
A malicious server can be created with the following hack,
followed by setting NBD_SERVER_DEBUG to a non-zero value in the
environment when running qemu-nbd:
| --- a/nbd/server.c
| +++ b/nbd/server.c
| @@ -919,6 +919,17 @@ static int nbd_send_reply(QIOChannel *ioc, NBDReply *reply, Error **errp)
| stl_be_p(buf + 4, reply->error);
| stq_be_p(buf + 8, reply->handle);
|
| + static int debug;
| + static int count;
| + if (!count++) {
| + const char *str = getenv("NBD_SERVER_DEBUG");
| + if (str) {
| + debug = atoi(str);
| + }
| + }
| + if (debug && !(count % debug)) {
| + buf[0] = 0;
| + }
| return nbd_write(ioc, buf, sizeof(buf), errp);
| }
Reported-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170814213426.24681-1-eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-08-14 23:34:26 +02:00
|
|
|
}
|
2019-02-01 14:01:37 +01:00
|
|
|
assert(s->ioc);
|
2016-02-10 19:41:01 +01:00
|
|
|
|
2013-12-01 22:23:41 +01:00
|
|
|
if (qiov) {
|
2016-02-10 19:41:01 +01:00
|
|
|
qio_channel_set_cork(s->ioc, true);
|
2016-02-10 19:41:04 +01:00
|
|
|
rc = nbd_send_request(s->ioc, request);
|
nbd-client: Fix regression when server sends garbage
When we switched NBD to use coroutines for qemu 2.9 (in particular,
commit a12a712a), we introduced a regression: if a server sends us
garbage (such as a corrupted magic number), we quit the read loop
but do not stop sending further queued commands, resulting in the
client hanging when it never reads the response to those additional
commands. In qemu 2.8, we properly detected that the server is no
longer reliable, and cancelled all existing pending commands with
EIO, then tore down the socket so that all further command attempts
get EPIPE.
Restore the proper behavior of quitting (almost) all communication
with a broken server: Once we know we are out of sync or otherwise
can't trust the server, we must assume that any further incoming
data is unreliable and therefore end all pending commands with EIO,
and quit trying to send any further commands. As an exception, we
still (try to) send NBD_CMD_DISC to let the server know we are going
away (in part, because it is easier to do that than to further
refactor nbd_teardown_connection, and in part because it is the
only command where we do not have to wait for a reply).
Based on a patch by Vladimir Sementsov-Ogievskiy.
A malicious server can be created with the following hack,
followed by setting NBD_SERVER_DEBUG to a non-zero value in the
environment when running qemu-nbd:
| --- a/nbd/server.c
| +++ b/nbd/server.c
| @@ -919,6 +919,17 @@ static int nbd_send_reply(QIOChannel *ioc, NBDReply *reply, Error **errp)
| stl_be_p(buf + 4, reply->error);
| stq_be_p(buf + 8, reply->handle);
|
| + static int debug;
| + static int count;
| + if (!count++) {
| + const char *str = getenv("NBD_SERVER_DEBUG");
| + if (str) {
| + debug = atoi(str);
| + }
| + }
| + if (debug && !(count % debug)) {
| + buf[0] = 0;
| + }
| return nbd_write(ioc, buf, sizeof(buf), errp);
| }
Reported-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170814213426.24681-1-eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-08-14 23:34:26 +02:00
|
|
|
if (rc >= 0 && !s->quit) {
|
2017-09-05 21:11:14 +02:00
|
|
|
if (qio_channel_writev_all(s->ioc, qiov->iov, qiov->niov,
|
|
|
|
NULL) < 0) {
|
2013-12-01 22:23:41 +01:00
|
|
|
rc = -EIO;
|
|
|
|
}
|
2017-09-20 14:45:07 +02:00
|
|
|
} else if (rc >= 0) {
|
|
|
|
rc = -EIO;
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
2016-02-10 19:41:01 +01:00
|
|
|
qio_channel_set_cork(s->ioc, false);
|
2013-12-01 22:23:41 +01:00
|
|
|
} else {
|
2016-02-10 19:41:04 +01:00
|
|
|
rc = nbd_send_request(s->ioc, request);
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
nbd-client: avoid read_reply_co entry if send failed
The following segfault is encountered if the NBD server closes the UNIX
domain socket immediately after negotiation:
Program terminated with signal SIGSEGV, Segmentation fault.
#0 aio_co_schedule (ctx=0x0, co=0xd3c0ff2ef0) at util/async.c:441
441 QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines,
(gdb) bt
#0 0x000000d3c01a50f8 in aio_co_schedule (ctx=0x0, co=0xd3c0ff2ef0) at util/async.c:441
#1 0x000000d3c012fa90 in nbd_coroutine_end (bs=bs@entry=0xd3c0fec650, request=<optimized out>) at block/nbd-client.c:207
#2 0x000000d3c012fb58 in nbd_client_co_preadv (bs=0xd3c0fec650, offset=0, bytes=<optimized out>, qiov=0x7ffc10a91b20, flags=0) at block/nbd-client.c:237
#3 0x000000d3c0128e63 in bdrv_driver_preadv (bs=bs@entry=0xd3c0fec650, offset=offset@entry=0, bytes=bytes@entry=512, qiov=qiov@entry=0x7ffc10a91b20, flags=0) at block/io.c:836
#4 0x000000d3c012c3e0 in bdrv_aligned_preadv (child=child@entry=0xd3c0ff51d0, req=req@entry=0x7f31885d6e90, offset=offset@entry=0, bytes=bytes@entry=512, align=align@entry=1, qiov=qiov@entry=0x7ffc10a91b20, f
+lags=0) at block/io.c:1086
#5 0x000000d3c012c6b8 in bdrv_co_preadv (child=0xd3c0ff51d0, offset=offset@entry=0, bytes=bytes@entry=512, qiov=qiov@entry=0x7ffc10a91b20, flags=flags@entry=0) at block/io.c:1182
#6 0x000000d3c011cc17 in blk_co_preadv (blk=0xd3c0ff4f80, offset=0, bytes=512, qiov=0x7ffc10a91b20, flags=0) at block/block-backend.c:1032
#7 0x000000d3c011ccec in blk_read_entry (opaque=0x7ffc10a91b40) at block/block-backend.c:1079
#8 0x000000d3c01bbb96 in coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) at util/coroutine-ucontext.c:79
#9 0x00007f3196cb8600 in __start_context () at /lib64/libc.so.6
The problem is that nbd_client_init() uses
nbd_client_attach_aio_context() -> aio_co_schedule(new_context,
client->read_reply_co). Execution of read_reply_co is deferred to a BH
which doesn't run until later.
In the mean time blk_co_preadv() can be called and nbd_coroutine_end()
calls aio_wake() on read_reply_co. At this point in time
read_reply_co's ctx isn't set because it has never been entered yet.
This patch simplifies the nbd_co_send_request() ->
nbd_co_receive_reply() -> nbd_coroutine_end() lifecycle to just
nbd_co_send_request() -> nbd_co_receive_reply(). The request is "ended"
if an error occurs at any point. Callers no longer have to invoke
nbd_coroutine_end().
This cleanup also eliminates the segfault because we don't call
aio_co_schedule() to wake up s->read_reply_co if sending the request
failed. It is only necessary to wake up s->read_reply_co if a reply was
received.
Note this only happens with UNIX domain sockets on Linux. It doesn't
seem possible to reproduce this with TCP sockets.
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20170829122745.14309-2-stefanha@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
2017-08-29 14:27:43 +02:00
|
|
|
|
|
|
|
err:
|
nbd-client: Fix regression when server sends garbage
When we switched NBD to use coroutines for qemu 2.9 (in particular,
commit a12a712a), we introduced a regression: if a server sends us
garbage (such as a corrupted magic number), we quit the read loop
but do not stop sending further queued commands, resulting in the
client hanging when it never reads the response to those additional
commands. In qemu 2.8, we properly detected that the server is no
longer reliable, and cancelled all existing pending commands with
EIO, then tore down the socket so that all further command attempts
get EPIPE.
Restore the proper behavior of quitting (almost) all communication
with a broken server: Once we know we are out of sync or otherwise
can't trust the server, we must assume that any further incoming
data is unreliable and therefore end all pending commands with EIO,
and quit trying to send any further commands. As an exception, we
still (try to) send NBD_CMD_DISC to let the server know we are going
away (in part, because it is easier to do that than to further
refactor nbd_teardown_connection, and in part because it is the
only command where we do not have to wait for a reply).
Based on a patch by Vladimir Sementsov-Ogievskiy.
A malicious server can be created with the following hack,
followed by setting NBD_SERVER_DEBUG to a non-zero value in the
environment when running qemu-nbd:
| --- a/nbd/server.c
| +++ b/nbd/server.c
| @@ -919,6 +919,17 @@ static int nbd_send_reply(QIOChannel *ioc, NBDReply *reply, Error **errp)
| stl_be_p(buf + 4, reply->error);
| stq_be_p(buf + 8, reply->handle);
|
| + static int debug;
| + static int count;
| + if (!count++) {
| + const char *str = getenv("NBD_SERVER_DEBUG");
| + if (str) {
| + debug = atoi(str);
| + }
| + }
| + if (debug && !(count % debug)) {
| + buf[0] = 0;
| + }
| return nbd_write(ioc, buf, sizeof(buf), errp);
| }
Reported-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170814213426.24681-1-eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-08-14 23:34:26 +02:00
|
|
|
if (rc < 0) {
|
|
|
|
s->quit = true;
|
nbd-client: avoid read_reply_co entry if send failed
The following segfault is encountered if the NBD server closes the UNIX
domain socket immediately after negotiation:
Program terminated with signal SIGSEGV, Segmentation fault.
#0 aio_co_schedule (ctx=0x0, co=0xd3c0ff2ef0) at util/async.c:441
441 QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines,
(gdb) bt
#0 0x000000d3c01a50f8 in aio_co_schedule (ctx=0x0, co=0xd3c0ff2ef0) at util/async.c:441
#1 0x000000d3c012fa90 in nbd_coroutine_end (bs=bs@entry=0xd3c0fec650, request=<optimized out>) at block/nbd-client.c:207
#2 0x000000d3c012fb58 in nbd_client_co_preadv (bs=0xd3c0fec650, offset=0, bytes=<optimized out>, qiov=0x7ffc10a91b20, flags=0) at block/nbd-client.c:237
#3 0x000000d3c0128e63 in bdrv_driver_preadv (bs=bs@entry=0xd3c0fec650, offset=offset@entry=0, bytes=bytes@entry=512, qiov=qiov@entry=0x7ffc10a91b20, flags=0) at block/io.c:836
#4 0x000000d3c012c3e0 in bdrv_aligned_preadv (child=child@entry=0xd3c0ff51d0, req=req@entry=0x7f31885d6e90, offset=offset@entry=0, bytes=bytes@entry=512, align=align@entry=1, qiov=qiov@entry=0x7ffc10a91b20, f
+lags=0) at block/io.c:1086
#5 0x000000d3c012c6b8 in bdrv_co_preadv (child=0xd3c0ff51d0, offset=offset@entry=0, bytes=bytes@entry=512, qiov=qiov@entry=0x7ffc10a91b20, flags=flags@entry=0) at block/io.c:1182
#6 0x000000d3c011cc17 in blk_co_preadv (blk=0xd3c0ff4f80, offset=0, bytes=512, qiov=0x7ffc10a91b20, flags=0) at block/block-backend.c:1032
#7 0x000000d3c011ccec in blk_read_entry (opaque=0x7ffc10a91b40) at block/block-backend.c:1079
#8 0x000000d3c01bbb96 in coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) at util/coroutine-ucontext.c:79
#9 0x00007f3196cb8600 in __start_context () at /lib64/libc.so.6
The problem is that nbd_client_init() uses
nbd_client_attach_aio_context() -> aio_co_schedule(new_context,
client->read_reply_co). Execution of read_reply_co is deferred to a BH
which doesn't run until later.
In the mean time blk_co_preadv() can be called and nbd_coroutine_end()
calls aio_wake() on read_reply_co. At this point in time
read_reply_co's ctx isn't set because it has never been entered yet.
This patch simplifies the nbd_co_send_request() ->
nbd_co_receive_reply() -> nbd_coroutine_end() lifecycle to just
nbd_co_send_request() -> nbd_co_receive_reply(). The request is "ended"
if an error occurs at any point. Callers no longer have to invoke
nbd_coroutine_end().
This cleanup also eliminates the segfault because we don't call
aio_co_schedule() to wake up s->read_reply_co if sending the request
failed. It is only necessary to wake up s->read_reply_co if a reply was
received.
Note this only happens with UNIX domain sockets on Linux. It doesn't
seem possible to reproduce this with TCP sockets.
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20170829122745.14309-2-stefanha@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
2017-08-29 14:27:43 +02:00
|
|
|
s->requests[i].coroutine = NULL;
|
|
|
|
s->in_flight--;
|
|
|
|
qemu_co_queue_next(&s->free_sema);
|
nbd-client: Fix regression when server sends garbage
When we switched NBD to use coroutines for qemu 2.9 (in particular,
commit a12a712a), we introduced a regression: if a server sends us
garbage (such as a corrupted magic number), we quit the read loop
but do not stop sending further queued commands, resulting in the
client hanging when it never reads the response to those additional
commands. In qemu 2.8, we properly detected that the server is no
longer reliable, and cancelled all existing pending commands with
EIO, then tore down the socket so that all further command attempts
get EPIPE.
Restore the proper behavior of quitting (almost) all communication
with a broken server: Once we know we are out of sync or otherwise
can't trust the server, we must assume that any further incoming
data is unreliable and therefore end all pending commands with EIO,
and quit trying to send any further commands. As an exception, we
still (try to) send NBD_CMD_DISC to let the server know we are going
away (in part, because it is easier to do that than to further
refactor nbd_teardown_connection, and in part because it is the
only command where we do not have to wait for a reply).
Based on a patch by Vladimir Sementsov-Ogievskiy.
A malicious server can be created with the following hack,
followed by setting NBD_SERVER_DEBUG to a non-zero value in the
environment when running qemu-nbd:
| --- a/nbd/server.c
| +++ b/nbd/server.c
| @@ -919,6 +919,17 @@ static int nbd_send_reply(QIOChannel *ioc, NBDReply *reply, Error **errp)
| stl_be_p(buf + 4, reply->error);
| stq_be_p(buf + 8, reply->handle);
|
| + static int debug;
| + static int count;
| + if (!count++) {
| + const char *str = getenv("NBD_SERVER_DEBUG");
| + if (str) {
| + debug = atoi(str);
| + }
| + }
| + if (debug && !(count % debug)) {
| + buf[0] = 0;
| + }
| return nbd_write(ioc, buf, sizeof(buf), errp);
| }
Reported-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170814213426.24681-1-eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-08-14 23:34:26 +02:00
|
|
|
}
|
2013-12-01 22:23:41 +01:00
|
|
|
qemu_co_mutex_unlock(&s->send_mutex);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2017-10-27 12:40:37 +02:00
|
|
|
static inline uint16_t payload_advance16(uint8_t **payload)
|
|
|
|
{
|
|
|
|
*payload += 2;
|
|
|
|
return lduw_be_p(*payload - 2);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline uint32_t payload_advance32(uint8_t **payload)
|
|
|
|
{
|
|
|
|
*payload += 4;
|
|
|
|
return ldl_be_p(*payload - 4);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline uint64_t payload_advance64(uint8_t **payload)
|
|
|
|
{
|
|
|
|
*payload += 8;
|
|
|
|
return ldq_be_p(*payload - 8);
|
|
|
|
}
|
|
|
|
|
nbd/client: Trace server noncompliance on structured reads
Just as we recently added a trace for a server sending block status
that doesn't match the server's advertised minimum block alignment,
let's do the same for read chunks. But since qemu 3.1 is such a
server (because it advertised 512-byte alignment, but when serving a
file that ends in data but is not sector-aligned, NBD_CMD_READ would
detect a mid-sector change between data and hole at EOF and the
resulting read chunks are unaligned), we don't want to change our
behavior of otherwise tolerating unaligned reads.
Note that even though we fixed the server for 4.0 to advertise an
actual block alignment (which gets rid of the unaligned reads at EOF
for posix files), we can still trigger it via other means:
$ qemu-nbd --image-opts driver=blkdebug,align=512,image.driver=file,image.filename=/path/to/non-aligned-file
Arguably, that is a bug in the blkdebug block status function, for
leaking a block status that is not aligned. It may also be possible to
observe issues with a backing layer with smaller alignment than the
active layer, although so far I have been unable to write a reliable
iotest for that scenario.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190330165349.32256-1-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2019-03-30 17:53:49 +01:00
|
|
|
static int nbd_parse_offset_hole_payload(NBDClientSession *client,
|
|
|
|
NBDStructuredReplyChunk *chunk,
|
2017-10-27 12:40:37 +02:00
|
|
|
uint8_t *payload, uint64_t orig_offset,
|
|
|
|
QEMUIOVector *qiov, Error **errp)
|
|
|
|
{
|
|
|
|
uint64_t offset;
|
|
|
|
uint32_t hole_size;
|
|
|
|
|
|
|
|
if (chunk->length != sizeof(offset) + sizeof(hole_size)) {
|
|
|
|
error_setg(errp, "Protocol error: invalid payload for "
|
|
|
|
"NBD_REPLY_TYPE_OFFSET_HOLE");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
offset = payload_advance64(&payload);
|
|
|
|
hole_size = payload_advance32(&payload);
|
|
|
|
|
2017-11-08 22:57:02 +01:00
|
|
|
if (!hole_size || offset < orig_offset || hole_size > qiov->size ||
|
2017-10-27 12:40:37 +02:00
|
|
|
offset > orig_offset + qiov->size - hole_size) {
|
|
|
|
error_setg(errp, "Protocol error: server sent chunk exceeding requested"
|
|
|
|
" region");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
nbd/client: Trace server noncompliance on structured reads
Just as we recently added a trace for a server sending block status
that doesn't match the server's advertised minimum block alignment,
let's do the same for read chunks. But since qemu 3.1 is such a
server (because it advertised 512-byte alignment, but when serving a
file that ends in data but is not sector-aligned, NBD_CMD_READ would
detect a mid-sector change between data and hole at EOF and the
resulting read chunks are unaligned), we don't want to change our
behavior of otherwise tolerating unaligned reads.
Note that even though we fixed the server for 4.0 to advertise an
actual block alignment (which gets rid of the unaligned reads at EOF
for posix files), we can still trigger it via other means:
$ qemu-nbd --image-opts driver=blkdebug,align=512,image.driver=file,image.filename=/path/to/non-aligned-file
Arguably, that is a bug in the blkdebug block status function, for
leaking a block status that is not aligned. It may also be possible to
observe issues with a backing layer with smaller alignment than the
active layer, although so far I have been unable to write a reliable
iotest for that scenario.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190330165349.32256-1-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2019-03-30 17:53:49 +01:00
|
|
|
if (client->info.min_block &&
|
|
|
|
!QEMU_IS_ALIGNED(hole_size, client->info.min_block)) {
|
|
|
|
trace_nbd_structured_read_compliance("hole");
|
|
|
|
}
|
2017-10-27 12:40:37 +02:00
|
|
|
|
|
|
|
qemu_iovec_memset(qiov, offset - orig_offset, 0, hole_size);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-03-12 16:21:23 +01:00
|
|
|
/* nbd_parse_blockstatus_payload
|
nbd: Tolerate some server non-compliance in NBD_CMD_BLOCK_STATUS
The NBD spec states that NBD_CMD_FLAG_REQ_ONE (which we currently
always use) should not reply with an extent larger than our request,
and that the server's response should be exactly one extent. Right
now, that means that if a server sends more than one extent, we treat
the server as broken, fail the block status request, and disconnect,
which prevents all further use of the block device. But while good
software should be strict in what it sends, it should be tolerant in
what it receives.
While trying to implement NBD_CMD_BLOCK_STATUS in nbdkit, we
temporarily had a non-compliant server sending too many extents in
spite of REQ_ONE. Oddly enough, 'qemu-img convert' with qemu 3.1
failed with a somewhat useful message:
qemu-img: Protocol error: invalid payload for NBD_REPLY_TYPE_BLOCK_STATUS
which then disappeared with commit d8b4bad8, on the grounds that an
error message flagged only at the time of coroutine teardown is
pointless, and instead we should rely on the actual failed API to
report an error - in other words, the 3.1 behavior was masking the
fact that qemu-img was not reporting an error. That has since been
fixed in the previous patch, where qemu-img convert now fails with:
qemu-img: error while reading block status of sector 0: Invalid argument
But even that is harsh. Since we already partially relaxed things in
commit acfd8f7a to tolerate a server that exceeds the cap (although
that change was made prior to the NBD spec actually putting a cap on
the extent length during REQ_ONE - in fact, the NBD spec change was
BECAUSE of the qemu behavior prior to that commit), it's not that much
harder to argue that we should also tolerate a server that sends too
many extents. But at the same time, it's nice to trace when we are
being tolerant of server non-compliance, in order to help server
writers fix their implementations to be more portable (if they refer
to our traces, rather than just stderr).
Reported-by: Richard W.M. Jones <rjones@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190323212639.579-3-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2019-03-23 22:26:39 +01:00
|
|
|
* Based on our request, we expect only one extent in reply, for the
|
|
|
|
* base:allocation context.
|
2018-03-12 16:21:23 +01:00
|
|
|
*/
|
|
|
|
static int nbd_parse_blockstatus_payload(NBDClientSession *client,
|
|
|
|
NBDStructuredReplyChunk *chunk,
|
|
|
|
uint8_t *payload, uint64_t orig_length,
|
|
|
|
NBDExtent *extent, Error **errp)
|
|
|
|
{
|
|
|
|
uint32_t context_id;
|
|
|
|
|
nbd: Tolerate some server non-compliance in NBD_CMD_BLOCK_STATUS
The NBD spec states that NBD_CMD_FLAG_REQ_ONE (which we currently
always use) should not reply with an extent larger than our request,
and that the server's response should be exactly one extent. Right
now, that means that if a server sends more than one extent, we treat
the server as broken, fail the block status request, and disconnect,
which prevents all further use of the block device. But while good
software should be strict in what it sends, it should be tolerant in
what it receives.
While trying to implement NBD_CMD_BLOCK_STATUS in nbdkit, we
temporarily had a non-compliant server sending too many extents in
spite of REQ_ONE. Oddly enough, 'qemu-img convert' with qemu 3.1
failed with a somewhat useful message:
qemu-img: Protocol error: invalid payload for NBD_REPLY_TYPE_BLOCK_STATUS
which then disappeared with commit d8b4bad8, on the grounds that an
error message flagged only at the time of coroutine teardown is
pointless, and instead we should rely on the actual failed API to
report an error - in other words, the 3.1 behavior was masking the
fact that qemu-img was not reporting an error. That has since been
fixed in the previous patch, where qemu-img convert now fails with:
qemu-img: error while reading block status of sector 0: Invalid argument
But even that is harsh. Since we already partially relaxed things in
commit acfd8f7a to tolerate a server that exceeds the cap (although
that change was made prior to the NBD spec actually putting a cap on
the extent length during REQ_ONE - in fact, the NBD spec change was
BECAUSE of the qemu behavior prior to that commit), it's not that much
harder to argue that we should also tolerate a server that sends too
many extents. But at the same time, it's nice to trace when we are
being tolerant of server non-compliance, in order to help server
writers fix their implementations to be more portable (if they refer
to our traces, rather than just stderr).
Reported-by: Richard W.M. Jones <rjones@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190323212639.579-3-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2019-03-23 22:26:39 +01:00
|
|
|
/* The server succeeded, so it must have sent [at least] one extent */
|
|
|
|
if (chunk->length < sizeof(context_id) + sizeof(*extent)) {
|
2018-03-12 16:21:23 +01:00
|
|
|
error_setg(errp, "Protocol error: invalid payload for "
|
|
|
|
"NBD_REPLY_TYPE_BLOCK_STATUS");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
context_id = payload_advance32(&payload);
|
2019-01-17 20:36:47 +01:00
|
|
|
if (client->info.context_id != context_id) {
|
2018-03-12 16:21:23 +01:00
|
|
|
error_setg(errp, "Protocol error: unexpected context id %d for "
|
|
|
|
"NBD_REPLY_TYPE_BLOCK_STATUS, when negotiated context "
|
|
|
|
"id is %d", context_id,
|
2019-01-17 20:36:47 +01:00
|
|
|
client->info.context_id);
|
2018-03-12 16:21:23 +01:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
extent->length = payload_advance32(&payload);
|
|
|
|
extent->flags = payload_advance32(&payload);
|
|
|
|
|
nbd-client: Work around server BLOCK_STATUS misalignment at EOF
The NBD spec is clear that a server that advertises a minimum block
size should reply to NBD_CMD_BLOCK_STATUS with extents aligned
accordingly. However, we know that the qemu NBD server implementation
has had a corner-case bug where it is not compliant with the spec,
present since the introduction of NBD_CMD_BLOCK_STATUS in qemu 2.12
(and unlikely to be patched in time for 4.0). Namely, when qemu is
serving a file that is not a multiple of 512 bytes, it rounds the size
advertised over NBD up to the next sector boundary (someday, I'd like
to fix that to be byte-accurate, but it's a much bigger audit not
appropriate for this release); yet if the final sector contains data
prior to EOF, lseek(SEEK_HOLE) will point to the implicit hole
mid-sector which qemu then reported over NBD.
We are well within our rights to hang up on a server that can't follow
the spec, but it is more useful to try and keep the connection alive
in spite of the problem. Do so by tracing a message about the problem,
and then either truncating the request back to an aligned boundary (if
it covered more than the final sector) or widening it out to the full
boundary with a forced status of data (since truncating would result
in 0 bytes, but we have to make progress, and valid since data is a
default-safe answer). And in practice, since the problem only happens
on a sector that starts with data and ends with a hole, we are going
to want to read that full sector anyway (where qemu as the server
fills in the tail beyond EOF with appropriate NUL bytes).
Easy reproduction:
$ printf %1000d 1 > file
$ qemu-nbd -f raw -t file & pid=$!
$ qemu-img map --output=json -f raw nbd://localhost:10809
qemu-img: Could not read file metadata: Invalid argument
$ kill $pid
where the patched version instead succeeds with:
[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true}]
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190326171317.4036-1-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2019-03-26 18:13:17 +01:00
|
|
|
if (extent->length == 0) {
|
2018-03-12 16:21:23 +01:00
|
|
|
error_setg(errp, "Protocol error: server sent status chunk with "
|
nbd-client: Work around server BLOCK_STATUS misalignment at EOF
The NBD spec is clear that a server that advertises a minimum block
size should reply to NBD_CMD_BLOCK_STATUS with extents aligned
accordingly. However, we know that the qemu NBD server implementation
has had a corner-case bug where it is not compliant with the spec,
present since the introduction of NBD_CMD_BLOCK_STATUS in qemu 2.12
(and unlikely to be patched in time for 4.0). Namely, when qemu is
serving a file that is not a multiple of 512 bytes, it rounds the size
advertised over NBD up to the next sector boundary (someday, I'd like
to fix that to be byte-accurate, but it's a much bigger audit not
appropriate for this release); yet if the final sector contains data
prior to EOF, lseek(SEEK_HOLE) will point to the implicit hole
mid-sector which qemu then reported over NBD.
We are well within our rights to hang up on a server that can't follow
the spec, but it is more useful to try and keep the connection alive
in spite of the problem. Do so by tracing a message about the problem,
and then either truncating the request back to an aligned boundary (if
it covered more than the final sector) or widening it out to the full
boundary with a forced status of data (since truncating would result
in 0 bytes, but we have to make progress, and valid since data is a
default-safe answer). And in practice, since the problem only happens
on a sector that starts with data and ends with a hole, we are going
to want to read that full sector anyway (where qemu as the server
fills in the tail beyond EOF with appropriate NUL bytes).
Easy reproduction:
$ printf %1000d 1 > file
$ qemu-nbd -f raw -t file & pid=$!
$ qemu-img map --output=json -f raw nbd://localhost:10809
qemu-img: Could not read file metadata: Invalid argument
$ kill $pid
where the patched version instead succeeds with:
[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true}]
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190326171317.4036-1-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2019-03-26 18:13:17 +01:00
|
|
|
"zero length");
|
2018-03-12 16:21:23 +01:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
nbd-client: Work around server BLOCK_STATUS misalignment at EOF
The NBD spec is clear that a server that advertises a minimum block
size should reply to NBD_CMD_BLOCK_STATUS with extents aligned
accordingly. However, we know that the qemu NBD server implementation
has had a corner-case bug where it is not compliant with the spec,
present since the introduction of NBD_CMD_BLOCK_STATUS in qemu 2.12
(and unlikely to be patched in time for 4.0). Namely, when qemu is
serving a file that is not a multiple of 512 bytes, it rounds the size
advertised over NBD up to the next sector boundary (someday, I'd like
to fix that to be byte-accurate, but it's a much bigger audit not
appropriate for this release); yet if the final sector contains data
prior to EOF, lseek(SEEK_HOLE) will point to the implicit hole
mid-sector which qemu then reported over NBD.
We are well within our rights to hang up on a server that can't follow
the spec, but it is more useful to try and keep the connection alive
in spite of the problem. Do so by tracing a message about the problem,
and then either truncating the request back to an aligned boundary (if
it covered more than the final sector) or widening it out to the full
boundary with a forced status of data (since truncating would result
in 0 bytes, but we have to make progress, and valid since data is a
default-safe answer). And in practice, since the problem only happens
on a sector that starts with data and ends with a hole, we are going
to want to read that full sector anyway (where qemu as the server
fills in the tail beyond EOF with appropriate NUL bytes).
Easy reproduction:
$ printf %1000d 1 > file
$ qemu-nbd -f raw -t file & pid=$!
$ qemu-img map --output=json -f raw nbd://localhost:10809
qemu-img: Could not read file metadata: Invalid argument
$ kill $pid
where the patched version instead succeeds with:
[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true}]
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190326171317.4036-1-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2019-03-26 18:13:17 +01:00
|
|
|
/*
|
|
|
|
* A server sending unaligned block status is in violation of the
|
|
|
|
* protocol, but as qemu-nbd 3.1 is such a server (at least for
|
|
|
|
* POSIX files that are not a multiple of 512 bytes, since qemu
|
|
|
|
* rounds files up to 512-byte multiples but lseek(SEEK_HOLE)
|
|
|
|
* still sees an implicit hole beyond the real EOF), it's nicer to
|
|
|
|
* work around the misbehaving server. If the request included
|
|
|
|
* more than the final unaligned block, truncate it back to an
|
|
|
|
* aligned result; if the request was only the final block, round
|
|
|
|
* up to the full block and change the status to fully-allocated
|
|
|
|
* (always a safe status, even if it loses information).
|
|
|
|
*/
|
|
|
|
if (client->info.min_block && !QEMU_IS_ALIGNED(extent->length,
|
|
|
|
client->info.min_block)) {
|
|
|
|
trace_nbd_parse_blockstatus_compliance("extent length is unaligned");
|
|
|
|
if (extent->length > client->info.min_block) {
|
|
|
|
extent->length = QEMU_ALIGN_DOWN(extent->length,
|
|
|
|
client->info.min_block);
|
|
|
|
} else {
|
|
|
|
extent->length = client->info.min_block;
|
|
|
|
extent->flags = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
nbd: Tolerate some server non-compliance in NBD_CMD_BLOCK_STATUS
The NBD spec states that NBD_CMD_FLAG_REQ_ONE (which we currently
always use) should not reply with an extent larger than our request,
and that the server's response should be exactly one extent. Right
now, that means that if a server sends more than one extent, we treat
the server as broken, fail the block status request, and disconnect,
which prevents all further use of the block device. But while good
software should be strict in what it sends, it should be tolerant in
what it receives.
While trying to implement NBD_CMD_BLOCK_STATUS in nbdkit, we
temporarily had a non-compliant server sending too many extents in
spite of REQ_ONE. Oddly enough, 'qemu-img convert' with qemu 3.1
failed with a somewhat useful message:
qemu-img: Protocol error: invalid payload for NBD_REPLY_TYPE_BLOCK_STATUS
which then disappeared with commit d8b4bad8, on the grounds that an
error message flagged only at the time of coroutine teardown is
pointless, and instead we should rely on the actual failed API to
report an error - in other words, the 3.1 behavior was masking the
fact that qemu-img was not reporting an error. That has since been
fixed in the previous patch, where qemu-img convert now fails with:
qemu-img: error while reading block status of sector 0: Invalid argument
But even that is harsh. Since we already partially relaxed things in
commit acfd8f7a to tolerate a server that exceeds the cap (although
that change was made prior to the NBD spec actually putting a cap on
the extent length during REQ_ONE - in fact, the NBD spec change was
BECAUSE of the qemu behavior prior to that commit), it's not that much
harder to argue that we should also tolerate a server that sends too
many extents. But at the same time, it's nice to trace when we are
being tolerant of server non-compliance, in order to help server
writers fix their implementations to be more portable (if they refer
to our traces, rather than just stderr).
Reported-by: Richard W.M. Jones <rjones@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190323212639.579-3-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2019-03-23 22:26:39 +01:00
|
|
|
/*
|
|
|
|
* We used NBD_CMD_FLAG_REQ_ONE, so the server should not have
|
|
|
|
* sent us any more than one extent, nor should it have included
|
|
|
|
* status beyond our request in that extent. However, it's easy
|
|
|
|
* enough to ignore the server's noncompliance without killing the
|
|
|
|
* connection; just ignore trailing extents, and clamp things to
|
|
|
|
* the length of our request.
|
|
|
|
*/
|
|
|
|
if (chunk->length > sizeof(context_id) + sizeof(*extent)) {
|
|
|
|
trace_nbd_parse_blockstatus_compliance("more than one extent");
|
|
|
|
}
|
2018-05-04 00:26:26 +02:00
|
|
|
if (extent->length > orig_length) {
|
|
|
|
extent->length = orig_length;
|
nbd: Tolerate some server non-compliance in NBD_CMD_BLOCK_STATUS
The NBD spec states that NBD_CMD_FLAG_REQ_ONE (which we currently
always use) should not reply with an extent larger than our request,
and that the server's response should be exactly one extent. Right
now, that means that if a server sends more than one extent, we treat
the server as broken, fail the block status request, and disconnect,
which prevents all further use of the block device. But while good
software should be strict in what it sends, it should be tolerant in
what it receives.
While trying to implement NBD_CMD_BLOCK_STATUS in nbdkit, we
temporarily had a non-compliant server sending too many extents in
spite of REQ_ONE. Oddly enough, 'qemu-img convert' with qemu 3.1
failed with a somewhat useful message:
qemu-img: Protocol error: invalid payload for NBD_REPLY_TYPE_BLOCK_STATUS
which then disappeared with commit d8b4bad8, on the grounds that an
error message flagged only at the time of coroutine teardown is
pointless, and instead we should rely on the actual failed API to
report an error - in other words, the 3.1 behavior was masking the
fact that qemu-img was not reporting an error. That has since been
fixed in the previous patch, where qemu-img convert now fails with:
qemu-img: error while reading block status of sector 0: Invalid argument
But even that is harsh. Since we already partially relaxed things in
commit acfd8f7a to tolerate a server that exceeds the cap (although
that change was made prior to the NBD spec actually putting a cap on
the extent length during REQ_ONE - in fact, the NBD spec change was
BECAUSE of the qemu behavior prior to that commit), it's not that much
harder to argue that we should also tolerate a server that sends too
many extents. But at the same time, it's nice to trace when we are
being tolerant of server non-compliance, in order to help server
writers fix their implementations to be more portable (if they refer
to our traces, rather than just stderr).
Reported-by: Richard W.M. Jones <rjones@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190323212639.579-3-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2019-03-23 22:26:39 +01:00
|
|
|
trace_nbd_parse_blockstatus_compliance("extent length too large");
|
2018-05-04 00:26:26 +02:00
|
|
|
}
|
|
|
|
|
2018-03-12 16:21:23 +01:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-10-27 12:40:37 +02:00
|
|
|
/* nbd_parse_error_payload
|
|
|
|
* on success @errp contains message describing nbd error reply
|
|
|
|
*/
|
|
|
|
static int nbd_parse_error_payload(NBDStructuredReplyChunk *chunk,
|
|
|
|
uint8_t *payload, int *request_ret,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
uint32_t error;
|
|
|
|
uint16_t message_size;
|
|
|
|
|
|
|
|
assert(chunk->type & (1 << 15));
|
|
|
|
|
|
|
|
if (chunk->length < sizeof(error) + sizeof(message_size)) {
|
|
|
|
error_setg(errp,
|
|
|
|
"Protocol error: invalid payload for structured error");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
error = nbd_errno_to_system_errno(payload_advance32(&payload));
|
|
|
|
if (error == 0) {
|
2017-11-08 22:56:57 +01:00
|
|
|
error_setg(errp, "Protocol error: server sent structured error chunk "
|
2017-10-27 12:40:37 +02:00
|
|
|
"with error = 0");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
*request_ret = -error;
|
|
|
|
message_size = payload_advance16(&payload);
|
|
|
|
|
|
|
|
if (message_size > chunk->length - sizeof(error) - sizeof(message_size)) {
|
2017-11-08 22:56:57 +01:00
|
|
|
error_setg(errp, "Protocol error: server sent structured error chunk "
|
2017-10-27 12:40:37 +02:00
|
|
|
"with incorrect message size");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* TODO: Add a trace point to mention the server complaint */
|
|
|
|
|
|
|
|
/* TODO handle ERROR_OFFSET */
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nbd_co_receive_offset_data_payload(NBDClientSession *s,
|
|
|
|
uint64_t orig_offset,
|
|
|
|
QEMUIOVector *qiov, Error **errp)
|
|
|
|
{
|
|
|
|
QEMUIOVector sub_qiov;
|
|
|
|
uint64_t offset;
|
|
|
|
size_t data_size;
|
|
|
|
int ret;
|
|
|
|
NBDStructuredReplyChunk *chunk = &s->reply.structured;
|
|
|
|
|
|
|
|
assert(nbd_reply_is_structured(&s->reply));
|
|
|
|
|
2017-11-08 22:57:02 +01:00
|
|
|
/* The NBD spec requires at least one byte of payload */
|
|
|
|
if (chunk->length <= sizeof(offset)) {
|
2017-10-27 12:40:37 +02:00
|
|
|
error_setg(errp, "Protocol error: invalid payload for "
|
|
|
|
"NBD_REPLY_TYPE_OFFSET_DATA");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2019-01-28 17:58:30 +01:00
|
|
|
if (nbd_read64(s->ioc, &offset, "OFFSET_DATA offset", errp) < 0) {
|
2017-10-27 12:40:37 +02:00
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
|
|
|
|
data_size = chunk->length - sizeof(offset);
|
2017-11-08 22:57:02 +01:00
|
|
|
assert(data_size);
|
2017-10-27 12:40:37 +02:00
|
|
|
if (offset < orig_offset || data_size > qiov->size ||
|
|
|
|
offset > orig_offset + qiov->size - data_size) {
|
|
|
|
error_setg(errp, "Protocol error: server sent chunk exceeding requested"
|
|
|
|
" region");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
nbd/client: Trace server noncompliance on structured reads
Just as we recently added a trace for a server sending block status
that doesn't match the server's advertised minimum block alignment,
let's do the same for read chunks. But since qemu 3.1 is such a
server (because it advertised 512-byte alignment, but when serving a
file that ends in data but is not sector-aligned, NBD_CMD_READ would
detect a mid-sector change between data and hole at EOF and the
resulting read chunks are unaligned), we don't want to change our
behavior of otherwise tolerating unaligned reads.
Note that even though we fixed the server for 4.0 to advertise an
actual block alignment (which gets rid of the unaligned reads at EOF
for posix files), we can still trigger it via other means:
$ qemu-nbd --image-opts driver=blkdebug,align=512,image.driver=file,image.filename=/path/to/non-aligned-file
Arguably, that is a bug in the blkdebug block status function, for
leaking a block status that is not aligned. It may also be possible to
observe issues with a backing layer with smaller alignment than the
active layer, although so far I have been unable to write a reliable
iotest for that scenario.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190330165349.32256-1-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2019-03-30 17:53:49 +01:00
|
|
|
if (s->info.min_block && !QEMU_IS_ALIGNED(data_size, s->info.min_block)) {
|
|
|
|
trace_nbd_structured_read_compliance("data");
|
|
|
|
}
|
2017-10-27 12:40:37 +02:00
|
|
|
|
|
|
|
qemu_iovec_init(&sub_qiov, qiov->niov);
|
|
|
|
qemu_iovec_concat(&sub_qiov, qiov, offset - orig_offset, data_size);
|
|
|
|
ret = qio_channel_readv_all(s->ioc, sub_qiov.iov, sub_qiov.niov, errp);
|
|
|
|
qemu_iovec_destroy(&sub_qiov);
|
|
|
|
|
|
|
|
return ret < 0 ? -EIO : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#define NBD_MAX_MALLOC_PAYLOAD 1000
|
|
|
|
/* nbd_co_receive_structured_payload
|
|
|
|
*/
|
|
|
|
static coroutine_fn int nbd_co_receive_structured_payload(
|
|
|
|
NBDClientSession *s, void **payload, Error **errp)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
uint32_t len;
|
|
|
|
|
|
|
|
assert(nbd_reply_is_structured(&s->reply));
|
|
|
|
|
|
|
|
len = s->reply.structured.length;
|
|
|
|
|
|
|
|
if (len == 0) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (payload == NULL) {
|
|
|
|
error_setg(errp, "Unexpected structured payload");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (len > NBD_MAX_MALLOC_PAYLOAD) {
|
|
|
|
error_setg(errp, "Payload too large");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
*payload = g_new(char, len);
|
2019-01-28 17:58:30 +01:00
|
|
|
ret = nbd_read(s->ioc, *payload, len, "structured payload", errp);
|
2017-10-27 12:40:37 +02:00
|
|
|
if (ret < 0) {
|
|
|
|
g_free(*payload);
|
|
|
|
*payload = NULL;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* nbd_co_do_receive_one_chunk
|
|
|
|
* for simple reply:
|
|
|
|
* set request_ret to received reply error
|
|
|
|
* if qiov is not NULL: read payload to @qiov
|
|
|
|
* for structured reply chunk:
|
|
|
|
* if error chunk: read payload, set @request_ret, do not set @payload
|
|
|
|
* else if offset_data chunk: read payload data to @qiov, do not set @payload
|
|
|
|
* else: read payload to @payload
|
|
|
|
*
|
|
|
|
* If function fails, @errp contains corresponding error message, and the
|
|
|
|
* connection with the server is suspect. If it returns 0, then the
|
|
|
|
* transaction succeeded (although @request_ret may be a negative errno
|
|
|
|
* corresponding to the server's error reply), and errp is unchanged.
|
|
|
|
*/
|
|
|
|
static coroutine_fn int nbd_co_do_receive_one_chunk(
|
|
|
|
NBDClientSession *s, uint64_t handle, bool only_structured,
|
|
|
|
int *request_ret, QEMUIOVector *qiov, void **payload, Error **errp)
|
2013-12-01 22:23:41 +01:00
|
|
|
{
|
2017-09-20 14:45:05 +02:00
|
|
|
int ret;
|
2017-10-12 11:53:08 +02:00
|
|
|
int i = HANDLE_TO_INDEX(s, handle);
|
2017-10-27 12:40:37 +02:00
|
|
|
void *local_payload = NULL;
|
|
|
|
NBDStructuredReplyChunk *chunk;
|
|
|
|
|
|
|
|
if (payload) {
|
|
|
|
*payload = NULL;
|
|
|
|
}
|
|
|
|
*request_ret = 0;
|
2013-12-01 22:23:41 +01:00
|
|
|
|
2019-02-01 14:01:38 +01:00
|
|
|
/* Wait until we're woken up by nbd_connection_entry. */
|
2017-08-22 14:51:13 +02:00
|
|
|
s->requests[i].receiving = true;
|
2013-12-01 22:23:41 +01:00
|
|
|
qemu_coroutine_yield();
|
2017-08-22 14:51:13 +02:00
|
|
|
s->requests[i].receiving = false;
|
2019-02-01 14:01:37 +01:00
|
|
|
if (s->quit) {
|
2017-10-27 12:40:37 +02:00
|
|
|
error_setg(errp, "Connection closed");
|
|
|
|
return -EIO;
|
|
|
|
}
|
2019-02-01 14:01:37 +01:00
|
|
|
assert(s->ioc);
|
2017-10-27 12:40:37 +02:00
|
|
|
|
|
|
|
assert(s->reply.handle == handle);
|
|
|
|
|
|
|
|
if (nbd_reply_is_simple(&s->reply)) {
|
|
|
|
if (only_structured) {
|
|
|
|
error_setg(errp, "Protocol error: simple reply when structured "
|
|
|
|
"reply chunk was expected");
|
|
|
|
return -EINVAL;
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
|
|
|
|
2017-10-27 12:40:37 +02:00
|
|
|
*request_ret = -nbd_errno_to_system_errno(s->reply.simple.error);
|
|
|
|
if (*request_ret < 0 || !qiov) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return qio_channel_readv_all(s->ioc, qiov->iov, qiov->niov,
|
|
|
|
errp) < 0 ? -EIO : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* handle structured reply chunk */
|
|
|
|
assert(s->info.structured_reply);
|
|
|
|
chunk = &s->reply.structured;
|
|
|
|
|
|
|
|
if (chunk->type == NBD_REPLY_TYPE_NONE) {
|
|
|
|
if (!(chunk->flags & NBD_REPLY_FLAG_DONE)) {
|
|
|
|
error_setg(errp, "Protocol error: NBD_REPLY_TYPE_NONE chunk without"
|
2017-11-08 22:56:57 +01:00
|
|
|
" NBD_REPLY_FLAG_DONE flag set");
|
2017-10-27 12:40:37 +02:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
2017-11-08 22:57:02 +01:00
|
|
|
if (chunk->length) {
|
|
|
|
error_setg(errp, "Protocol error: NBD_REPLY_TYPE_NONE chunk with"
|
|
|
|
" nonzero length");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2017-10-27 12:40:37 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (chunk->type == NBD_REPLY_TYPE_OFFSET_DATA) {
|
|
|
|
if (!qiov) {
|
|
|
|
error_setg(errp, "Unexpected NBD_REPLY_TYPE_OFFSET_DATA chunk");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return nbd_co_receive_offset_data_payload(s, s->requests[i].offset,
|
|
|
|
qiov, errp);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nbd_reply_type_is_error(chunk->type)) {
|
|
|
|
payload = &local_payload;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = nbd_co_receive_structured_payload(s, payload, errp);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
2017-02-13 14:52:24 +01:00
|
|
|
|
2017-10-27 12:40:37 +02:00
|
|
|
if (nbd_reply_type_is_error(chunk->type)) {
|
|
|
|
ret = nbd_parse_error_payload(chunk, local_payload, request_ret, errp);
|
|
|
|
g_free(local_payload);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* nbd_co_receive_one_chunk
|
2019-02-01 14:01:38 +01:00
|
|
|
* Read reply, wake up connection_co and set s->quit if needed.
|
2017-10-27 12:40:37 +02:00
|
|
|
* Return value is a fatal error code or normal nbd reply error code
|
|
|
|
*/
|
|
|
|
static coroutine_fn int nbd_co_receive_one_chunk(
|
|
|
|
NBDClientSession *s, uint64_t handle, bool only_structured,
|
2019-02-01 14:01:33 +01:00
|
|
|
int *request_ret, QEMUIOVector *qiov, NBDReply *reply, void **payload,
|
|
|
|
Error **errp)
|
2017-10-27 12:40:37 +02:00
|
|
|
{
|
|
|
|
int ret = nbd_co_do_receive_one_chunk(s, handle, only_structured,
|
2019-02-01 14:01:33 +01:00
|
|
|
request_ret, qiov, payload, errp);
|
2017-10-27 12:40:37 +02:00
|
|
|
|
|
|
|
if (ret < 0) {
|
|
|
|
s->quit = true;
|
|
|
|
} else {
|
2019-02-01 14:01:38 +01:00
|
|
|
/* For assert at loop start in nbd_connection_entry */
|
2017-10-27 12:40:37 +02:00
|
|
|
if (reply) {
|
|
|
|
*reply = s->reply;
|
|
|
|
}
|
|
|
|
s->reply.handle = 0;
|
|
|
|
}
|
2017-02-13 14:52:24 +01:00
|
|
|
|
2019-02-01 14:01:38 +01:00
|
|
|
if (s->connection_co) {
|
|
|
|
aio_co_wake(s->connection_co);
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
2017-06-01 12:44:56 +02:00
|
|
|
|
2017-10-27 12:40:37 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
typedef struct NBDReplyChunkIter {
|
|
|
|
int ret;
|
2019-02-01 14:01:33 +01:00
|
|
|
int request_ret;
|
2017-10-27 12:40:37 +02:00
|
|
|
Error *err;
|
|
|
|
bool done, only_structured;
|
|
|
|
} NBDReplyChunkIter;
|
|
|
|
|
2019-02-01 14:01:33 +01:00
|
|
|
static void nbd_iter_channel_error(NBDReplyChunkIter *iter,
|
|
|
|
int ret, Error **local_err)
|
2017-10-27 12:40:37 +02:00
|
|
|
{
|
|
|
|
assert(ret < 0);
|
|
|
|
|
2019-02-01 14:01:33 +01:00
|
|
|
if (!iter->ret) {
|
2017-10-27 12:40:37 +02:00
|
|
|
iter->ret = ret;
|
|
|
|
error_propagate(&iter->err, *local_err);
|
|
|
|
} else {
|
|
|
|
error_free(*local_err);
|
|
|
|
}
|
|
|
|
|
|
|
|
*local_err = NULL;
|
|
|
|
}
|
|
|
|
|
2019-02-01 14:01:33 +01:00
|
|
|
static void nbd_iter_request_error(NBDReplyChunkIter *iter, int ret)
|
|
|
|
{
|
|
|
|
assert(ret < 0);
|
|
|
|
|
|
|
|
if (!iter->request_ret) {
|
|
|
|
iter->request_ret = ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-10-27 12:40:37 +02:00
|
|
|
/* NBD_FOREACH_REPLY_CHUNK
|
|
|
|
*/
|
|
|
|
#define NBD_FOREACH_REPLY_CHUNK(s, iter, handle, structured, \
|
|
|
|
qiov, reply, payload) \
|
|
|
|
for (iter = (NBDReplyChunkIter) { .only_structured = structured }; \
|
|
|
|
nbd_reply_chunk_iter_receive(s, &iter, handle, qiov, reply, payload);)
|
|
|
|
|
|
|
|
/* nbd_reply_chunk_iter_receive
|
|
|
|
*/
|
|
|
|
static bool nbd_reply_chunk_iter_receive(NBDClientSession *s,
|
|
|
|
NBDReplyChunkIter *iter,
|
|
|
|
uint64_t handle,
|
|
|
|
QEMUIOVector *qiov, NBDReply *reply,
|
|
|
|
void **payload)
|
|
|
|
{
|
2019-02-01 14:01:33 +01:00
|
|
|
int ret, request_ret;
|
2017-10-27 12:40:37 +02:00
|
|
|
NBDReply local_reply;
|
|
|
|
NBDStructuredReplyChunk *chunk;
|
|
|
|
Error *local_err = NULL;
|
|
|
|
if (s->quit) {
|
|
|
|
error_setg(&local_err, "Connection closed");
|
2019-02-01 14:01:33 +01:00
|
|
|
nbd_iter_channel_error(iter, -EIO, &local_err);
|
2017-10-27 12:40:37 +02:00
|
|
|
goto break_loop;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (iter->done) {
|
|
|
|
/* Previous iteration was last. */
|
|
|
|
goto break_loop;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (reply == NULL) {
|
|
|
|
reply = &local_reply;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = nbd_co_receive_one_chunk(s, handle, iter->only_structured,
|
2019-02-01 14:01:33 +01:00
|
|
|
&request_ret, qiov, reply, payload,
|
|
|
|
&local_err);
|
2017-10-27 12:40:37 +02:00
|
|
|
if (ret < 0) {
|
2019-02-01 14:01:33 +01:00
|
|
|
nbd_iter_channel_error(iter, ret, &local_err);
|
|
|
|
} else if (request_ret < 0) {
|
|
|
|
nbd_iter_request_error(iter, request_ret);
|
2017-10-27 12:40:37 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Do not execute the body of NBD_FOREACH_REPLY_CHUNK for simple reply. */
|
2019-02-01 14:01:36 +01:00
|
|
|
if (nbd_reply_is_simple(reply) || s->quit) {
|
2017-10-27 12:40:37 +02:00
|
|
|
goto break_loop;
|
|
|
|
}
|
|
|
|
|
|
|
|
chunk = &reply->structured;
|
|
|
|
iter->only_structured = true;
|
|
|
|
|
|
|
|
if (chunk->type == NBD_REPLY_TYPE_NONE) {
|
|
|
|
/* NBD_REPLY_FLAG_DONE is already checked in nbd_co_receive_one_chunk */
|
|
|
|
assert(chunk->flags & NBD_REPLY_FLAG_DONE);
|
|
|
|
goto break_loop;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (chunk->flags & NBD_REPLY_FLAG_DONE) {
|
|
|
|
/* This iteration is last. */
|
|
|
|
iter->done = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Execute the loop body */
|
|
|
|
return true;
|
|
|
|
|
|
|
|
break_loop:
|
|
|
|
s->requests[HANDLE_TO_INDEX(s, handle)].coroutine = NULL;
|
|
|
|
|
2017-06-01 12:44:56 +02:00
|
|
|
qemu_co_mutex_lock(&s->send_mutex);
|
|
|
|
s->in_flight--;
|
|
|
|
qemu_co_queue_next(&s->free_sema);
|
|
|
|
qemu_co_mutex_unlock(&s->send_mutex);
|
2017-09-20 14:45:05 +02:00
|
|
|
|
2017-10-27 12:40:37 +02:00
|
|
|
return false;
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
|
|
|
|
2017-10-27 12:40:37 +02:00
|
|
|
static int nbd_co_receive_return_code(NBDClientSession *s, uint64_t handle,
|
2019-02-01 14:01:33 +01:00
|
|
|
int *request_ret, Error **errp)
|
2017-10-27 12:40:37 +02:00
|
|
|
{
|
|
|
|
NBDReplyChunkIter iter;
|
|
|
|
|
|
|
|
NBD_FOREACH_REPLY_CHUNK(s, iter, handle, false, NULL, NULL, NULL) {
|
|
|
|
/* nbd_reply_chunk_iter_receive does all the work */
|
|
|
|
}
|
|
|
|
|
|
|
|
error_propagate(errp, iter.err);
|
2019-02-01 14:01:33 +01:00
|
|
|
*request_ret = iter.request_ret;
|
2017-10-27 12:40:37 +02:00
|
|
|
return iter.ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nbd_co_receive_cmdread_reply(NBDClientSession *s, uint64_t handle,
|
|
|
|
uint64_t offset, QEMUIOVector *qiov,
|
2019-02-01 14:01:33 +01:00
|
|
|
int *request_ret, Error **errp)
|
2017-10-27 12:40:37 +02:00
|
|
|
{
|
|
|
|
NBDReplyChunkIter iter;
|
|
|
|
NBDReply reply;
|
|
|
|
void *payload = NULL;
|
|
|
|
Error *local_err = NULL;
|
|
|
|
|
|
|
|
NBD_FOREACH_REPLY_CHUNK(s, iter, handle, s->info.structured_reply,
|
|
|
|
qiov, &reply, &payload)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
NBDStructuredReplyChunk *chunk = &reply.structured;
|
|
|
|
|
|
|
|
assert(nbd_reply_is_structured(&reply));
|
|
|
|
|
|
|
|
switch (chunk->type) {
|
|
|
|
case NBD_REPLY_TYPE_OFFSET_DATA:
|
|
|
|
/* special cased in nbd_co_receive_one_chunk, data is already
|
|
|
|
* in qiov */
|
|
|
|
break;
|
|
|
|
case NBD_REPLY_TYPE_OFFSET_HOLE:
|
nbd/client: Trace server noncompliance on structured reads
Just as we recently added a trace for a server sending block status
that doesn't match the server's advertised minimum block alignment,
let's do the same for read chunks. But since qemu 3.1 is such a
server (because it advertised 512-byte alignment, but when serving a
file that ends in data but is not sector-aligned, NBD_CMD_READ would
detect a mid-sector change between data and hole at EOF and the
resulting read chunks are unaligned), we don't want to change our
behavior of otherwise tolerating unaligned reads.
Note that even though we fixed the server for 4.0 to advertise an
actual block alignment (which gets rid of the unaligned reads at EOF
for posix files), we can still trigger it via other means:
$ qemu-nbd --image-opts driver=blkdebug,align=512,image.driver=file,image.filename=/path/to/non-aligned-file
Arguably, that is a bug in the blkdebug block status function, for
leaking a block status that is not aligned. It may also be possible to
observe issues with a backing layer with smaller alignment than the
active layer, although so far I have been unable to write a reliable
iotest for that scenario.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190330165349.32256-1-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2019-03-30 17:53:49 +01:00
|
|
|
ret = nbd_parse_offset_hole_payload(s, &reply.structured, payload,
|
2017-10-27 12:40:37 +02:00
|
|
|
offset, qiov, &local_err);
|
|
|
|
if (ret < 0) {
|
|
|
|
s->quit = true;
|
2019-02-01 14:01:33 +01:00
|
|
|
nbd_iter_channel_error(&iter, ret, &local_err);
|
2017-10-27 12:40:37 +02:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
if (!nbd_reply_type_is_error(chunk->type)) {
|
|
|
|
/* not allowed reply type */
|
|
|
|
s->quit = true;
|
|
|
|
error_setg(&local_err,
|
|
|
|
"Unexpected reply type: %d (%s) for CMD_READ",
|
|
|
|
chunk->type, nbd_reply_type_lookup(chunk->type));
|
2019-02-01 14:01:33 +01:00
|
|
|
nbd_iter_channel_error(&iter, -EINVAL, &local_err);
|
2017-10-27 12:40:37 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
g_free(payload);
|
|
|
|
payload = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
error_propagate(errp, iter.err);
|
2019-02-01 14:01:33 +01:00
|
|
|
*request_ret = iter.request_ret;
|
2017-10-27 12:40:37 +02:00
|
|
|
return iter.ret;
|
|
|
|
}
|
|
|
|
|
2018-03-12 16:21:23 +01:00
|
|
|
static int nbd_co_receive_blockstatus_reply(NBDClientSession *s,
|
|
|
|
uint64_t handle, uint64_t length,
|
2019-02-01 14:01:33 +01:00
|
|
|
NBDExtent *extent,
|
|
|
|
int *request_ret, Error **errp)
|
2018-03-12 16:21:23 +01:00
|
|
|
{
|
|
|
|
NBDReplyChunkIter iter;
|
|
|
|
NBDReply reply;
|
|
|
|
void *payload = NULL;
|
|
|
|
Error *local_err = NULL;
|
|
|
|
bool received = false;
|
|
|
|
|
|
|
|
assert(!extent->length);
|
nbd: Permit simple error to NBD_CMD_BLOCK_STATUS
The NBD spec is clear that when structured replies are active, a
simple error reply is acceptable to any command except for
NBD_CMD_READ. However, we were mistakenly requiring structured errors
for NBD_CMD_BLOCK_STATUS, and hanging up on a server that gave a
simple error (since qemu does not behave as such a server, we didn't
notice the problem until now). Broken since its introduction in
commit 78a33ab5 (v2.12).
Noticed while debugging a separate failure reported by nbdkit while
working out its initial implementation of BLOCK_STATUS, although it
turns out that nbdkit also chose to send structured error replies for
BLOCK_STATUS, so I had to manually provoke the situation by hacking
qemu's server to send a simple error reply:
| diff --git i/nbd/server.c w/nbd/server.c
| index fd013a2817a..833288d7c45 100644
| 00--- i/nbd/server.c
| +++ w/nbd/server.c
| @@ -2269,6 +2269,8 @@ static coroutine_fn int nbd_handle_request(NBDClient *client,
| "discard failed", errp);
|
| case NBD_CMD_BLOCK_STATUS:
| + return nbd_co_send_simple_reply(client, request->handle, ENOMEM,
| + NULL, 0, errp);
| if (!request->len) {
| return nbd_send_generic_reply(client, request->handle, -EINVAL,
| "need non-zero length", errp);
|
Signed-off-by: Eric Blake <eblake@redhat.com>
Acked-by: Richard W.M. Jones <rjones@redhat.com>
Message-Id: <20190325190104.30213-3-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2019-03-25 20:01:04 +01:00
|
|
|
NBD_FOREACH_REPLY_CHUNK(s, iter, handle, false, NULL, &reply, &payload) {
|
2018-03-12 16:21:23 +01:00
|
|
|
int ret;
|
|
|
|
NBDStructuredReplyChunk *chunk = &reply.structured;
|
|
|
|
|
|
|
|
assert(nbd_reply_is_structured(&reply));
|
|
|
|
|
|
|
|
switch (chunk->type) {
|
|
|
|
case NBD_REPLY_TYPE_BLOCK_STATUS:
|
|
|
|
if (received) {
|
|
|
|
s->quit = true;
|
|
|
|
error_setg(&local_err, "Several BLOCK_STATUS chunks in reply");
|
2019-02-01 14:01:33 +01:00
|
|
|
nbd_iter_channel_error(&iter, -EINVAL, &local_err);
|
2018-03-12 16:21:23 +01:00
|
|
|
}
|
|
|
|
received = true;
|
|
|
|
|
|
|
|
ret = nbd_parse_blockstatus_payload(s, &reply.structured,
|
|
|
|
payload, length, extent,
|
|
|
|
&local_err);
|
|
|
|
if (ret < 0) {
|
|
|
|
s->quit = true;
|
2019-02-01 14:01:33 +01:00
|
|
|
nbd_iter_channel_error(&iter, ret, &local_err);
|
2018-03-12 16:21:23 +01:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
if (!nbd_reply_type_is_error(chunk->type)) {
|
|
|
|
s->quit = true;
|
|
|
|
error_setg(&local_err,
|
|
|
|
"Unexpected reply type: %d (%s) "
|
|
|
|
"for CMD_BLOCK_STATUS",
|
|
|
|
chunk->type, nbd_reply_type_lookup(chunk->type));
|
2019-02-01 14:01:33 +01:00
|
|
|
nbd_iter_channel_error(&iter, -EINVAL, &local_err);
|
2018-03-12 16:21:23 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
g_free(payload);
|
|
|
|
payload = NULL;
|
|
|
|
}
|
|
|
|
|
nbd: Don't lose server's error to NBD_CMD_BLOCK_STATUS
When the server replies with a (structured [*]) error to
NBD_CMD_BLOCK_STATUS, without any extent information sent first, the
client code was blindly throwing away the server's error code and
instead telling the caller that EIO occurred. This has been broken
since its introduction in 78a33ab5 (v2.12, where we should have called:
error_setg(&local_err, "Server did not reply with any status extents");
nbd_iter_error(&iter, false, -EIO, &local_err);
to declare the situation as a non-fatal error if no earlier error had
already been flagged, rather than just blindly slamming iter.err and
iter.ret), although it is more noticeable since commit 7f86068d, which
actually tries hard to preserve the server's code thanks to a separate
iter.request_ret.
[*] The spec is clear that the server is also permitted to reply with
a simple error, but that's a separate fix.
I was able to provoke this scenario with a hack to the server, then
seeing whether ENOMEM makes it back to the caller:
| diff --git a/nbd/server.c b/nbd/server.c
| index fd013a2817a..29c7995de02 100644
| --- a/nbd/server.c
| +++ b/nbd/server.c
| @@ -2269,6 +2269,8 @@ static coroutine_fn int nbd_handle_request(NBDClient *client,
| "discard failed", errp);
|
| case NBD_CMD_BLOCK_STATUS:
| + return nbd_send_generic_reply(client, request->handle, -ENOMEM,
| + "no status for you today", errp);
| if (!request->len) {
| return nbd_send_generic_reply(client, request->handle, -EINVAL,
| "need non-zero length", errp);
| --
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190325190104.30213-2-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2019-03-25 20:01:03 +01:00
|
|
|
if (!extent->length && !iter.request_ret) {
|
|
|
|
error_setg(&local_err, "Server did not reply with any status extents");
|
|
|
|
nbd_iter_channel_error(&iter, -EIO, &local_err);
|
2018-03-12 16:21:23 +01:00
|
|
|
}
|
2019-02-01 14:01:33 +01:00
|
|
|
|
2018-03-12 16:21:23 +01:00
|
|
|
error_propagate(errp, iter.err);
|
2019-02-01 14:01:33 +01:00
|
|
|
*request_ret = iter.request_ret;
|
2018-03-12 16:21:23 +01:00
|
|
|
return iter.ret;
|
|
|
|
}
|
|
|
|
|
2017-10-27 12:40:37 +02:00
|
|
|
static int nbd_co_request(BlockDriverState *bs, NBDRequest *request,
|
|
|
|
QEMUIOVector *write_qiov)
|
2017-08-29 23:48:31 +02:00
|
|
|
{
|
2019-02-01 14:01:33 +01:00
|
|
|
int ret, request_ret;
|
2017-10-27 12:40:37 +02:00
|
|
|
Error *local_err = NULL;
|
|
|
|
NBDClientSession *client = nbd_get_client_session(bs);
|
2017-08-29 23:48:31 +02:00
|
|
|
|
2017-10-27 12:40:37 +02:00
|
|
|
assert(request->type != NBD_CMD_READ);
|
|
|
|
if (write_qiov) {
|
|
|
|
assert(request->type == NBD_CMD_WRITE);
|
|
|
|
assert(request->len == iov_size(write_qiov->iov, write_qiov->niov));
|
2017-10-12 11:53:07 +02:00
|
|
|
} else {
|
2017-10-27 12:40:37 +02:00
|
|
|
assert(request->type != NBD_CMD_WRITE);
|
2017-10-12 11:53:07 +02:00
|
|
|
}
|
2017-10-27 12:40:37 +02:00
|
|
|
ret = nbd_co_send_request(bs, request, write_qiov);
|
2017-08-29 23:48:31 +02:00
|
|
|
if (ret < 0) {
|
2017-09-20 14:45:05 +02:00
|
|
|
return ret;
|
2017-08-29 23:48:31 +02:00
|
|
|
}
|
2017-09-20 14:45:05 +02:00
|
|
|
|
2019-02-01 14:01:33 +01:00
|
|
|
ret = nbd_co_receive_return_code(client, request->handle,
|
|
|
|
&request_ret, &local_err);
|
2017-10-27 12:40:37 +02:00
|
|
|
if (local_err) {
|
block/nbd-client: use traces instead of noisy error_report_err
Reduce extra noise of nbd-client, change 083 correspondingly.
In various commits (be41c100 in 2.10, f140e300 in 2.11, 78a33ab
in 2.12), we added spots where qemu as an NBD client would report
problems communicating with the server to stderr, because there
was no where else to send the error to. However, this is racy,
particularly since the most common source of these errors is when
either the client or the server abruptly hangs up, leaving one
coroutine to report the error only if it wins (or loses) the
race in attempting the read from the server before another
thread completes its cleanup of a protocol error that caused the
disconnect in the first place. The race is also apparent in the
fact that differences in the flush behavior of the server can
alter the frequency of encountering the race in the client (see
commit 6d39db96).
Rather than polluting stderr, it's better to just trace these
situations, for use by developers debugging a flaky connection,
particularly since the real error that either triggers the abrupt
disconnection in the first place, or that results from the EIO
when a request can't receive a reply, DOES make it back to the
user in the normal Error propagation channels.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20181102151152.288399-4-vsementsov@virtuozzo.com>
[eblake: drop depedence on error hint, enhance commit message]
Signed-off-by: Eric Blake <eblake@redhat.com>
2018-11-02 16:11:52 +01:00
|
|
|
trace_nbd_co_request_fail(request->from, request->len, request->handle,
|
|
|
|
request->flags, request->type,
|
|
|
|
nbd_cmd_lookup(request->type),
|
|
|
|
ret, error_get_pretty(local_err));
|
|
|
|
error_free(local_err);
|
2017-10-27 12:40:37 +02:00
|
|
|
}
|
2019-02-01 14:01:33 +01:00
|
|
|
return ret ? ret : request_ret;
|
2017-08-29 23:48:31 +02:00
|
|
|
}
|
|
|
|
|
2016-07-16 01:23:07 +02:00
|
|
|
int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
|
|
|
|
uint64_t bytes, QEMUIOVector *qiov, int flags)
|
2013-12-01 22:23:41 +01:00
|
|
|
{
|
2019-02-01 14:01:33 +01:00
|
|
|
int ret, request_ret;
|
2017-10-27 12:40:37 +02:00
|
|
|
Error *local_err = NULL;
|
|
|
|
NBDClientSession *client = nbd_get_client_session(bs);
|
2016-10-14 20:33:07 +02:00
|
|
|
NBDRequest request = {
|
2016-07-16 01:23:07 +02:00
|
|
|
.type = NBD_CMD_READ,
|
|
|
|
.from = offset,
|
|
|
|
.len = bytes,
|
|
|
|
};
|
2013-12-01 22:23:41 +01:00
|
|
|
|
2016-07-16 01:23:07 +02:00
|
|
|
assert(bytes <= NBD_MAX_BUFFER_SIZE);
|
|
|
|
assert(!flags);
|
2013-12-01 22:23:41 +01:00
|
|
|
|
2017-11-08 22:57:01 +01:00
|
|
|
if (!bytes) {
|
|
|
|
return 0;
|
|
|
|
}
|
nbd/client: Support qemu-img convert from unaligned size
If an NBD server advertises a size that is not a multiple of a sector,
the block layer rounds up that size, even though we set info.size to
the exact byte value sent by the server. The block layer then proceeds
to let us read or query block status on the hole that it added past
EOF, which the NBD server is unlikely to be happy with. Fortunately,
qemu as a server never advertizes an unaligned size, so we generally
don't run into this problem; but the nbdkit server makes it easy to
test:
$ printf %1000d 1 > f1
$ ~/nbdkit/nbdkit -fv file f1 & pid=$!
$ qemu-img convert -f raw nbd://localhost:10809 f2
$ kill $pid
$ qemu-img compare f1 f2
Pre-patch, the server attempts a 1024-byte read, which nbdkit
rightfully rejects as going beyond its advertised 1000 byte size; the
conversion fails and the output files differ (not even the first
sector is copied, because qemu-img does not follow ddrescue's habit of
trying smaller reads to get as much information as possible in spite
of errors). Post-patch, the client's attempts to read (and query block
status, for new enough nbdkit) are properly truncated to the server's
length, with sane handling of the hole the block layer forced on
us. Although f2 ends up as a larger file (1024 bytes instead of 1000),
qemu-img compare shows the two images to have identical contents for
display to the guest.
I didn't add iotests coverage since I didn't want to add a dependency
on nbdkit in iotests. I also did NOT patch write, trim, or write
zeroes - these commands continue to fail (usually with ENOSPC, but
whatever the server chose), because we really can't write to the end
of the file, and because 'qemu-img convert' is the most common case
where we care about being tolerant (which is read-only). Perhaps we
could truncate the request if the client is writing zeros to the tail,
but that seems like more work, especially if the block layer is fixed
in 4.1 to track byte-accurate sizing (in which case this patch would
be reverted as unnecessary).
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190329042750.14704-5-eblake@redhat.com>
Tested-by: Richard W.M. Jones <rjones@redhat.com>
2019-03-29 05:27:48 +01:00
|
|
|
/*
|
|
|
|
* Work around the fact that the block layer doesn't do
|
|
|
|
* byte-accurate sizing yet - if the read exceeds the server's
|
|
|
|
* advertised size because the block layer rounded size up, then
|
|
|
|
* truncate the request to the server and tail-pad with zero.
|
|
|
|
*/
|
|
|
|
if (offset >= client->info.size) {
|
|
|
|
assert(bytes < BDRV_SECTOR_SIZE);
|
|
|
|
qemu_iovec_memset(qiov, 0, 0, bytes);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
if (offset + bytes > client->info.size) {
|
|
|
|
uint64_t slop = offset + bytes - client->info.size;
|
|
|
|
|
|
|
|
assert(slop < BDRV_SECTOR_SIZE);
|
|
|
|
qemu_iovec_memset(qiov, bytes - slop, 0, slop);
|
|
|
|
request.len -= slop;
|
|
|
|
}
|
|
|
|
|
2017-10-27 12:40:37 +02:00
|
|
|
ret = nbd_co_send_request(bs, &request, NULL);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = nbd_co_receive_cmdread_reply(client, request.handle, offset, qiov,
|
2019-02-01 14:01:33 +01:00
|
|
|
&request_ret, &local_err);
|
2017-11-12 02:39:36 +01:00
|
|
|
if (local_err) {
|
block/nbd-client: use traces instead of noisy error_report_err
Reduce extra noise of nbd-client, change 083 correspondingly.
In various commits (be41c100 in 2.10, f140e300 in 2.11, 78a33ab
in 2.12), we added spots where qemu as an NBD client would report
problems communicating with the server to stderr, because there
was no where else to send the error to. However, this is racy,
particularly since the most common source of these errors is when
either the client or the server abruptly hangs up, leaving one
coroutine to report the error only if it wins (or loses) the
race in attempting the read from the server before another
thread completes its cleanup of a protocol error that caused the
disconnect in the first place. The race is also apparent in the
fact that differences in the flush behavior of the server can
alter the frequency of encountering the race in the client (see
commit 6d39db96).
Rather than polluting stderr, it's better to just trace these
situations, for use by developers debugging a flaky connection,
particularly since the real error that either triggers the abrupt
disconnection in the first place, or that results from the EIO
when a request can't receive a reply, DOES make it back to the
user in the normal Error propagation channels.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20181102151152.288399-4-vsementsov@virtuozzo.com>
[eblake: drop depedence on error hint, enhance commit message]
Signed-off-by: Eric Blake <eblake@redhat.com>
2018-11-02 16:11:52 +01:00
|
|
|
trace_nbd_co_request_fail(request.from, request.len, request.handle,
|
|
|
|
request.flags, request.type,
|
|
|
|
nbd_cmd_lookup(request.type),
|
|
|
|
ret, error_get_pretty(local_err));
|
|
|
|
error_free(local_err);
|
2017-10-27 12:40:37 +02:00
|
|
|
}
|
2019-02-01 14:01:33 +01:00
|
|
|
return ret ? ret : request_ret;
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
|
|
|
|
2016-07-16 01:23:07 +02:00
|
|
|
int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
|
|
|
|
uint64_t bytes, QEMUIOVector *qiov, int flags)
|
2013-12-01 22:23:41 +01:00
|
|
|
{
|
2016-10-14 20:33:06 +02:00
|
|
|
NBDClientSession *client = nbd_get_client_session(bs);
|
2016-10-14 20:33:07 +02:00
|
|
|
NBDRequest request = {
|
2016-07-16 01:23:07 +02:00
|
|
|
.type = NBD_CMD_WRITE,
|
|
|
|
.from = offset,
|
|
|
|
.len = bytes,
|
|
|
|
};
|
2013-12-01 22:23:41 +01:00
|
|
|
|
nbd-client: Refuse read-only client with BDRV_O_RDWR
The NBD spec says that clients should not try to write/trim to
an export advertised as read-only by the server. But we failed
to check that, and would allow the block layer to use NBD with
BDRV_O_RDWR even when the server is read-only, which meant we
were depending on the server sending a proper EPERM failure for
various commands, and also exposes a leaky abstraction: using
qemu-io in read-write mode would succeed on 'w -z 0 0' because
of local short-circuiting logic, but 'w 0 0' would send a
request over the wire (where it then depends on the server, and
fails at least for qemu-nbd but might pass for other NBD
implementations).
With this patch, a client MUST request read-only mode to access
a server that is doing a read-only export, or else it will get
a message like:
can't open device nbd://localhost:10809/foo: request for write access conflicts with read-only export
It is no longer possible to even attempt writes over the wire
(including the corner case of 0-length writes), because the block
layer enforces the explicit read-only request; this matches the
behavior of qcow2 when backed by a read-only POSIX file.
Fix several iotests to comply with the new behavior (since
qemu-nbd of an internal snapshot, as well as nbd-server-add over QMP,
default to a read-only export, we must tell blockdev-add/qemu-io to
set up a read-only client).
CC: qemu-stable@nongnu.org
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20171108215703.9295-3-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2017-11-08 22:56:58 +01:00
|
|
|
assert(!(client->info.flags & NBD_FLAG_READ_ONLY));
|
2016-05-04 00:39:08 +02:00
|
|
|
if (flags & BDRV_REQ_FUA) {
|
2017-07-07 22:30:41 +02:00
|
|
|
assert(client->info.flags & NBD_FLAG_SEND_FUA);
|
2016-10-14 20:33:04 +02:00
|
|
|
request.flags |= NBD_CMD_FLAG_FUA;
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
|
|
|
|
2016-07-16 01:23:07 +02:00
|
|
|
assert(bytes <= NBD_MAX_BUFFER_SIZE);
|
2013-12-01 22:23:41 +01:00
|
|
|
|
2017-11-08 22:57:01 +01:00
|
|
|
if (!bytes) {
|
|
|
|
return 0;
|
|
|
|
}
|
2017-08-29 23:48:31 +02:00
|
|
|
return nbd_co_request(bs, &request, qiov);
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
|
|
|
|
2016-10-14 20:33:18 +02:00
|
|
|
int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
|
2017-06-09 12:18:08 +02:00
|
|
|
int bytes, BdrvRequestFlags flags)
|
2016-10-14 20:33:18 +02:00
|
|
|
{
|
|
|
|
NBDClientSession *client = nbd_get_client_session(bs);
|
|
|
|
NBDRequest request = {
|
|
|
|
.type = NBD_CMD_WRITE_ZEROES,
|
|
|
|
.from = offset,
|
2017-06-09 12:18:08 +02:00
|
|
|
.len = bytes,
|
2016-10-14 20:33:18 +02:00
|
|
|
};
|
|
|
|
|
nbd-client: Refuse read-only client with BDRV_O_RDWR
The NBD spec says that clients should not try to write/trim to
an export advertised as read-only by the server. But we failed
to check that, and would allow the block layer to use NBD with
BDRV_O_RDWR even when the server is read-only, which meant we
were depending on the server sending a proper EPERM failure for
various commands, and also exposes a leaky abstraction: using
qemu-io in read-write mode would succeed on 'w -z 0 0' because
of local short-circuiting logic, but 'w 0 0' would send a
request over the wire (where it then depends on the server, and
fails at least for qemu-nbd but might pass for other NBD
implementations).
With this patch, a client MUST request read-only mode to access
a server that is doing a read-only export, or else it will get
a message like:
can't open device nbd://localhost:10809/foo: request for write access conflicts with read-only export
It is no longer possible to even attempt writes over the wire
(including the corner case of 0-length writes), because the block
layer enforces the explicit read-only request; this matches the
behavior of qcow2 when backed by a read-only POSIX file.
Fix several iotests to comply with the new behavior (since
qemu-nbd of an internal snapshot, as well as nbd-server-add over QMP,
default to a read-only export, we must tell blockdev-add/qemu-io to
set up a read-only client).
CC: qemu-stable@nongnu.org
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20171108215703.9295-3-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2017-11-08 22:56:58 +01:00
|
|
|
assert(!(client->info.flags & NBD_FLAG_READ_ONLY));
|
2017-07-07 22:30:41 +02:00
|
|
|
if (!(client->info.flags & NBD_FLAG_SEND_WRITE_ZEROES)) {
|
2016-10-14 20:33:18 +02:00
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (flags & BDRV_REQ_FUA) {
|
2017-07-07 22:30:41 +02:00
|
|
|
assert(client->info.flags & NBD_FLAG_SEND_FUA);
|
2016-10-14 20:33:18 +02:00
|
|
|
request.flags |= NBD_CMD_FLAG_FUA;
|
|
|
|
}
|
|
|
|
if (!(flags & BDRV_REQ_MAY_UNMAP)) {
|
|
|
|
request.flags |= NBD_CMD_FLAG_NO_HOLE;
|
|
|
|
}
|
|
|
|
|
2017-11-08 22:57:01 +01:00
|
|
|
if (!bytes) {
|
|
|
|
return 0;
|
|
|
|
}
|
2017-08-29 23:48:31 +02:00
|
|
|
return nbd_co_request(bs, &request, NULL);
|
2016-10-14 20:33:18 +02:00
|
|
|
}
|
|
|
|
|
2015-02-06 22:06:16 +01:00
|
|
|
int nbd_client_co_flush(BlockDriverState *bs)
|
2013-12-01 22:23:41 +01:00
|
|
|
{
|
2016-10-14 20:33:06 +02:00
|
|
|
NBDClientSession *client = nbd_get_client_session(bs);
|
2016-10-14 20:33:07 +02:00
|
|
|
NBDRequest request = { .type = NBD_CMD_FLUSH };
|
2013-12-01 22:23:41 +01:00
|
|
|
|
2017-07-07 22:30:41 +02:00
|
|
|
if (!(client->info.flags & NBD_FLAG_SEND_FLUSH)) {
|
2013-12-01 22:23:41 +01:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
request.from = 0;
|
|
|
|
request.len = 0;
|
|
|
|
|
2017-08-29 23:48:31 +02:00
|
|
|
return nbd_co_request(bs, &request, NULL);
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
|
|
|
|
2017-06-09 12:18:08 +02:00
|
|
|
int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
|
2013-12-01 22:23:41 +01:00
|
|
|
{
|
2016-10-14 20:33:06 +02:00
|
|
|
NBDClientSession *client = nbd_get_client_session(bs);
|
2016-10-14 20:33:07 +02:00
|
|
|
NBDRequest request = {
|
2016-07-16 01:23:02 +02:00
|
|
|
.type = NBD_CMD_TRIM,
|
|
|
|
.from = offset,
|
2017-06-09 12:18:08 +02:00
|
|
|
.len = bytes,
|
2016-07-16 01:23:02 +02:00
|
|
|
};
|
2013-12-01 22:23:41 +01:00
|
|
|
|
nbd-client: Refuse read-only client with BDRV_O_RDWR
The NBD spec says that clients should not try to write/trim to
an export advertised as read-only by the server. But we failed
to check that, and would allow the block layer to use NBD with
BDRV_O_RDWR even when the server is read-only, which meant we
were depending on the server sending a proper EPERM failure for
various commands, and also exposes a leaky abstraction: using
qemu-io in read-write mode would succeed on 'w -z 0 0' because
of local short-circuiting logic, but 'w 0 0' would send a
request over the wire (where it then depends on the server, and
fails at least for qemu-nbd but might pass for other NBD
implementations).
With this patch, a client MUST request read-only mode to access
a server that is doing a read-only export, or else it will get
a message like:
can't open device nbd://localhost:10809/foo: request for write access conflicts with read-only export
It is no longer possible to even attempt writes over the wire
(including the corner case of 0-length writes), because the block
layer enforces the explicit read-only request; this matches the
behavior of qcow2 when backed by a read-only POSIX file.
Fix several iotests to comply with the new behavior (since
qemu-nbd of an internal snapshot, as well as nbd-server-add over QMP,
default to a read-only export, we must tell blockdev-add/qemu-io to
set up a read-only client).
CC: qemu-stable@nongnu.org
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20171108215703.9295-3-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2017-11-08 22:56:58 +01:00
|
|
|
assert(!(client->info.flags & NBD_FLAG_READ_ONLY));
|
2017-11-08 22:57:01 +01:00
|
|
|
if (!(client->info.flags & NBD_FLAG_SEND_TRIM) || !bytes) {
|
2013-12-01 22:23:41 +01:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-08-29 23:48:31 +02:00
|
|
|
return nbd_co_request(bs, &request, NULL);
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
|
|
|
|
2018-03-12 16:21:23 +01:00
|
|
|
int coroutine_fn nbd_client_co_block_status(BlockDriverState *bs,
|
|
|
|
bool want_zero,
|
|
|
|
int64_t offset, int64_t bytes,
|
|
|
|
int64_t *pnum, int64_t *map,
|
|
|
|
BlockDriverState **file)
|
|
|
|
{
|
2019-02-01 14:01:33 +01:00
|
|
|
int ret, request_ret;
|
2018-03-12 16:21:23 +01:00
|
|
|
NBDExtent extent = { 0 };
|
|
|
|
NBDClientSession *client = nbd_get_client_session(bs);
|
|
|
|
Error *local_err = NULL;
|
|
|
|
|
|
|
|
NBDRequest request = {
|
|
|
|
.type = NBD_CMD_BLOCK_STATUS,
|
|
|
|
.from = offset,
|
|
|
|
.len = MIN(MIN_NON_ZERO(QEMU_ALIGN_DOWN(INT_MAX,
|
|
|
|
bs->bl.request_alignment),
|
nbd/client: Support qemu-img convert from unaligned size
If an NBD server advertises a size that is not a multiple of a sector,
the block layer rounds up that size, even though we set info.size to
the exact byte value sent by the server. The block layer then proceeds
to let us read or query block status on the hole that it added past
EOF, which the NBD server is unlikely to be happy with. Fortunately,
qemu as a server never advertizes an unaligned size, so we generally
don't run into this problem; but the nbdkit server makes it easy to
test:
$ printf %1000d 1 > f1
$ ~/nbdkit/nbdkit -fv file f1 & pid=$!
$ qemu-img convert -f raw nbd://localhost:10809 f2
$ kill $pid
$ qemu-img compare f1 f2
Pre-patch, the server attempts a 1024-byte read, which nbdkit
rightfully rejects as going beyond its advertised 1000 byte size; the
conversion fails and the output files differ (not even the first
sector is copied, because qemu-img does not follow ddrescue's habit of
trying smaller reads to get as much information as possible in spite
of errors). Post-patch, the client's attempts to read (and query block
status, for new enough nbdkit) are properly truncated to the server's
length, with sane handling of the hole the block layer forced on
us. Although f2 ends up as a larger file (1024 bytes instead of 1000),
qemu-img compare shows the two images to have identical contents for
display to the guest.
I didn't add iotests coverage since I didn't want to add a dependency
on nbdkit in iotests. I also did NOT patch write, trim, or write
zeroes - these commands continue to fail (usually with ENOSPC, but
whatever the server chose), because we really can't write to the end
of the file, and because 'qemu-img convert' is the most common case
where we care about being tolerant (which is read-only). Perhaps we
could truncate the request if the client is writing zeros to the tail,
but that seems like more work, especially if the block layer is fixed
in 4.1 to track byte-accurate sizing (in which case this patch would
be reverted as unnecessary).
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190329042750.14704-5-eblake@redhat.com>
Tested-by: Richard W.M. Jones <rjones@redhat.com>
2019-03-29 05:27:48 +01:00
|
|
|
client->info.max_block),
|
|
|
|
MIN(bytes, client->info.size - offset)),
|
2018-03-12 16:21:23 +01:00
|
|
|
.flags = NBD_CMD_FLAG_REQ_ONE,
|
|
|
|
};
|
|
|
|
|
|
|
|
if (!client->info.base_allocation) {
|
|
|
|
*pnum = bytes;
|
2019-03-31 03:28:37 +02:00
|
|
|
*map = offset;
|
|
|
|
*file = bs;
|
|
|
|
return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
|
2018-03-12 16:21:23 +01:00
|
|
|
}
|
|
|
|
|
nbd/client: Support qemu-img convert from unaligned size
If an NBD server advertises a size that is not a multiple of a sector,
the block layer rounds up that size, even though we set info.size to
the exact byte value sent by the server. The block layer then proceeds
to let us read or query block status on the hole that it added past
EOF, which the NBD server is unlikely to be happy with. Fortunately,
qemu as a server never advertizes an unaligned size, so we generally
don't run into this problem; but the nbdkit server makes it easy to
test:
$ printf %1000d 1 > f1
$ ~/nbdkit/nbdkit -fv file f1 & pid=$!
$ qemu-img convert -f raw nbd://localhost:10809 f2
$ kill $pid
$ qemu-img compare f1 f2
Pre-patch, the server attempts a 1024-byte read, which nbdkit
rightfully rejects as going beyond its advertised 1000 byte size; the
conversion fails and the output files differ (not even the first
sector is copied, because qemu-img does not follow ddrescue's habit of
trying smaller reads to get as much information as possible in spite
of errors). Post-patch, the client's attempts to read (and query block
status, for new enough nbdkit) are properly truncated to the server's
length, with sane handling of the hole the block layer forced on
us. Although f2 ends up as a larger file (1024 bytes instead of 1000),
qemu-img compare shows the two images to have identical contents for
display to the guest.
I didn't add iotests coverage since I didn't want to add a dependency
on nbdkit in iotests. I also did NOT patch write, trim, or write
zeroes - these commands continue to fail (usually with ENOSPC, but
whatever the server chose), because we really can't write to the end
of the file, and because 'qemu-img convert' is the most common case
where we care about being tolerant (which is read-only). Perhaps we
could truncate the request if the client is writing zeros to the tail,
but that seems like more work, especially if the block layer is fixed
in 4.1 to track byte-accurate sizing (in which case this patch would
be reverted as unnecessary).
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190329042750.14704-5-eblake@redhat.com>
Tested-by: Richard W.M. Jones <rjones@redhat.com>
2019-03-29 05:27:48 +01:00
|
|
|
/*
|
|
|
|
* Work around the fact that the block layer doesn't do
|
|
|
|
* byte-accurate sizing yet - if the status request exceeds the
|
|
|
|
* server's advertised size because the block layer rounded size
|
|
|
|
* up, we truncated the request to the server (above), or are
|
|
|
|
* called on just the hole.
|
|
|
|
*/
|
|
|
|
if (offset >= client->info.size) {
|
|
|
|
*pnum = bytes;
|
|
|
|
assert(bytes < BDRV_SECTOR_SIZE);
|
|
|
|
/* Intentionally don't report offset_valid for the hole */
|
|
|
|
return BDRV_BLOCK_ZERO;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (client->info.min_block) {
|
|
|
|
assert(QEMU_IS_ALIGNED(request.len, client->info.min_block));
|
|
|
|
}
|
2018-03-12 16:21:23 +01:00
|
|
|
ret = nbd_co_send_request(bs, &request, NULL);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = nbd_co_receive_blockstatus_reply(client, request.handle, bytes,
|
2019-02-01 14:01:33 +01:00
|
|
|
&extent, &request_ret, &local_err);
|
2018-03-12 16:21:23 +01:00
|
|
|
if (local_err) {
|
block/nbd-client: use traces instead of noisy error_report_err
Reduce extra noise of nbd-client, change 083 correspondingly.
In various commits (be41c100 in 2.10, f140e300 in 2.11, 78a33ab
in 2.12), we added spots where qemu as an NBD client would report
problems communicating with the server to stderr, because there
was no where else to send the error to. However, this is racy,
particularly since the most common source of these errors is when
either the client or the server abruptly hangs up, leaving one
coroutine to report the error only if it wins (or loses) the
race in attempting the read from the server before another
thread completes its cleanup of a protocol error that caused the
disconnect in the first place. The race is also apparent in the
fact that differences in the flush behavior of the server can
alter the frequency of encountering the race in the client (see
commit 6d39db96).
Rather than polluting stderr, it's better to just trace these
situations, for use by developers debugging a flaky connection,
particularly since the real error that either triggers the abrupt
disconnection in the first place, or that results from the EIO
when a request can't receive a reply, DOES make it back to the
user in the normal Error propagation channels.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20181102151152.288399-4-vsementsov@virtuozzo.com>
[eblake: drop depedence on error hint, enhance commit message]
Signed-off-by: Eric Blake <eblake@redhat.com>
2018-11-02 16:11:52 +01:00
|
|
|
trace_nbd_co_request_fail(request.from, request.len, request.handle,
|
|
|
|
request.flags, request.type,
|
|
|
|
nbd_cmd_lookup(request.type),
|
|
|
|
ret, error_get_pretty(local_err));
|
|
|
|
error_free(local_err);
|
2018-03-12 16:21:23 +01:00
|
|
|
}
|
2019-02-01 14:01:33 +01:00
|
|
|
if (ret < 0 || request_ret < 0) {
|
|
|
|
return ret ? ret : request_ret;
|
2018-03-12 16:21:23 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
assert(extent.length);
|
|
|
|
*pnum = extent.length;
|
2019-03-31 03:28:37 +02:00
|
|
|
*map = offset;
|
|
|
|
*file = bs;
|
2018-03-12 16:21:23 +01:00
|
|
|
return (extent.flags & NBD_STATE_HOLE ? 0 : BDRV_BLOCK_DATA) |
|
2019-03-31 03:28:37 +02:00
|
|
|
(extent.flags & NBD_STATE_ZERO ? BDRV_BLOCK_ZERO : 0) |
|
|
|
|
BDRV_BLOCK_OFFSET_VALID;
|
2018-03-12 16:21:23 +01:00
|
|
|
}
|
|
|
|
|
2015-02-06 22:06:16 +01:00
|
|
|
void nbd_client_detach_aio_context(BlockDriverState *bs)
|
2014-05-08 16:34:43 +02:00
|
|
|
{
|
2017-02-13 14:52:24 +01:00
|
|
|
NBDClientSession *client = nbd_get_client_session(bs);
|
2017-06-15 19:09:05 +02:00
|
|
|
qio_channel_detach_aio_context(QIO_CHANNEL(client->ioc));
|
2014-05-08 16:34:43 +02:00
|
|
|
}
|
|
|
|
|
2019-02-18 15:33:08 +01:00
|
|
|
static void nbd_client_attach_aio_context_bh(void *opaque)
|
|
|
|
{
|
|
|
|
BlockDriverState *bs = opaque;
|
|
|
|
NBDClientSession *client = nbd_get_client_session(bs);
|
|
|
|
|
|
|
|
/* The node is still drained, so we know the coroutine has yielded in
|
|
|
|
* nbd_read_eof(), the only place where bs->in_flight can reach 0, or it is
|
|
|
|
* entered for the first time. Both places are safe for entering the
|
|
|
|
* coroutine.*/
|
|
|
|
qemu_aio_coroutine_enter(bs->aio_context, client->connection_co);
|
|
|
|
bdrv_dec_in_flight(bs);
|
|
|
|
}
|
|
|
|
|
2015-02-06 22:06:16 +01:00
|
|
|
void nbd_client_attach_aio_context(BlockDriverState *bs,
|
|
|
|
AioContext *new_context)
|
2014-05-08 16:34:43 +02:00
|
|
|
{
|
2017-02-13 14:52:24 +01:00
|
|
|
NBDClientSession *client = nbd_get_client_session(bs);
|
2017-06-15 19:09:05 +02:00
|
|
|
qio_channel_attach_aio_context(QIO_CHANNEL(client->ioc), new_context);
|
2019-02-15 16:53:51 +01:00
|
|
|
|
2019-02-18 15:33:08 +01:00
|
|
|
bdrv_inc_in_flight(bs);
|
|
|
|
|
|
|
|
/* Need to wait here for the BH to run because the BH must run while the
|
|
|
|
* node is still drained. */
|
|
|
|
aio_wait_bh_oneshot(new_context, nbd_client_attach_aio_context_bh, bs);
|
2014-05-08 16:34:43 +02:00
|
|
|
}
|
|
|
|
|
2015-02-06 22:06:16 +01:00
|
|
|
void nbd_client_close(BlockDriverState *bs)
|
2013-12-01 22:23:41 +01:00
|
|
|
{
|
2016-10-14 20:33:06 +02:00
|
|
|
NBDClientSession *client = nbd_get_client_session(bs);
|
2016-10-14 20:33:07 +02:00
|
|
|
NBDRequest request = { .type = NBD_CMD_DISC };
|
2013-12-01 22:23:41 +01:00
|
|
|
|
2019-02-01 14:01:37 +01:00
|
|
|
assert(client->ioc);
|
2014-02-26 15:30:18 +01:00
|
|
|
|
2016-02-10 19:41:04 +01:00
|
|
|
nbd_send_request(client->ioc, &request);
|
2013-12-01 22:23:44 +01:00
|
|
|
|
2015-02-06 22:06:16 +01:00
|
|
|
nbd_teardown_connection(bs);
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
|
|
|
|
2019-02-01 14:01:34 +01:00
|
|
|
static QIOChannelSocket *nbd_establish_connection(SocketAddress *saddr,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
QIOChannelSocket *sioc;
|
|
|
|
Error *local_err = NULL;
|
|
|
|
|
|
|
|
sioc = qio_channel_socket_new();
|
|
|
|
qio_channel_set_name(QIO_CHANNEL(sioc), "nbd-client");
|
|
|
|
|
|
|
|
qio_channel_socket_connect_sync(sioc, saddr, &local_err);
|
|
|
|
if (local_err) {
|
|
|
|
object_unref(OBJECT(sioc));
|
|
|
|
error_propagate(errp, local_err);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
qio_channel_set_delay(QIO_CHANNEL(sioc), false);
|
|
|
|
|
|
|
|
return sioc;
|
|
|
|
}
|
|
|
|
|
2019-02-01 14:01:35 +01:00
|
|
|
static int nbd_client_connect(BlockDriverState *bs,
|
|
|
|
SocketAddress *saddr,
|
|
|
|
const char *export,
|
|
|
|
QCryptoTLSCreds *tlscreds,
|
|
|
|
const char *hostname,
|
|
|
|
const char *x_dirty_bitmap,
|
|
|
|
Error **errp)
|
2013-12-01 22:23:41 +01:00
|
|
|
{
|
2016-10-14 20:33:06 +02:00
|
|
|
NBDClientSession *client = nbd_get_client_session(bs);
|
2013-12-01 22:23:41 +01:00
|
|
|
int ret;
|
|
|
|
|
2019-02-01 14:01:34 +01:00
|
|
|
/*
|
|
|
|
* establish TCP connection, return error if it fails
|
|
|
|
* TODO: Configurable retry-until-timeout behaviour.
|
|
|
|
*/
|
|
|
|
QIOChannelSocket *sioc = nbd_establish_connection(saddr, errp);
|
|
|
|
|
|
|
|
if (!sioc) {
|
|
|
|
return -ECONNREFUSED;
|
|
|
|
}
|
|
|
|
|
2013-12-01 22:23:41 +01:00
|
|
|
/* NBD handshake */
|
2013-12-01 22:23:43 +01:00
|
|
|
logout("session init %s\n", export);
|
2016-02-10 19:41:01 +01:00
|
|
|
qio_channel_set_blocking(QIO_CHANNEL(sioc), true, NULL);
|
|
|
|
|
nbd: Implement NBD_INFO_BLOCK_SIZE on client
The upstream NBD Protocol has defined a new extension to allow
the server to advertise block sizes to the client, as well as
a way for the client to inform the server whether it intends to
obey block sizes.
When using the block layer as the client, we will obey block
sizes; but when used as 'qemu-nbd -c' to hand off to the
kernel nbd module as the client, we are still waiting for the
kernel to implement a way for us to learn if it will honor
block sizes (perhaps by an addition to sysfs, rather than an
ioctl), as well as any way to tell the kernel what additional
block sizes to obey (NBD_SET_BLKSIZE appears to be accurate
for the minimum size, but preferred and maximum sizes would
probably be new ioctl()s), so until then, we need to make our
request for block sizes conditional.
When using ioctl(NBD_SET_BLKSIZE) to hand off to the kernel,
use the minimum block size as the sector size if it is larger
than 512, which also has the nice effect of cooperating with
(non-qemu) servers that don't do read-modify-write when
exposing a block device with 4k sectors; it might also allow
us to visit a file larger than 2T on a 32-bit kernel.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170707203049.534-10-eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-07-07 22:30:49 +02:00
|
|
|
client->info.request_sizes = true;
|
2017-10-27 12:40:37 +02:00
|
|
|
client->info.structured_reply = true;
|
2018-03-12 16:21:23 +01:00
|
|
|
client->info.base_allocation = true;
|
2018-07-02 21:14:57 +02:00
|
|
|
client->info.x_dirty_bitmap = g_strdup(x_dirty_bitmap);
|
2019-01-17 20:36:46 +01:00
|
|
|
client->info.name = g_strdup(export ?: "");
|
|
|
|
ret = nbd_receive_negotiate(QIO_CHANNEL(sioc), tlscreds, hostname,
|
2017-07-07 22:30:41 +02:00
|
|
|
&client->ioc, &client->info, errp);
|
2018-07-02 21:14:57 +02:00
|
|
|
g_free(client->info.x_dirty_bitmap);
|
2019-01-17 20:36:46 +01:00
|
|
|
g_free(client->info.name);
|
2013-12-01 22:23:41 +01:00
|
|
|
if (ret < 0) {
|
|
|
|
logout("Failed to negotiate with the NBD server\n");
|
2019-02-01 14:01:34 +01:00
|
|
|
object_unref(OBJECT(sioc));
|
2013-12-01 22:23:41 +01:00
|
|
|
return ret;
|
|
|
|
}
|
2018-11-30 03:32:31 +01:00
|
|
|
if (x_dirty_bitmap && !client->info.base_allocation) {
|
|
|
|
error_setg(errp, "requested x-dirty-bitmap %s not found",
|
|
|
|
x_dirty_bitmap);
|
2018-11-30 03:32:32 +01:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto fail;
|
2018-11-30 03:32:31 +01:00
|
|
|
}
|
2018-10-08 17:27:18 +02:00
|
|
|
if (client->info.flags & NBD_FLAG_READ_ONLY) {
|
|
|
|
ret = bdrv_apply_auto_read_only(bs, "NBD export is read-only", errp);
|
|
|
|
if (ret < 0) {
|
2018-11-30 03:32:32 +01:00
|
|
|
goto fail;
|
2018-10-08 17:27:18 +02:00
|
|
|
}
|
nbd-client: Refuse read-only client with BDRV_O_RDWR
The NBD spec says that clients should not try to write/trim to
an export advertised as read-only by the server. But we failed
to check that, and would allow the block layer to use NBD with
BDRV_O_RDWR even when the server is read-only, which meant we
were depending on the server sending a proper EPERM failure for
various commands, and also exposes a leaky abstraction: using
qemu-io in read-write mode would succeed on 'w -z 0 0' because
of local short-circuiting logic, but 'w 0 0' would send a
request over the wire (where it then depends on the server, and
fails at least for qemu-nbd but might pass for other NBD
implementations).
With this patch, a client MUST request read-only mode to access
a server that is doing a read-only export, or else it will get
a message like:
can't open device nbd://localhost:10809/foo: request for write access conflicts with read-only export
It is no longer possible to even attempt writes over the wire
(including the corner case of 0-length writes), because the block
layer enforces the explicit read-only request; this matches the
behavior of qcow2 when backed by a read-only POSIX file.
Fix several iotests to comply with the new behavior (since
qemu-nbd of an internal snapshot, as well as nbd-server-add over QMP,
default to a read-only export, we must tell blockdev-add/qemu-io to
set up a read-only client).
CC: qemu-stable@nongnu.org
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20171108215703.9295-3-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
2017-11-08 22:56:58 +01:00
|
|
|
}
|
2017-07-07 22:30:41 +02:00
|
|
|
if (client->info.flags & NBD_FLAG_SEND_FUA) {
|
2016-05-04 00:39:06 +02:00
|
|
|
bs->supported_write_flags = BDRV_REQ_FUA;
|
2016-11-17 21:13:54 +01:00
|
|
|
bs->supported_zero_flags |= BDRV_REQ_FUA;
|
|
|
|
}
|
2017-07-07 22:30:41 +02:00
|
|
|
if (client->info.flags & NBD_FLAG_SEND_WRITE_ZEROES) {
|
2016-11-17 21:13:54 +01:00
|
|
|
bs->supported_zero_flags |= BDRV_REQ_MAY_UNMAP;
|
2016-05-04 00:39:06 +02:00
|
|
|
}
|
2013-12-01 22:23:41 +01:00
|
|
|
|
2016-02-10 19:41:01 +01:00
|
|
|
client->sioc = sioc;
|
2016-02-10 19:41:11 +01:00
|
|
|
|
|
|
|
if (!client->ioc) {
|
|
|
|
client->ioc = QIO_CHANNEL(sioc);
|
|
|
|
object_ref(OBJECT(client->ioc));
|
|
|
|
}
|
2013-12-01 22:23:41 +01:00
|
|
|
|
|
|
|
/* Now that we're connected, set the socket to be non-blocking and
|
|
|
|
* kick the reply mechanism. */
|
2016-02-10 19:41:01 +01:00
|
|
|
qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL);
|
2019-02-01 14:01:38 +01:00
|
|
|
client->connection_co = qemu_coroutine_create(nbd_connection_entry, client);
|
2019-02-15 16:53:51 +01:00
|
|
|
bdrv_inc_in_flight(bs);
|
2015-02-06 22:06:16 +01:00
|
|
|
nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs));
|
2013-12-01 22:23:41 +01:00
|
|
|
|
|
|
|
logout("Established connection with NBD server\n");
|
|
|
|
return 0;
|
2018-11-30 03:32:32 +01:00
|
|
|
|
|
|
|
fail:
|
|
|
|
/*
|
|
|
|
* We have connected, but must fail for other reasons. The
|
|
|
|
* connection is still blocking; send NBD_CMD_DISC as a courtesy
|
|
|
|
* to the server.
|
|
|
|
*/
|
|
|
|
{
|
|
|
|
NBDRequest request = { .type = NBD_CMD_DISC };
|
|
|
|
|
|
|
|
nbd_send_request(client->ioc ?: QIO_CHANNEL(sioc), &request);
|
2019-02-01 14:01:34 +01:00
|
|
|
|
|
|
|
object_unref(OBJECT(sioc));
|
|
|
|
|
2018-11-30 03:32:32 +01:00
|
|
|
return ret;
|
|
|
|
}
|
2013-12-01 22:23:41 +01:00
|
|
|
}
|
2019-02-01 14:01:35 +01:00
|
|
|
|
|
|
|
int nbd_client_init(BlockDriverState *bs,
|
|
|
|
SocketAddress *saddr,
|
|
|
|
const char *export,
|
|
|
|
QCryptoTLSCreds *tlscreds,
|
|
|
|
const char *hostname,
|
|
|
|
const char *x_dirty_bitmap,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
NBDClientSession *client = nbd_get_client_session(bs);
|
|
|
|
|
2019-02-15 16:53:51 +01:00
|
|
|
client->bs = bs;
|
2019-02-01 14:01:35 +01:00
|
|
|
qemu_co_mutex_init(&client->send_mutex);
|
|
|
|
qemu_co_queue_init(&client->free_sema);
|
|
|
|
|
|
|
|
return nbd_client_connect(bs, saddr, export, tlscreds, hostname,
|
|
|
|
x_dirty_bitmap, errp);
|
|
|
|
}
|