From b90fb4b8f5cd01dfcf0e3b45c93977a2e3bdcc71 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 8 Sep 2011 17:24:54 +0200 Subject: [PATCH 01/20] nbd: support feature negotiation nbd supports writing flags in bytes 24...27 of the header, and uses that for the read-only flag. Add support for it in qemu-nbd. Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- block/nbd.c | 4 ++-- nbd.c | 32 +++++++++++++++++++++++++------- nbd.h | 9 ++++++--- qemu-nbd.c | 13 ++++++------- 4 files changed, 39 insertions(+), 19 deletions(-) diff --git a/block/nbd.c b/block/nbd.c index 70edd81bd6..76f04d863c 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -48,6 +48,7 @@ typedef struct BDRVNBDState { int sock; + uint32_t nbdflags; off_t size; size_t blocksize; char *export_name; /* An NBD server may export several devices */ @@ -111,7 +112,6 @@ static int nbd_establish_connection(BlockDriverState *bs) int ret; off_t size; size_t blocksize; - uint32_t nbdflags; if (s->host_spec[0] == '/') { sock = unix_socket_outgoing(s->host_spec); @@ -126,7 +126,7 @@ static int nbd_establish_connection(BlockDriverState *bs) } /* NBD handshake */ - ret = nbd_receive_negotiate(sock, s->export_name, &nbdflags, &size, + ret = nbd_receive_negotiate(sock, s->export_name, &s->nbdflags, &size, &blocksize); if (ret == -1) { logout("Failed to negotiate with the NBD server\n"); diff --git a/nbd.c b/nbd.c index 6d81cfbcd0..47ecb2234b 100644 --- a/nbd.c +++ b/nbd.c @@ -30,6 +30,10 @@ #include #include +#ifdef __linux__ +#include +#endif + #include "qemu_socket.h" //#define DEBUG_NBD @@ -172,7 +176,7 @@ int unix_socket_outgoing(const char *path) Request (type == 2) */ -int nbd_negotiate(int csock, off_t size) +int nbd_negotiate(int csock, off_t size, uint32_t flags) { char buf[8 + 8 + 8 + 128]; @@ -180,14 +184,16 @@ int nbd_negotiate(int csock, off_t size) [ 0 .. 7] passwd ("NBDMAGIC") [ 8 .. 15] magic (0x00420281861253) [16 .. 23] size - [24 .. 151] reserved (0) + [24 .. 27] flags + [28 .. 151] reserved (0) */ TRACE("Beginning negotiation."); memcpy(buf, "NBDMAGIC", 8); cpu_to_be64w((uint64_t*)(buf + 8), 0x00420281861253LL); cpu_to_be64w((uint64_t*)(buf + 16), size); - memset(buf + 24, 0, 128); + cpu_to_be32w((uint32_t*)(buf + 24), flags | NBD_FLAG_HAS_FLAGS); + memset(buf + 28, 0, 124); if (write_sync(csock, buf, sizeof(buf)) != sizeof(buf)) { LOG("write failed"); @@ -337,8 +343,8 @@ int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags, return 0; } -#ifndef _WIN32 -int nbd_init(int fd, int csock, off_t size, size_t blocksize) +#ifdef __linux__ +int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize) { TRACE("Setting block size to %lu", (unsigned long)blocksize); @@ -358,6 +364,18 @@ int nbd_init(int fd, int csock, off_t size, size_t blocksize) return -1; } + if (flags & NBD_FLAG_READ_ONLY) { + int read_only = 1; + TRACE("Setting readonly attribute"); + + if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) { + int serrno = errno; + LOG("Failed setting read-only attribute"); + errno = serrno; + return -1; + } + } + TRACE("Clearing NBD socket"); if (ioctl(fd, NBD_CLEAR_SOCK) == -1) { @@ -548,7 +566,7 @@ static int nbd_send_reply(int csock, struct nbd_reply *reply) } int nbd_trip(BlockDriverState *bs, int csock, off_t size, uint64_t dev_offset, - off_t *offset, bool readonly, uint8_t *data, int data_size) + off_t *offset, uint32_t nbdflags, uint8_t *data, int data_size) { struct nbd_request request; struct nbd_reply reply; @@ -632,7 +650,7 @@ int nbd_trip(BlockDriverState *bs, int csock, off_t size, uint64_t dev_offset, return -1; } - if (readonly) { + if (nbdflags & NBD_FLAG_READ_ONLY) { TRACE("Server is read-only, return error"); reply.error = 1; } else { diff --git a/nbd.h b/nbd.h index df7b7af7c0..b9c3b391d6 100644 --- a/nbd.h +++ b/nbd.h @@ -37,6 +37,9 @@ struct nbd_reply { uint64_t handle; } QEMU_PACKED; +#define NBD_FLAG_HAS_FLAGS (1 << 0) /* Flags are there */ +#define NBD_FLAG_READ_ONLY (1 << 1) /* Device is read-only */ + enum { NBD_CMD_READ = 0, NBD_CMD_WRITE = 1, @@ -53,14 +56,14 @@ int tcp_socket_incoming_spec(const char *address_and_port); int unix_socket_outgoing(const char *path); int unix_socket_incoming(const char *path); -int nbd_negotiate(int csock, off_t size); +int nbd_negotiate(int csock, off_t size, uint32_t flags); int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags, off_t *size, size_t *blocksize); -int nbd_init(int fd, int csock, off_t size, size_t blocksize); +int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize); int nbd_send_request(int csock, struct nbd_request *request); int nbd_receive_reply(int csock, struct nbd_reply *reply); int nbd_trip(BlockDriverState *bs, int csock, off_t size, uint64_t dev_offset, - off_t *offset, bool readonly, uint8_t *data, int data_size); + off_t *offset, uint32_t nbdflags, uint8_t *data, int data_size); int nbd_client(int fd); int nbd_disconnect(int fd); diff --git a/qemu-nbd.c b/qemu-nbd.c index 3a39145174..d8d3e15a84 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -185,7 +185,7 @@ int main(int argc, char **argv) BlockDriverState *bs; off_t dev_offset = 0; off_t offset = 0; - bool readonly = false; + uint32_t nbdflags = 0; bool disconnect = false; const char *bindto = "0.0.0.0"; int port = NBD_DEFAULT_PORT; @@ -230,7 +230,6 @@ int main(int argc, char **argv) int nb_fds = 0; int max_fd; int persistent = 0; - uint32_t nbdflags; while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) { switch (ch) { @@ -263,7 +262,7 @@ int main(int argc, char **argv) } break; case 'r': - readonly = true; + nbdflags |= NBD_FLAG_READ_ONLY; flags &= ~BDRV_O_RDWR; break; case 'P': @@ -398,13 +397,13 @@ int main(int argc, char **argv) } ret = nbd_receive_negotiate(sock, NULL, &nbdflags, - &size, &blocksize); + &size, &blocksize); if (ret == -1) { ret = 1; goto out; } - ret = nbd_init(fd, sock, size, blocksize); + ret = nbd_init(fd, sock, nbdflags, size, blocksize); if (ret == -1) { ret = 1; goto out; @@ -463,7 +462,7 @@ int main(int argc, char **argv) for (i = 1; i < nb_fds && ret; i++) { if (FD_ISSET(sharing_fds[i], &fds)) { if (nbd_trip(bs, sharing_fds[i], fd_size, dev_offset, - &offset, readonly, data, NBD_BUFFER_SIZE) != 0) { + &offset, nbdflags, data, NBD_BUFFER_SIZE) != 0) { close(sharing_fds[i]); nb_fds--; sharing_fds[i] = sharing_fds[nb_fds]; @@ -479,7 +478,7 @@ int main(int argc, char **argv) (struct sockaddr *)&addr, &addr_len); if (sharing_fds[nb_fds] != -1 && - nbd_negotiate(sharing_fds[nb_fds], fd_size) != -1) { + nbd_negotiate(sharing_fds[nb_fds], fd_size, nbdflags) != -1) { if (sharing_fds[nb_fds] > max_fd) max_fd = sharing_fds[nb_fds]; nb_fds++; From bbb74edd405bee8cf29957ef781294f40f02d4c6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 8 Sep 2011 17:24:55 +0200 Subject: [PATCH 02/20] nbd: sync API definitions with upstream Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- nbd.c | 2 ++ nbd.h | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/nbd.c b/nbd.c index 47ecb2234b..d32a19e5e3 100644 --- a/nbd.c +++ b/nbd.c @@ -67,6 +67,8 @@ #define NBD_PRINT_DEBUG _IO(0xab, 6) #define NBD_SET_SIZE_BLOCKS _IO(0xab, 7) #define NBD_DISCONNECT _IO(0xab, 8) +#define NBD_SET_TIMEOUT _IO(0xab, 9) +#define NBD_SET_FLAGS _IO(0xab, 10) #define NBD_OPT_EXPORT_NAME (1 << 0) diff --git a/nbd.h b/nbd.h index b9c3b391d6..61553f4128 100644 --- a/nbd.h +++ b/nbd.h @@ -39,11 +39,20 @@ struct nbd_reply { #define NBD_FLAG_HAS_FLAGS (1 << 0) /* Flags are there */ #define NBD_FLAG_READ_ONLY (1 << 1) /* Device is read-only */ +#define NBD_FLAG_SEND_FLUSH (1 << 2) /* Send FLUSH */ +#define NBD_FLAG_SEND_FUA (1 << 3) /* Send FUA (Force Unit Access) */ +#define NBD_FLAG_ROTATIONAL (1 << 4) /* Use elevator algorithm - rotational media */ +#define NBD_FLAG_SEND_TRIM (1 << 5) /* Send TRIM (discard) */ + +#define NBD_CMD_MASK_COMMAND 0x0000ffff +#define NBD_CMD_FLAG_FUA (1 << 16) enum { NBD_CMD_READ = 0, NBD_CMD_WRITE = 1, - NBD_CMD_DISC = 2 + NBD_CMD_DISC = 2, + NBD_CMD_FLUSH = 3, + NBD_CMD_TRIM = 4 }; #define NBD_DEFAULT_PORT 10809 From 973b3d0a7b2823f510e948283284f103ede43c49 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 8 Sep 2011 17:24:56 +0200 Subject: [PATCH 03/20] nbd: support NBD_SET_FLAGS ioctl The nbd kernel module cannot enable DISCARD requests unless it is informed about it. The flags field in the header is used for this, and this patch adds support for it. Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- nbd.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nbd.c b/nbd.c index d32a19e5e3..595f4d8df3 100644 --- a/nbd.c +++ b/nbd.c @@ -378,6 +378,14 @@ int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize) } } + if (ioctl(fd, NBD_SET_FLAGS, flags) < 0 + && errno != ENOTTY) { + int serrno = errno; + LOG("Failed setting flags"); + errno = serrno; + return -1; + } + TRACE("Clearing NBD socket"); if (ioctl(fd, NBD_CLEAR_SOCK) == -1) { From 074abad3264bf2bb7cfd279b7457416b318716ca Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 14 Sep 2011 11:22:26 +0200 Subject: [PATCH 04/20] raw-posix: Fix bdrv_flush error return values bdrv_flush is supposed to use 0/-errno return values Signed-off-by: Kevin Wolf --- block/raw-posix.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index a624f56f86..305998ddb3 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -839,7 +839,14 @@ static int raw_create(const char *filename, QEMUOptionParameter *options) static int raw_flush(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; - return qemu_fdatasync(s->fd); + int ret; + + ret = qemu_fdatasync(s->fd); + if (ret < 0) { + return -errno; + } + + return 0; } #ifdef CONFIG_XFS From 1b8f8a6f9190996c617a177acc3b8fd7dff87e24 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 Sep 2011 15:25:56 +0200 Subject: [PATCH 05/20] scsi-generic: do not disable FUA I found no rationale for this in the logs, and it is quite bad because it will make scsi-generic unsafe WRT power failures. Signed-off-by: Paolo Bonzini Reviewed-by: Christoph Hellwig Signed-off-by: Kevin Wolf --- hw/scsi-generic.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/hw/scsi-generic.c b/hw/scsi-generic.c index 5ce01afdce..8f6b70df2b 100644 --- a/hw/scsi-generic.c +++ b/hw/scsi-generic.c @@ -244,12 +244,6 @@ static uint8_t *scsi_get_buf(SCSIRequest *req) static void scsi_req_fixup(SCSIRequest *req) { switch(req->cmd.buf[0]) { - case WRITE_10: - req->cmd.buf[1] &= ~0x08; /* disable FUA */ - break; - case READ_10: - req->cmd.buf[1] &= ~0x08; /* disable FUA */ - break; case REWIND: case START_STOP: if (req->dev->type == TYPE_TAPE) { From bbca72c6215c3e1df72b80bb3f54fe3b2b05212b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 16 Sep 2011 16:40:00 +0200 Subject: [PATCH 06/20] dma-helpers: rename is_write to to_dev Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- dma-helpers.c | 14 +++++++------- dma.h | 2 +- hw/ide/core.c | 2 +- hw/ide/macio.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/dma-helpers.c b/dma-helpers.c index 4610ea0420..717e384104 100644 --- a/dma-helpers.c +++ b/dma-helpers.c @@ -42,7 +42,7 @@ typedef struct { BlockDriverAIOCB *acb; QEMUSGList *sg; uint64_t sector_num; - int is_write; + bool to_dev; int sg_cur_index; target_phys_addr_t sg_cur_byte; QEMUIOVector iov; @@ -75,7 +75,7 @@ static void dma_bdrv_unmap(DMAAIOCB *dbs) for (i = 0; i < dbs->iov.niov; ++i) { cpu_physical_memory_unmap(dbs->iov.iov[i].iov_base, - dbs->iov.iov[i].iov_len, !dbs->is_write, + dbs->iov.iov[i].iov_len, !dbs->to_dev, dbs->iov.iov[i].iov_len); } } @@ -101,7 +101,7 @@ static void dma_bdrv_cb(void *opaque, int ret) while (dbs->sg_cur_index < dbs->sg->nsg) { cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte; cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte; - mem = cpu_physical_memory_map(cur_addr, &cur_len, !dbs->is_write); + mem = cpu_physical_memory_map(cur_addr, &cur_len, !dbs->to_dev); if (!mem) break; qemu_iovec_add(&dbs->iov, mem, cur_len); @@ -143,7 +143,7 @@ static AIOPool dma_aio_pool = { BlockDriverAIOCB *dma_bdrv_io( BlockDriverState *bs, QEMUSGList *sg, uint64_t sector_num, DMAIOFunc *io_func, BlockDriverCompletionFunc *cb, - void *opaque, int is_write) + void *opaque, bool to_dev) { DMAAIOCB *dbs = qemu_aio_get(&dma_aio_pool, bs, cb, opaque); @@ -153,7 +153,7 @@ BlockDriverAIOCB *dma_bdrv_io( dbs->sector_num = sector_num; dbs->sg_cur_index = 0; dbs->sg_cur_byte = 0; - dbs->is_write = is_write; + dbs->to_dev = to_dev; dbs->io_func = io_func; dbs->bh = NULL; qemu_iovec_init(&dbs->iov, sg->nsg); @@ -170,12 +170,12 @@ BlockDriverAIOCB *dma_bdrv_read(BlockDriverState *bs, QEMUSGList *sg, uint64_t sector, void (*cb)(void *opaque, int ret), void *opaque) { - return dma_bdrv_io(bs, sg, sector, bdrv_aio_readv, cb, opaque, 0); + return dma_bdrv_io(bs, sg, sector, bdrv_aio_readv, cb, opaque, false); } BlockDriverAIOCB *dma_bdrv_write(BlockDriverState *bs, QEMUSGList *sg, uint64_t sector, void (*cb)(void *opaque, int ret), void *opaque) { - return dma_bdrv_io(bs, sg, sector, bdrv_aio_writev, cb, opaque, 1); + return dma_bdrv_io(bs, sg, sector, bdrv_aio_writev, cb, opaque, true); } diff --git a/dma.h b/dma.h index a6db5bacbb..b222346aa0 100644 --- a/dma.h +++ b/dma.h @@ -39,7 +39,7 @@ typedef BlockDriverAIOCB *DMAIOFunc(BlockDriverState *bs, int64_t sector_num, BlockDriverAIOCB *dma_bdrv_io(BlockDriverState *bs, QEMUSGList *sg, uint64_t sector_num, DMAIOFunc *io_func, BlockDriverCompletionFunc *cb, - void *opaque, int is_write); + void *opaque, bool to_dev); BlockDriverAIOCB *dma_bdrv_read(BlockDriverState *bs, QEMUSGList *sg, uint64_t sector, BlockDriverCompletionFunc *cb, void *opaque); diff --git a/hw/ide/core.c b/hw/ide/core.c index 9297b9e657..f424347cb0 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -603,7 +603,7 @@ handle_rw_error: break; case IDE_DMA_TRIM: s->bus->dma->aiocb = dma_bdrv_io(s->bs, &s->sg, sector_num, - ide_issue_trim, ide_dma_cb, s, 1); + ide_issue_trim, ide_dma_cb, s, true); break; } diff --git a/hw/ide/macio.c b/hw/ide/macio.c index c1844cb738..37b8239b4d 100644 --- a/hw/ide/macio.c +++ b/hw/ide/macio.c @@ -156,7 +156,7 @@ static void pmac_ide_transfer_cb(void *opaque, int ret) break; case IDE_DMA_TRIM: m->aiocb = dma_bdrv_io(s->bs, &s->sg, sector_num, - ide_issue_trim, pmac_ide_transfer_cb, s, 1); + ide_issue_trim, pmac_ide_transfer_cb, s, true); break; } From 10dc8aef419b4ce77670fb080ffe995bf9d7b0a1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 16 Sep 2011 16:40:01 +0200 Subject: [PATCH 07/20] dma-helpers: allow including from target-independent code Target-independent code cannot construct sglists, but it can take them from the outside as a black box. Allow this. Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- dma.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dma.h b/dma.h index b222346aa0..2bdc236c4c 100644 --- a/dma.h +++ b/dma.h @@ -15,10 +15,13 @@ #include "hw/hw.h" #include "block.h" -typedef struct { +typedef struct ScatterGatherEntry ScatterGatherEntry; + +#if defined(TARGET_PHYS_ADDR_BITS) +struct ScatterGatherEntry { target_phys_addr_t base; target_phys_addr_t len; -} ScatterGatherEntry; +}; struct QEMUSGList { ScatterGatherEntry *sg; @@ -31,6 +34,7 @@ void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint); void qemu_sglist_add(QEMUSGList *qsg, target_phys_addr_t base, target_phys_addr_t len); void qemu_sglist_destroy(QEMUSGList *qsg); +#endif typedef BlockDriverAIOCB *DMAIOFunc(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *iov, int nb_sectors, From c3adb5b9168a57790b5074489b6f0275ac3cc8b5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 16 Sep 2011 16:40:02 +0200 Subject: [PATCH 08/20] dma-helpers: rewrite completion/cancellation This fixes various problems with completion/cancellation: * if the io_func fails to get an AIOCB, the callback wasn't called * If DMA encounters a bounce buffer conflict, and the DMA operation is canceled before the bottom half fires, bad things happen. * memory is not unmapped after cancellation, again causing problems when doing DMA to I/O areas * cancellation could leak the iovec * the callback was missed if the I/O operation failed without returning an AIOCB and probably more that I've missed. The patch fixes them by sharing the cleanup code between completion and cancellation. The dma_bdrv_cb now returns a boolean completed/not completed flag, and the wrapper dma_continue takes care of tasks to do upon completion. Most of these are basically impossible in practice, but it is better to be tidy... Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- dma-helpers.c | 44 +++++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/dma-helpers.c b/dma-helpers.c index 717e384104..86d2d0a997 100644 --- a/dma-helpers.c +++ b/dma-helpers.c @@ -43,6 +43,7 @@ typedef struct { QEMUSGList *sg; uint64_t sector_num; bool to_dev; + bool in_cancel; int sg_cur_index; target_phys_addr_t sg_cur_byte; QEMUIOVector iov; @@ -58,7 +59,7 @@ static void reschedule_dma(void *opaque) qemu_bh_delete(dbs->bh); dbs->bh = NULL; - dma_bdrv_cb(opaque, 0); + dma_bdrv_cb(dbs, 0); } static void continue_after_map_failure(void *opaque) @@ -78,6 +79,26 @@ static void dma_bdrv_unmap(DMAAIOCB *dbs) dbs->iov.iov[i].iov_len, !dbs->to_dev, dbs->iov.iov[i].iov_len); } + qemu_iovec_reset(&dbs->iov); +} + +static void dma_complete(DMAAIOCB *dbs, int ret) +{ + dma_bdrv_unmap(dbs); + if (dbs->common.cb) { + dbs->common.cb(dbs->common.opaque, ret); + } + qemu_iovec_destroy(&dbs->iov); + if (dbs->bh) { + qemu_bh_delete(dbs->bh); + dbs->bh = NULL; + } + if (!dbs->in_cancel) { + /* Requests may complete while dma_aio_cancel is in progress. In + * this case, the AIOCB should not be released because it is still + * referenced by dma_aio_cancel. */ + qemu_aio_release(dbs); + } } static void dma_bdrv_cb(void *opaque, int ret) @@ -89,12 +110,9 @@ static void dma_bdrv_cb(void *opaque, int ret) dbs->acb = NULL; dbs->sector_num += dbs->iov.size / 512; dma_bdrv_unmap(dbs); - qemu_iovec_reset(&dbs->iov); if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) { - dbs->common.cb(dbs->common.opaque, ret); - qemu_iovec_destroy(&dbs->iov); - qemu_aio_release(dbs); + dma_complete(dbs, ret); return; } @@ -120,9 +138,7 @@ static void dma_bdrv_cb(void *opaque, int ret) dbs->acb = dbs->io_func(dbs->bs, dbs->sector_num, &dbs->iov, dbs->iov.size / 512, dma_bdrv_cb, dbs); if (!dbs->acb) { - dma_bdrv_unmap(dbs); - qemu_iovec_destroy(&dbs->iov); - return; + dma_complete(dbs, -EIO); } } @@ -131,8 +147,14 @@ static void dma_aio_cancel(BlockDriverAIOCB *acb) DMAAIOCB *dbs = container_of(acb, DMAAIOCB, common); if (dbs->acb) { - bdrv_aio_cancel(dbs->acb); + BlockDriverAIOCB *acb = dbs->acb; + dbs->acb = NULL; + dbs->in_cancel = true; + bdrv_aio_cancel(acb); + dbs->in_cancel = false; } + dbs->common.cb = NULL; + dma_complete(dbs, 0); } static AIOPool dma_aio_pool = { @@ -158,10 +180,6 @@ BlockDriverAIOCB *dma_bdrv_io( dbs->bh = NULL; qemu_iovec_init(&dbs->iov, sg->nsg); dma_bdrv_cb(dbs, 0); - if (!dbs->acb) { - qemu_aio_release(dbs); - return NULL; - } return &dbs->common; } From 103b40f51e4012b3b0ad20f615562a1806d7f49a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 16 Sep 2011 16:40:03 +0200 Subject: [PATCH 09/20] scsi-disk: commonize iovec creation between reads and writes Also, consistently use qiov.size instead of iov.iov_len. Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- hw/scsi-disk.c | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c index 4a60820b18..84e8662780 100644 --- a/hw/scsi-disk.c +++ b/hw/scsi-disk.c @@ -108,6 +108,13 @@ static void scsi_cancel_io(SCSIRequest *req) r->req.aiocb = NULL; } +static uint32_t scsi_init_iovec(SCSIDiskReq *r) +{ + r->iov.iov_len = MIN(r->sector_count * 512, SCSI_DMA_BUF_SIZE); + qemu_iovec_init_external(&r->qiov, &r->iov, 1); + return r->qiov.size / 512; +} + static void scsi_read_complete(void * opaque, int ret) { SCSIDiskReq *r = (SCSIDiskReq *)opaque; @@ -125,12 +132,12 @@ static void scsi_read_complete(void * opaque, int ret) } } - DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->iov.iov_len); + DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->qiov.size); - n = r->iov.iov_len / 512; + n = r->qiov.size / 512; r->sector += n; r->sector_count -= n; - scsi_req_data(&r->req, r->iov.iov_len); + scsi_req_data(&r->req, r->qiov.size); } static void scsi_flush_complete(void * opaque, int ret) @@ -181,16 +188,10 @@ static void scsi_read_data(SCSIRequest *req) return; } - n = r->sector_count; - if (n > SCSI_DMA_BUF_SIZE / 512) - n = SCSI_DMA_BUF_SIZE / 512; - if (s->tray_open) { scsi_read_complete(r, -ENOMEDIUM); } - r->iov.iov_len = n * 512; - qemu_iovec_init_external(&r->qiov, &r->iov, 1); - + n = scsi_init_iovec(r); bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ); r->req.aiocb = bdrv_aio_readv(s->bs, r->sector, &r->qiov, n, scsi_read_complete, r); @@ -239,7 +240,6 @@ static void scsi_write_complete(void * opaque, int ret) { SCSIDiskReq *r = (SCSIDiskReq *)opaque; SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); - uint32_t len; uint32_t n; if (r->req.aiocb != NULL) { @@ -253,19 +253,15 @@ static void scsi_write_complete(void * opaque, int ret) } } - n = r->iov.iov_len / 512; + n = r->qiov.size / 512; r->sector += n; r->sector_count -= n; if (r->sector_count == 0) { scsi_req_complete(&r->req, GOOD); } else { - len = r->sector_count * 512; - if (len > SCSI_DMA_BUF_SIZE) { - len = SCSI_DMA_BUF_SIZE; - } - r->iov.iov_len = len; - DPRINTF("Write complete tag=0x%x more=%d\n", r->req.tag, len); - scsi_req_data(&r->req, len); + scsi_init_iovec(r); + DPRINTF("Write complete tag=0x%x more=%d\n", r->req.tag, r->qiov.size); + scsi_req_data(&r->req, r->qiov.size); } } @@ -284,21 +280,19 @@ static void scsi_write_data(SCSIRequest *req) return; } - n = r->iov.iov_len / 512; + n = r->qiov.size / 512; if (n) { if (s->tray_open) { scsi_write_complete(r, -ENOMEDIUM); } - qemu_iovec_init_external(&r->qiov, &r->iov, 1); - bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_WRITE); r->req.aiocb = bdrv_aio_writev(s->bs, r->sector, &r->qiov, n, - scsi_write_complete, r); + scsi_write_complete, r); if (r->req.aiocb == NULL) { scsi_write_complete(r, -ENOMEM); } } else { - /* Invoke completion routine to fetch data from host. */ + /* Called for the first time. Ask the driver to send us more data. */ scsi_write_complete(r, 0); } } From 7285477ab11831b1cf56e45878a89170dd06d9b9 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 16 Sep 2011 16:40:04 +0200 Subject: [PATCH 10/20] scsi-disk: lazily allocate bounce buffer It will not be needed for reads and writes if the HBA provides a sglist. In addition, this lets scsi-disk refuse commands with an excessive allocation length, as well as limit memory on usual well-behaved guests. Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- hw/scsi-disk.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c index 84e8662780..48abe49496 100644 --- a/hw/scsi-disk.c +++ b/hw/scsi-disk.c @@ -55,6 +55,7 @@ typedef struct SCSIDiskReq { /* Both sector and sector_count are in terms of qemu 512 byte blocks. */ uint64_t sector; uint32_t sector_count; + uint32_t buflen; struct iovec iov; QEMUIOVector qiov; uint32_t status; @@ -78,13 +79,15 @@ struct SCSIDiskState }; static int scsi_handle_rw_error(SCSIDiskReq *r, int error, int type); -static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf); +static int scsi_disk_emulate_command(SCSIDiskReq *r); static void scsi_free_request(SCSIRequest *req) { SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req); - qemu_vfree(r->iov.iov_base); + if (r->iov.iov_base) { + qemu_vfree(r->iov.iov_base); + } } /* Helper function for command completion with sense. */ @@ -110,7 +113,13 @@ static void scsi_cancel_io(SCSIRequest *req) static uint32_t scsi_init_iovec(SCSIDiskReq *r) { - r->iov.iov_len = MIN(r->sector_count * 512, SCSI_DMA_BUF_SIZE); + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); + + if (!r->iov.iov_base) { + r->buflen = SCSI_DMA_BUF_SIZE; + r->iov.iov_base = qemu_blockalign(s->bs, r->buflen); + } + r->iov.iov_len = MIN(r->sector_count * 512, r->buflen); qemu_iovec_init_external(&r->qiov, &r->iov, 1); return r->qiov.size / 512; } @@ -323,7 +332,7 @@ static void scsi_dma_restart_bh(void *opaque) scsi_write_data(&r->req); break; case SCSI_REQ_STATUS_RETRY_FLUSH: - ret = scsi_disk_emulate_command(r, r->iov.iov_base); + ret = scsi_disk_emulate_command(r); if (ret == 0) { scsi_req_complete(&r->req, GOOD); } @@ -838,13 +847,31 @@ static int scsi_disk_emulate_start_stop(SCSIDiskReq *r) return 0; } -static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf) +static int scsi_disk_emulate_command(SCSIDiskReq *r) { SCSIRequest *req = &r->req; SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev); uint64_t nb_sectors; + uint8_t *outbuf; int buflen = 0; + if (!r->iov.iov_base) { + /* + * FIXME: we shouldn't return anything bigger than 4k, but the code + * requires the buffer to be as big as req->cmd.xfer in several + * places. So, do not allow CDBs with a very large ALLOCATION + * LENGTH. The real fix would be to modify scsi_read_data and + * dma_buf_read, so that they return data beyond the buflen + * as all zeros. + */ + if (req->cmd.xfer > 65536) { + goto illegal_request; + } + r->buflen = MAX(4096, req->cmd.xfer); + r->iov.iov_base = qemu_blockalign(s->bs, r->buflen); + } + + outbuf = r->iov.iov_base; switch (req->cmd.buf[0]) { case TEST_UNIT_READY: if (s->tray_open || !bdrv_is_inserted(s->bs)) @@ -995,11 +1022,9 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *buf) SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev); int32_t len; uint8_t command; - uint8_t *outbuf; int rc; command = buf[0]; - outbuf = (uint8_t *)r->iov.iov_base; DPRINTF("Command: lun=%d tag=0x%x data=0x%02x", req->lun, req->tag, buf[0]); #ifdef DEBUG_SCSI @@ -1028,7 +1053,7 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *buf) case GET_CONFIGURATION: case SERVICE_ACTION_IN_16: case VERIFY_10: - rc = scsi_disk_emulate_command(r, outbuf); + rc = scsi_disk_emulate_command(r); if (rc < 0) { return 0; } @@ -1279,11 +1304,8 @@ static SCSIRequest *scsi_new_request(SCSIDevice *d, uint32_t tag, { SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, d); SCSIRequest *req; - SCSIDiskReq *r; req = scsi_req_alloc(&scsi_disk_reqops, &s->qdev, tag, lun, hba_private); - r = DO_UPCAST(SCSIDiskReq, req, req); - r->iov.iov_base = qemu_blockalign(s->bs, SCSI_DMA_BUF_SIZE); return req; } From b3c0bfb6f949d8f1c97f390f951c0bab3e703810 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Mon, 19 Sep 2011 10:26:42 +0800 Subject: [PATCH 11/20] VMDK: fix leak of extent_file Release extent_file on error in vmdk_parse_extents. Added closing files in freeing extents. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/vmdk.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 6c8edfc190..5d16ec49bc 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -179,11 +179,16 @@ static void vmdk_free_extents(BlockDriverState *bs) { int i; BDRVVmdkState *s = bs->opaque; + VmdkExtent *e; for (i = 0; i < s->num_extents; i++) { - g_free(s->extents[i].l1_table); - g_free(s->extents[i].l2_cache); - g_free(s->extents[i].l1_backup_table); + e = &s->extents[i]; + g_free(e->l1_table); + g_free(e->l2_cache); + g_free(e->l1_backup_table); + if (e->file != bs->file) { + bdrv_delete(e->file); + } } g_free(s->extents); } @@ -619,12 +624,13 @@ static int vmdk_open_desc_file(BlockDriverState *bs, int flags, s->desc_offset = 0; ret = vmdk_parse_extents(buf, bs, bs->file->filename); if (ret) { + vmdk_free_extents(bs); return ret; } /* try to open parent images, if exist */ if (vmdk_parent_open(bs)) { - g_free(s->extents); + vmdk_free_extents(bs); return -EINVAL; } s->parent_cid = vmdk_read_cid(bs, 1); From 21cfa41e91b5f49e8aa35ce768dcbfe436021db6 Mon Sep 17 00:00:00 2001 From: Frediano Ziglio Date: Fri, 16 Sep 2011 13:34:46 +0200 Subject: [PATCH 12/20] posix-aio-compat: Removed unused offset variable Signed-off-by: Frediano Ziglio Signed-off-by: Kevin Wolf --- posix-aio-compat.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/posix-aio-compat.c b/posix-aio-compat.c index 3193dbf83c..63a8faed15 100644 --- a/posix-aio-compat.c +++ b/posix-aio-compat.c @@ -181,7 +181,6 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb) { - size_t offset = 0; ssize_t len; do { @@ -189,12 +188,12 @@ static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb) len = qemu_pwritev(aiocb->aio_fildes, aiocb->aio_iov, aiocb->aio_niov, - aiocb->aio_offset + offset); + aiocb->aio_offset); else len = qemu_preadv(aiocb->aio_fildes, aiocb->aio_iov, aiocb->aio_niov, - aiocb->aio_offset + offset); + aiocb->aio_offset); } while (len == -1 && errno == EINTR); if (len == -1) From a26a13da687f757c07e2a5c26fa411840405e6d7 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 12 Sep 2011 11:19:25 +0300 Subject: [PATCH 13/20] AHCI Port Interrupt Enable register cleaning on soft reset I've found that FreeBSD AHCI driver doesn't work with AHCI hardware emulation of QEMU 0.15.0. I believe the problem is on QEMU's side. As I see, it clears port's Interrupt Enable register each time when reset of any level happens. Is is reasonable for the global controller reset. It is probably not good, but acceptable for FreeBSD driver for the port hard reset. But it is IMO wrong for the device soft reset. None of real hardware I know behaves that way. Signed-off-by: Alexander Motin Signed-off-by: Kevin Wolf --- hw/ide/ahci.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index a8659cf8b9..464c28b40b 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -499,10 +499,7 @@ static void ahci_reset_port(AHCIState *s, int port) ide_bus_reset(&d->port); ide_state->ncq_queues = AHCI_MAX_CMDS; - pr->irq_stat = 0; - pr->irq_mask = 0; pr->scr_stat = 0; - pr->scr_ctl = 0; pr->scr_err = 0; pr->scr_act = 0; d->busy_slot = -1; @@ -1159,12 +1156,17 @@ void ahci_uninit(AHCIState *s) void ahci_reset(void *opaque) { struct AHCIPCIState *d = opaque; + AHCIPortRegs *pr; int i; d->ahci.control_regs.irqstatus = 0; d->ahci.control_regs.ghc = 0; for (i = 0; i < d->ahci.ports; i++) { + pr = &d->ahci.dev[i].port_regs; + pr->irq_stat = 0; + pr->irq_mask = 0; + pr->scr_ctl = 0; ahci_reset_port(&d->ahci, i); } } From f9fe18ec772d2852caaa7ca3cb72720d6822edca Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 15 Sep 2011 14:11:08 -0700 Subject: [PATCH 14/20] rbd: ignore failures when reading from default conf location If we are reading from the default config location, ignore any failures. It is perfectly legal for the user to specify exactly the options they need and to not rely on any config file. Signed-off-by: Sage Weil Signed-off-by: Kevin Wolf --- block/rbd.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index 1b78d51398..f64b2e0542 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -298,11 +298,8 @@ static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options) } if (strstr(conf, "conf=") == NULL) { - if (rados_conf_read_file(cluster, NULL) < 0) { - error_report("error reading config file"); - rados_shutdown(cluster); - return -EIO; - } + /* try default location, but ignore failure */ + rados_conf_read_file(cluster, NULL); } if (conf[0] != '\0' && @@ -441,11 +438,8 @@ static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags) } if (strstr(conf, "conf=") == NULL) { - r = rados_conf_read_file(s->cluster, NULL); - if (r < 0) { - error_report("error reading config file"); - goto failed_shutdown; - } + /* try default location, but ignore failure */ + rados_conf_read_file(s->cluster, NULL); } if (conf[0] != '\0') { From 9e1fbcde578b0ab3cd2732e5334e772ab19371ac Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 15 Sep 2011 14:11:10 -0700 Subject: [PATCH 15/20] rbd: update comment heading Properly document the configuration string syntax and semantics. Remove (out of date) details about the librbd implementation. Signed-off-by: Sage Weil Signed-off-by: Kevin Wolf --- block/rbd.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index f64b2e0542..23d8751d03 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -13,35 +13,33 @@ #include "qemu-common.h" #include "qemu-error.h" - #include "block_int.h" #include - - /* * When specifying the image filename use: * * rbd:poolname/devicename[@snapshotname][:option1=value1[:option2=value2...]] * - * poolname must be the name of an existing rados pool + * poolname must be the name of an existing rados pool. * - * devicename is the basename for all objects used to - * emulate the raw device. + * devicename is the name of the rbd image. * - * Each option given is used to configure rados, and may be - * any Ceph option, or "conf". The "conf" option specifies - * a Ceph configuration file to read. + * Each option given is used to configure rados, and may be any valid + * Ceph option, "id", or "conf". * - * Metadata information (image size, ...) is stored in an - * object with the name "devicename.rbd". + * The "id" option indicates what user we should authenticate as to + * the Ceph cluster. If it is excluded we will use the Ceph default + * (normally 'admin'). * - * The raw device is split into 4MB sized objects by default. - * The sequencenumber is encoded in a 12 byte long hex-string, - * and is attached to the devicename, separated by a dot. - * e.g. "devicename.1234567890ab" + * The "conf" option specifies a Ceph configuration file to read. If + * it is not specified, we will read from the default Ceph locations + * (e.g., /etc/ceph/ceph.conf). To avoid reading _any_ configuration + * file, specify conf=/dev/null. * + * Configuration values containing :, @, or = can be escaped with a + * leading "\". */ #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER) From 7a3f5fe9afbef3c55c1527f61fcfd0b9d4783c0d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 15 Sep 2011 14:11:11 -0700 Subject: [PATCH 16/20] rbd: call flush, if available librbd recently added async writeback and flush support. If the new rbd_flush() call is available, call it. Signed-off-by: Sage Weil Signed-off-by: Kevin Wolf --- block/rbd.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/block/rbd.c b/block/rbd.c index 23d8751d03..98efd2cf16 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -680,6 +680,17 @@ static BlockDriverAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs, return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); } +static int qemu_rbd_flush(BlockDriverState *bs) +{ +#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1) + /* rbd_flush added in 0.1.1 */ + BDRVRBDState *s = bs->opaque; + return rbd_flush(s->image); +#else + return 0; +#endif +} + static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi) { BDRVRBDState *s = bs->opaque; @@ -815,6 +826,7 @@ static BlockDriver bdrv_rbd = { .bdrv_file_open = qemu_rbd_open, .bdrv_close = qemu_rbd_close, .bdrv_create = qemu_rbd_create, + .bdrv_flush = qemu_rbd_flush, .bdrv_get_info = qemu_rbd_getinfo, .create_options = qemu_rbd_create_options, .bdrv_getlength = qemu_rbd_getlength, From bd5da23265ba3bb11a9f0913e016e3c3a16abf8e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 9 Sep 2011 16:47:26 +0200 Subject: [PATCH 17/20] scsi: fix sign extension problems When assigning a 32-bit value to cmd->xfer (which is 64-bits) it can be erroneously sign extended because the intermediate 32-bit computation is signed. Fix this by standardizing on the ld*_be_p functions. Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- hw/scsi-bus.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c index 02482947ca..aca65a16df 100644 --- a/hw/scsi-bus.c +++ b/hw/scsi-bus.c @@ -542,15 +542,15 @@ static int scsi_req_length(SCSICommand *cmd, SCSIDevice *dev, uint8_t *buf) break; case 1: case 2: - cmd->xfer = buf[8] | (buf[7] << 8); + cmd->xfer = lduw_be_p(&buf[7]); cmd->len = 10; break; case 4: - cmd->xfer = buf[13] | (buf[12] << 8) | (buf[11] << 16) | (buf[10] << 24); + cmd->xfer = ldl_be_p(&buf[10]); cmd->len = 16; break; case 5: - cmd->xfer = buf[9] | (buf[8] << 8) | (buf[7] << 16) | (buf[6] << 24); + cmd->xfer = ldl_be_p(&buf[6]); cmd->len = 12; break; default: @@ -710,23 +710,15 @@ static uint64_t scsi_cmd_lba(SCSICommand *cmd) switch (buf[0] >> 5) { case 0: - lba = (uint64_t) buf[3] | ((uint64_t) buf[2] << 8) | - (((uint64_t) buf[1] & 0x1f) << 16); + lba = ldl_be_p(&buf[0]) & 0x1fffff; break; case 1: case 2: - lba = (uint64_t) buf[5] | ((uint64_t) buf[4] << 8) | - ((uint64_t) buf[3] << 16) | ((uint64_t) buf[2] << 24); + case 5: + lba = ldl_be_p(&buf[2]); break; case 4: - lba = (uint64_t) buf[9] | ((uint64_t) buf[8] << 8) | - ((uint64_t) buf[7] << 16) | ((uint64_t) buf[6] << 24) | - ((uint64_t) buf[5] << 32) | ((uint64_t) buf[4] << 40) | - ((uint64_t) buf[3] << 48) | ((uint64_t) buf[2] << 56); - break; - case 5: - lba = (uint64_t) buf[5] | ((uint64_t) buf[4] << 8) | - ((uint64_t) buf[3] << 16) | ((uint64_t) buf[2] << 24); + lba = ldq_be_p(&buf[2]); break; default: lba = -1; From e1d3b254999bd628c7defdda860422baddb73781 Mon Sep 17 00:00:00 2001 From: Frediano Ziglio Date: Mon, 19 Sep 2011 16:37:13 +0200 Subject: [PATCH 18/20] block: avoid SIGUSR2 Now that iothread is always compiled sending a signal seems only an additional step. This patch also avoid writing to two pipe (one from signal and one in qemu_service_io). Work with kvm enabled or disabled. strace output is more readable (less syscalls). [ kwolf: Merged build fix by Paolo Bonzini ] Signed-off-by: Frediano Ziglio Reviewed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- cpus.c | 5 ----- posix-aio-compat.c | 33 +++++++++------------------------ 2 files changed, 9 insertions(+), 29 deletions(-) diff --git a/cpus.c b/cpus.c index 54c188cf5c..d0cfe91466 100644 --- a/cpus.c +++ b/cpus.c @@ -380,11 +380,6 @@ static int qemu_signal_init(void) int sigfd; sigset_t set; - /* SIGUSR2 used by posix-aio-compat.c */ - sigemptyset(&set); - sigaddset(&set, SIGUSR2); - pthread_sigmask(SIG_UNBLOCK, &set, NULL); - /* * SIG_IPI must be blocked in the main thread and must not be caught * by sigwait() in the signal thread. Otherwise, the cpu thread will diff --git a/posix-aio-compat.c b/posix-aio-compat.c index 63a8faed15..d3c1174ebf 100644 --- a/posix-aio-compat.c +++ b/posix-aio-compat.c @@ -42,7 +42,6 @@ struct qemu_paiocb { int aio_niov; size_t aio_nbytes; #define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ - int ev_signo; off_t aio_offset; QTAILQ_ENTRY(qemu_paiocb) node; @@ -308,12 +307,10 @@ static ssize_t handle_aiocb_rw(struct qemu_paiocb *aiocb) return nbytes; } +static void posix_aio_notify_event(void); + static void *aio_thread(void *unused) { - pid_t pid; - - pid = getpid(); - mutex_lock(&lock); pending_threads--; mutex_unlock(&lock); @@ -380,7 +377,7 @@ static void *aio_thread(void *unused) aiocb->ret = ret; mutex_unlock(&lock); - if (kill(pid, aiocb->ev_signo)) die("kill failed"); + posix_aio_notify_event(); } cur_threads--; @@ -547,18 +544,14 @@ static int posix_aio_flush(void *opaque) static PosixAioState *posix_aio_state; -static void aio_signal_handler(int signum) +static void posix_aio_notify_event(void) { - if (posix_aio_state) { - char byte = 0; - ssize_t ret; + char byte = 0; + ssize_t ret; - ret = write(posix_aio_state->wfd, &byte, sizeof(byte)); - if (ret < 0 && errno != EAGAIN) - die("write()"); - } - - qemu_service_io(); + ret = write(posix_aio_state->wfd, &byte, sizeof(byte)); + if (ret < 0 && errno != EAGAIN) + die("write()"); } static void paio_remove(struct qemu_paiocb *acb) @@ -622,7 +615,6 @@ BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, return NULL; acb->aio_type = type; acb->aio_fildes = fd; - acb->ev_signo = SIGUSR2; if (qiov) { acb->aio_iov = qiov->iov; @@ -650,7 +642,6 @@ BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd, return NULL; acb->aio_type = QEMU_AIO_IOCTL; acb->aio_fildes = fd; - acb->ev_signo = SIGUSR2; acb->aio_offset = 0; acb->aio_ioctl_buf = buf; acb->aio_ioctl_cmd = req; @@ -664,7 +655,6 @@ BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd, int paio_init(void) { - struct sigaction act; PosixAioState *s; int fds[2]; int ret; @@ -674,11 +664,6 @@ int paio_init(void) s = g_malloc(sizeof(PosixAioState)); - sigfillset(&act.sa_mask); - act.sa_flags = 0; /* do not restart syscalls to interrupt select() */ - act.sa_handler = aio_signal_handler; - sigaction(SIGUSR2, &act, NULL); - s->first_aio = NULL; if (qemu_pipe(fds) == -1) { fprintf(stderr, "failed to create pipe\n"); From cf26a4e6f82c5fa2efd8e7dc43d2496b14b0e595 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Sep 2011 17:05:12 +0200 Subject: [PATCH 19/20] linux-aio: remove process requests callback Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- linux-aio.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/linux-aio.c b/linux-aio.c index 5265a029b2..bffa6cd0e3 100644 --- a/linux-aio.c +++ b/linux-aio.c @@ -68,15 +68,6 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s, qemu_aio_release(laiocb); } -/* - * All requests are directly processed when they complete, so there's nothing - * left to do during qemu_aio_wait(). - */ -static int qemu_laio_process_requests(void *opaque) -{ - return 0; -} - static void qemu_laio_completion_cb(void *opaque) { struct qemu_laio_state *s = opaque; @@ -215,7 +206,7 @@ void *laio_init(void) goto out_close_efd; qemu_aio_set_fd_handler(s->efd, qemu_laio_completion_cb, NULL, - qemu_laio_flush_cb, qemu_laio_process_requests, s); + qemu_laio_flush_cb, NULL, s); return s; From 16a06b24306b5733a4ef2e585964838e47735a54 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 19 Sep 2011 13:35:26 -0700 Subject: [PATCH 20/20] rbd: allow escaping in config string The config string is variously delimited by =, @, and /, depending on the field. Allow these characters to be escaped by preceeding them with \. Signed-off-by: Sage Weil Signed-off-by: Kevin Wolf --- block/rbd.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index 98efd2cf16..3068c829fe 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -102,8 +102,15 @@ static int qemu_rbd_next_tok(char *dst, int dst_len, *p = NULL; if (delim != '\0') { - end = strchr(src, delim); - if (end) { + for (end = src; *end; ++end) { + if (*end == delim) { + break; + } + if (*end == '\\' && end[1] != '\0') { + end++; + } + } + if (*end == delim) { *p = end + 1; *end = '\0'; } @@ -122,6 +129,19 @@ static int qemu_rbd_next_tok(char *dst, int dst_len, return 0; } +static void qemu_rbd_unescape(char *src) +{ + char *p; + + for (p = src; *src; ++src, ++p) { + if (*src == '\\' && src[1] != '\0') { + src++; + } + *p = *src; + } + *p = '\0'; +} + static int qemu_rbd_parsename(const char *filename, char *pool, int pool_len, char *snap, int snap_len, @@ -146,6 +166,7 @@ static int qemu_rbd_parsename(const char *filename, ret = -EINVAL; goto done; } + qemu_rbd_unescape(pool); if (strchr(p, '@')) { ret = qemu_rbd_next_tok(name, name_len, p, '@', "object name", &p); @@ -153,9 +174,11 @@ static int qemu_rbd_parsename(const char *filename, goto done; } ret = qemu_rbd_next_tok(snap, snap_len, p, ':', "snap name", &p); + qemu_rbd_unescape(snap); } else { ret = qemu_rbd_next_tok(name, name_len, p, ':', "object name", &p); } + qemu_rbd_unescape(name); if (ret < 0 || !p) { goto done; } @@ -211,6 +234,7 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf) if (ret < 0) { break; } + qemu_rbd_unescape(name); if (!p) { error_report("conf option %s has no value", name); @@ -223,6 +247,7 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf) if (ret < 0) { break; } + qemu_rbd_unescape(value); if (strcmp(name, "conf") == 0) { ret = rados_conf_read_file(cluster, value);