diff --git a/Makefile b/Makefile
index b0a84f36e4..bdac9b331c 100644
--- a/Makefile
+++ b/Makefile
@@ -56,6 +56,7 @@ recurse-all: $(SUBDIR_RULES) $(ROMSUBDIR_RULES)
 block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o
 block-obj-y += nbd.o block.o aio.o aes.o osdep.o
 block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
+block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 block-nested-y += cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
diff --git a/block.c b/block.c
index 82ffea8753..033957daf6 100644
--- a/block.c
+++ b/block.c
@@ -411,7 +411,8 @@ int bdrv_open2(BlockDriverState *bs, const char *filename, int flags,
     /* Note: for compatibility, we open disk image files as RDWR, and
        RDONLY as fallback */
     if (!(flags & BDRV_O_FILE))
-        open_flags = BDRV_O_RDWR | (flags & BDRV_O_CACHE_MASK);
+        open_flags = BDRV_O_RDWR |
+                (flags & (BDRV_O_CACHE_MASK|BDRV_O_NATIVE_AIO));
     else
         open_flags = flags & ~(BDRV_O_FILE | BDRV_O_SNAPSHOT);
     ret = drv->bdrv_open(bs, filename, open_flags);
diff --git a/block.h b/block.h
index ccd4c1e700..28bf357091 100644
--- a/block.h
+++ b/block.h
@@ -37,6 +37,7 @@ typedef struct QEMUSnapshotInfo {
                                      bdrv_file_open()) */
 #define BDRV_O_NOCACHE     0x0020 /* do not use the host page cache */
 #define BDRV_O_CACHE_WB    0x0040 /* use write-back caching */
+#define BDRV_O_NATIVE_AIO  0x0080 /* use native AIO instead of the thread pool */
 
 #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_CACHE_WB)
 
diff --git a/block/raw-posix-aio.h b/block/raw-posix-aio.h
index 6761cd39f3..244bc8b798 100644
--- a/block/raw-posix-aio.h
+++ b/block/raw-posix-aio.h
@@ -33,4 +33,10 @@ BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
         unsigned long int req, void *buf,
         BlockDriverCompletionFunc *cb, void *opaque);
 
+/* linux-aio.c - Linux native implementation */
+void *laio_init(void);
+BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type);
+
 #endif /* QEMU_RAW_POSIX_AIO_H */
diff --git a/block/raw-posix.c b/block/raw-posix.c
index ca9bc616a7..8a7dc1570c 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -115,6 +115,7 @@ typedef struct BDRVRawState {
     int fd_got_error;
     int fd_media_changed;
 #endif
+    int use_aio;
     uint8_t* aligned_buf;
 } BDRVRawState;
 
@@ -159,6 +160,7 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
     }
     s->fd = fd;
     s->aligned_buf = NULL;
+
     if ((bdrv_flags & BDRV_O_NOCACHE)) {
         s->aligned_buf = qemu_blockalign(bs, ALIGNED_BUFFER_SIZE);
         if (s->aligned_buf == NULL) {
@@ -166,9 +168,22 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
         }
     }
 
-    s->aio_ctx = paio_init();
-    if (!s->aio_ctx) {
-        goto out_free_buf;
+#ifdef CONFIG_LINUX_AIO
+    if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
+                      (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
+        s->aio_ctx = laio_init();
+        if (!s->aio_ctx) {
+            goto out_free_buf;
+        }
+        s->use_aio = 1;
+    } else
+#endif
+    {
+        s->aio_ctx = paio_init();
+        if (!s->aio_ctx) {
+            goto out_free_buf;
+        }
+        s->use_aio = 0;
     }
 
     return 0;
@@ -524,8 +539,16 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
      * boundary. Check if this is the case or telll the low-level
      * driver that it needs to copy the buffer.
      */
-    if (s->aligned_buf && !qiov_is_aligned(qiov)) {
-        type |= QEMU_AIO_MISALIGNED;
+    if (s->aligned_buf) {
+        if (!qiov_is_aligned(qiov)) {
+            type |= QEMU_AIO_MISALIGNED;
+#ifdef CONFIG_LINUX_AIO
+        } else if (s->use_aio) {
+            /* Aligned request on a native-AIO device: bypass the thread pool. */
+            return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov,
+                               nb_sectors, cb, opaque, type);
+#endif
+        }
     }
 
     return paio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov, nb_sectors,
diff --git a/configure b/configure
index b993b656af..81272fa4ca 100755
--- a/configure
+++ b/configure
@@ -191,6 +191,7 @@ vde=""
 vnc_tls=""
 vnc_sasl=""
 xen=""
+linux_aio=""
 
 gprof="no"
 debug_tcg="no"
@@ -523,6 +524,10 @@ for opt do
   ;;
   --enable-mixemu) mixemu="yes"
   ;;
+  --disable-linux-aio) linux_aio="no"
+  ;;
+  --enable-linux-aio) linux_aio="yes"
+  ;;
   --enable-io-thread) io_thread="yes"
   ;;
   --disable-blobs) blobs="no"
@@ -674,6 +679,8 @@ echo "  --enable-uname-release=R Return R for uname -r in usermode emulation"
 echo "  --sparc_cpu=V            Build qemu for Sparc architecture v7, v8, v8plus, v8plusa, v9"
 echo "  --disable-vde            disable support for vde network"
 echo "  --enable-vde             enable support for vde network"
+echo "  --disable-linux-aio      disable Linux AIO support"
+echo "  --enable-linux-aio       enable Linux AIO support"
 echo "  --enable-io-thread       enable IO thread"
 echo "  --disable-blobs          disable installing provided firmware blobs"
 echo "  --kerneldir=PATH         look for kernel includes in PATH"
@@ -1303,6 +1310,26 @@ if test "$pthread" = no; then
   exit 1
 fi
 
+##########################################
+# linux-aio probe
+AIOLIBS=""
+
+if test "$linux_aio" != "no" ; then
+  cat > $TMPC <<EOF
+#include <libaio.h>
+#include <sys/eventfd.h>
+int main(void) { io_setup(0, NULL); io_set_eventfd(NULL, 0); eventfd(0, 0); return 0; }
+EOF
+  if compile_prog "" "-laio" ; then
+    linux_aio=yes
+    LIBS="$LIBS -laio"
+  else
+    if test "$linux_aio" = "yes" ; then
+      feature_not_found "linux AIO"
+    fi
+  fi
+fi
+
 ##########################################
 # iovec probe
 cat > $TMPC <> $config_host_mak
 fi
+if test "$linux_aio" = "yes" ; then
+  echo "CONFIG_LINUX_AIO=y" >> $config_host_mak
+fi
if test "$blobs" = "yes" ; then echo "INSTALL_BLOBS=yes" >> $config_host_mak fi
diff --git a/linux-aio.c b/linux-aio.c
new file mode 100644
index 0000000000..f53a08cb0c
--- /dev/null
+++ b/linux-aio.c
@@ -0,0 +1,234 @@
+/*
+ * Linux native AIO support.
+ *
+ * Copyright (C) 2009 IBM, Corp.
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu-common.h"
+#include "qemu-aio.h"
+#include "block_int.h"
+#include "block/raw-posix-aio.h"
+
+#include <sys/eventfd.h>
+#include <libaio.h>
+
+/*
+ * Queue size (per-device).
+ *
+ * XXX: eventually we need to communicate this to the guest and/or make it
+ *      tunable by the guest.  If we get more outstanding requests at a time
+ *      than this we will get EAGAIN from io_submit which is communicated to
+ *      the guest as an I/O error.
+ */
+#define MAX_EVENTS 128
+
+/* One in-flight request; the embedded iocb maps completions back to it. */
+struct qemu_laiocb {
+    BlockDriverAIOCB common;
+    struct qemu_laio_state *ctx;
+    struct iocb iocb;
+    ssize_t ret;        /* -EINPROGRESS until completed or cancelled */
+    size_t nbytes;      /* bytes the request is expected to transfer */
+};
+
+/* Per-device state: the AIO context and the eventfd signalling completions. */
+struct qemu_laio_state {
+    io_context_t ctx;
+    int efd;
+    int count;          /* requests submitted but not yet completed */
+};
+
+/* Combine the two result words of an io_event into one signed result. */
+static inline ssize_t io_event_ret(struct io_event *ev)
+{
+    return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
+}
+
+/*
+ * eventfd read handler: drain the eventfd counter and complete every request
+ * the kernel reports as finished.  laio_cancel() also calls this directly to
+ * poll for completions.
+ */
+static void qemu_laio_completion_cb(void *opaque)
+{
+    struct qemu_laio_state *s = opaque;
+
+    while (1) {
+        struct io_event events[MAX_EVENTS];
+        uint64_t val;
+        ssize_t ret;
+        struct timespec ts = { 0 };
+        int nevents, i;
+
+        /* An interrupted read() returns -1 with errno == EINTR; retry. */
+        do {
+            ret = read(s->efd, &val, sizeof(val));
+        } while (ret == -1 && errno == EINTR);
+
+        /* EAGAIN (nothing pending) or any other failed/short read: done. */
+        if (ret != (ssize_t)sizeof(val))
+            break;
+
+        do {
+            nevents = io_getevents(s->ctx, val, MAX_EVENTS, events, &ts);
+        } while (nevents == -EINTR);
+
+        for (i = 0; i < nevents; i++) {
+            struct iocb *iocb = events[i].obj;
+            struct qemu_laiocb *laiocb =
+                    container_of(iocb, struct qemu_laiocb, iocb);
+
+            s->count--;
+
+            ret = laiocb->ret = io_event_ret(&events[i]);
+            if (ret != -ECANCELED) {
+                /* Only a full-length transfer is reported as success. */
+                if (ret == laiocb->nbytes)
+                    ret = 0;
+                else if (ret >= 0)
+                    ret = -EINVAL;
+
+                laiocb->common.cb(laiocb->common.opaque, ret);
+            }
+
+            qemu_aio_release(laiocb);
+        }
+    }
+}
+
+/* Flush callback: returns 1 while requests are still outstanding, else 0. */
+static int qemu_laio_flush_cb(void *opaque)
+{
+    struct qemu_laio_state *s = opaque;
+
+    return (s->count > 0) ? 1 : 0;
+}
+
+/*
+ * AIOPool cancel hook: try io_cancel() first, otherwise poll until the
+ * request has completed.
+ */
+static void laio_cancel(BlockDriverAIOCB *blockacb)
+{
+    struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb;
+    struct io_event event;
+    int ret;
+
+    if (laiocb->ret != -EINPROGRESS)
+        return;
+
+    /*
+     * Note that as of Linux 2.6.31 neither the block device code nor any
+     * filesystem implements cancellation of AIO request.
+     * Thus the polling loop below is the normal code path.
+     */
+    ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event);
+    if (ret == 0) {
+        laiocb->ret = -ECANCELED;
+        return;
+    }
+
+    /*
+     * We have to wait for the iocb to finish.
+     *
+     * The only way to get the iocb status update is by polling the io context.
+     * We might be able to do this slightly more optimally by removing the
+     * O_NONBLOCK flag.
+     */
+    while (laiocb->ret == -EINPROGRESS)
+        qemu_laio_completion_cb(laiocb->ctx);
+}
+
+static AIOPool laio_pool = {
+    .aiocb_size         = sizeof(struct qemu_laiocb),
+    .cancel             = laio_cancel,
+};
+
+/*
+ * Queue one vectored read or write on fd, starting at sector_num (512-byte
+ * sectors).
+ *
+ * type must be QEMU_AIO_READ or QEMU_AIO_WRITE.  Returns the new AIOCB, or
+ * NULL if allocation fails, the type is invalid or io_submit() fails.
+ */
+BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+    struct qemu_laio_state *s = aio_ctx;
+    struct qemu_laiocb *laiocb;
+    struct iocb *iocbs;
+    off_t offset = sector_num * 512;
+
+    laiocb = qemu_aio_get(&laio_pool, bs, cb, opaque);
+    if (!laiocb)
+        return NULL;
+    laiocb->nbytes = nb_sectors * 512;
+    laiocb->ctx = s;
+    laiocb->ret = -EINPROGRESS;
+
+    iocbs = &laiocb->iocb;
+
+    switch (type) {
+    case QEMU_AIO_WRITE:
+        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
+        break;
+    case QEMU_AIO_READ:
+        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
+        break;
+    default:
+        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
+                        __func__, type);
+        goto out_free_aiocb;
+    }
+    io_set_eventfd(&laiocb->iocb, s->efd);
+    s->count++;
+
+    if (io_submit(s->ctx, 1, &iocbs) < 0)
+        goto out_dec_count;
+    return &laiocb->common;
+
+    /*
+     * Unwind in reverse order of setup: only drop the count if it was bumped,
+     * and release the AIOCB on every failure path.
+     */
+out_dec_count:
+    s->count--;
+out_free_aiocb:
+    qemu_aio_release(laiocb);
+    return NULL;
+}
+
+/*
+ * Allocate the per-device native AIO state and register its eventfd handlers.
+ * Returns the state pointer, or NULL if any setup step fails.
+ */
+void *laio_init(void)
+{
+    struct qemu_laio_state *s;
+
+    s = qemu_mallocz(sizeof(*s));
+    s->efd = eventfd(0, 0);
+    if (s->efd == -1)
+        goto out_free_state;
+    /* The completion handler reads until EAGAIN, so efd must not block. */
+    if (fcntl(s->efd, F_SETFL, O_NONBLOCK) == -1)
+        goto out_close_efd;
+
+    if (io_setup(MAX_EVENTS, &s->ctx) != 0)
+        goto out_close_efd;
+
+    qemu_aio_set_fd_handler(s->efd, qemu_laio_completion_cb,
+        NULL, qemu_laio_flush_cb, s);
+
+    return s;
+
+out_close_efd:
+    close(s->efd);
+out_free_state:
+    qemu_free(s);
+    return NULL;
+}
diff --git a/qemu-config.c b/qemu-config.c
index 3dd473a710..4808db0531 100644
--- a/qemu-config.c
+++ b/qemu-config.c
@@ -52,6 +52,10 @@ QemuOptsList qemu_drive_opts = {
             .name = "cache",
             .type = QEMU_OPT_STRING,
             .help = "host cache usage (none, writeback, writethrough)",
+        },{
+            .name = "aio",
+            .type = QEMU_OPT_STRING,
+            .help = "host AIO implementation (threads, native)",
         },{
             .name = "format",
             .type = QEMU_OPT_STRING,
diff --git a/qemu-io.c b/qemu-io.c
index a68f195c11..f96a4de6b4 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -1401,6 +1401,7 @@ static void usage(const char *name)
 " -n, --nocache        disable host cache\n"
 " -g, --growable       allow file to grow (only applies to protocols)\n"
 " -m, --misalign       misalign allocations for O_DIRECT\n"
+" -k, --native-aio     use kernel AIO implementation (on Linux only)\n"
 " -h, --help           display this help and exit\n"
 " -V, --version        output version information and exit\n"
 "\n",
@@ -1412,7 +1413,7 @@ int main(int argc, char **argv)
 {
         int readonly = 0;
         int growable = 0;
-        const char *sopt = "hVc:Crsnmg";
+        const char *sopt = "hVc:Crsnmgk";
         struct option lopt[] = {
                 { "help", 0, NULL, 'h' },
                 { "version", 0, NULL, 'V' },
@@ -1424,6 +1425,7 @@ int main(int argc, char **argv)
                 { "nocache", 0, NULL, 'n' },
                 { "misalign", 0, NULL, 'm' },
                 { "growable", 0, NULL, 'g' },
+                { "native-aio", 0, NULL, 'k' },
                 { NULL, 0, NULL, 0 }
         };
         int c;
@@ -1455,6 +1457,9 @@ int main(int argc, char **argv)
                 case 'g':
                         growable = 1;
                         break;
+                case 'k':
+                        flags |= BDRV_O_NATIVE_AIO;
+                        break;
                 case 'V':
                         printf("%s version %s\n", progname, VERSION);
                         exit(0);
diff --git a/qemu-options.hx b/qemu-options.hx
index e3bd314509..0c2b310562 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -95,7 +95,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
     "-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n"
     "       [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
     "       [,cache=writethrough|writeback|none][,format=f][,serial=s]\n"
-    "       [,addr=A][,id=name]\n"
+    "       [,addr=A][,id=name][,aio=threads|native]\n"
     "                use 'file' as a drive image\n")
 DEF("set", HAS_ARG, QEMU_OPTION_set,
     "-set group.id.arg=value\n"
@@ -128,6 +128,9 @@ These options have the same definition as they have in @option{-hdachs}.
 @var{snapshot} is "on" or "off" and allows to enable snapshot for given drive (see @option{-snapshot}).
 @item cache=@var{cache}
 @var{cache} is "none", "writeback", or "writethrough" and controls how the host cache is used to access block data.
+@item aio=@var{aio}
+@var{aio} is "threads" or "native" and selects between pthread based disk I/O and native Linux AIO.
+Native AIO is only used together with cache=none; otherwise the thread pool is used.
 @item format=@var{format}
 Specify which disk @var{format} will be used rather than detecting
 the format. Can be used to specifiy format=raw to avoid interpreting
diff --git a/vl.c b/vl.c
index 9b2bf00a72..1085794190 100644
--- a/vl.c
+++ b/vl.c
@@ -1916,6 +1916,7 @@ DriveInfo *drive_init(QemuOpts *opts, void *opaque,
     int max_devs;
     int index;
     int cache;
+    int aio = 0;
     int bdrv_flags, onerror;
     const char *devaddr;
     DriveInfo *dinfo;
@@ -2049,6 +2050,22 @@ DriveInfo *drive_init(QemuOpts *opts, void *opaque,
         }
     }
 
+    /* Always validate aio= so builds without Linux AIO do not ignore it. */
+    if ((buf = qemu_opt_get(opts, "aio")) != NULL) {
+        if (!strcmp(buf, "threads")) {
+            aio = 0;
+        } else if (!strcmp(buf, "native")) {
+#ifdef CONFIG_LINUX_AIO
+            aio = 1;
+#else
+            fprintf(stderr, "qemu: aio=native is not supported by this build\n");
+            return NULL;
+#endif
+        } else {
+            fprintf(stderr, "qemu: invalid aio option\n");
+            return NULL;
+        }
+    }
     if ((buf = qemu_opt_get(opts, "format")) != NULL) {
        if (strcmp(buf, "?") == 0) {
             fprintf(stderr, "qemu: Supported formats:");
@@ -2218,11 +2235,19 @@ DriveInfo *drive_init(QemuOpts *opts, void *opaque,
         bdrv_flags |= BDRV_O_NOCACHE;
     else if (cache == 2) /* write-back */
         bdrv_flags |= BDRV_O_CACHE_WB;
+
+    if (aio == 1) {
+        bdrv_flags |= BDRV_O_NATIVE_AIO;
+    } else {
+        bdrv_flags &= ~BDRV_O_NATIVE_AIO;
+    }
+
     if (bdrv_open2(dinfo->bdrv, file, bdrv_flags, drv) < 0) {
         fprintf(stderr, "qemu: could not open disk image %s\n",
                         file);
         return NULL;
     }
+
     if (bdrv_key_required(dinfo->bdrv))
         autostart = 0;
     *fatal_error = 0;