From 975a0015ee380f49a3be744279a6a06ab97e960a Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 22 Aug 2013 11:29:02 +0200 Subject: [PATCH 01/11] libcacard: link against qemu-error.o for error_report() Signed-off-by: Stefan Hajnoczi --- libcacard/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libcacard/Makefile b/libcacard/Makefile index 47827a0eb8..4d15da49b8 100644 --- a/libcacard/Makefile +++ b/libcacard/Makefile @@ -4,7 +4,8 @@ TOOLS += vscclient$(EXESUF) # objects linked into a shared library, built with libtool with -fPIC if required libcacard-obj-y = $(stub-obj-y) $(libcacard-y) -libcacard-obj-y += util/osdep.o util/cutils.o util/qemu-timer-common.o util/error.o +libcacard-obj-y += util/osdep.o util/cutils.o util/qemu-timer-common.o +libcacard-obj-y += util/error.o util/qemu-error.o libcacard-obj-$(CONFIG_WIN32) += util/oslib-win32.o util/qemu-thread-win32.o libcacard-obj-$(CONFIG_POSIX) += util/oslib-posix.o util/qemu-thread-posix.o libcacard-obj-y += $(filter trace/%, $(util-obj-y)) From a5813077aac7865f69b7ee46a594f3705429f7cd Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 22 Aug 2013 11:29:03 +0200 Subject: [PATCH 02/11] osdep: warn if open(O_DIRECT) on fails with EINVAL Print a warning when opening a file O_DIRECT fails with EINVAL. This saves users a lot of time trying to figure out the EINVAL error, which is typical when attempting to open a file O_DIRECT on Linux tmpfs. Reported-by: Deepak C Shetty Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake --- util/osdep.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/util/osdep.c b/util/osdep.c index 685c8ae889..62072b4be3 100644 --- a/util/osdep.c +++ b/util/osdep.c @@ -207,6 +207,13 @@ int qemu_open(const char *name, int flags, ...) } #endif +#ifdef O_DIRECT + if (ret == -1 && errno == EINVAL && (flags & O_DIRECT)) { + error_report("file system may not support O_DIRECT"); + errno = EINVAL; /* in case it was clobbered */ + } +#endif /* O_DIRECT */ + return ret; } From da718ceb1730bfe6fea0178df979639b14a0646e Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 12 Sep 2013 11:02:18 +0200 Subject: [PATCH 03/11] qemu-timer: drop outdated signal safety comments host_alarm_handler() is invoked from the signal processing thread (currently the iothread). Previously we did processing in a real signal handler with signalfd and therefore needed signal-safe timer code. Today host_alarm_handler() just marks the alarm timer as expired/pending and notifies the main loop using qemu_notify_event(). Therefore these outdated comments about signal safety can be dropped. Signed-off-by: Stefan Hajnoczi --- qemu-timer.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/qemu-timer.c b/qemu-timer.c index 95ff47fef3..ed3fcb2190 100644 --- a/qemu-timer.c +++ b/qemu-timer.c @@ -301,8 +301,6 @@ void timer_del(QEMUTimer *ts) { QEMUTimer **pt, *t; - /* NOTE: this code must be signal safe because - timer_expired() can be called from a signal. */ pt = &ts->timer_list->active_timers; for(;;) { t = *pt; @@ -325,8 +323,6 @@ void timer_mod_ns(QEMUTimer *ts, int64_t expire_time) timer_del(ts); /* add the timer in the sorted list */ - /* NOTE: this code must be signal safe because - timer_expired() can be called from a signal. */ pt = &ts->timer_list->active_timers; for(;;) { t = *pt; From 978f2205c791de0e02c8802a645bea657408abfd Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 12 Sep 2013 11:02:19 +0200 Subject: [PATCH 04/11] qemu-timer: make qemu_timer_mod_ns() and qemu_timer_del() thread-safe Introduce QEMUTimerList->active_timers_lock to protect the linked list of active timers. This allows qemu_timer_mod_ns() to be called from any thread. Note that vm_clock is not thread-safe and its use of qemu_clock_has_timers() works fine today but is also not thread-safe. The purpose of this patch is to eventually let device models set or cancel timers from a vcpu thread without holding the global mutex. Signed-off-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi --- include/qemu/timer.h | 17 +++++++++ qemu-timer.c | 87 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 85 insertions(+), 19 deletions(-) diff --git a/include/qemu/timer.h b/include/qemu/timer.h index e4934dd61b..b58903bef5 100644 --- a/include/qemu/timer.h +++ b/include/qemu/timer.h @@ -115,6 +115,10 @@ static inline int64_t qemu_clock_get_us(QEMUClockType type) * Determines whether a clock's default timer list * has timers attached * + * Note that this function should not be used when other threads also access + * the timer list. The return value may be outdated by the time it is acted + * upon. + * * Returns: true if the clock's default timer list * has timers attached */ @@ -271,6 +275,10 @@ void timerlist_free(QEMUTimerList *timer_list); * * Determine whether a timer list has active timers * + * Note that this function should not be used when other threads also access + * the timer list. The return value may be outdated by the time it is acted + * upon. + * * Returns: true if the timer list has timers. */ bool timerlist_has_timers(QEMUTimerList *timer_list); @@ -512,6 +520,9 @@ void timer_free(QEMUTimer *ts); * @ts: the timer * * Delete a timer from the active list. + * + * This function is thread-safe but the timer and its timer list must not be + * freed while this function is running. */ void timer_del(QEMUTimer *ts); @@ -521,6 +532,9 @@ void timer_del(QEMUTimer *ts); * @expire_time: the expiry time in nanoseconds * * Modify a timer to expire at @expire_time + * + * This function is thread-safe but the timer and its timer list must not be + * freed while this function is running. */ void timer_mod_ns(QEMUTimer *ts, int64_t expire_time); @@ -531,6 +545,9 @@ void timer_mod_ns(QEMUTimer *ts, int64_t expire_time); * * Modify a timer to expiry at @expire_time, taking into * account the scale associated with the timer. + * + * This function is thread-safe but the timer and its timer list must not be + * freed while this function is running. */ void timer_mod(QEMUTimer *ts, int64_t expire_timer); diff --git a/qemu-timer.c b/qemu-timer.c index ed3fcb2190..e5047479f2 100644 --- a/qemu-timer.c +++ b/qemu-timer.c @@ -66,6 +66,7 @@ QEMUClock qemu_clocks[QEMU_CLOCK_MAX]; struct QEMUTimerList { QEMUClock *clock; + QemuMutex active_timers_lock; QEMUTimer *active_timers; QLIST_ENTRY(QEMUTimerList) list; QEMUTimerListNotifyCB *notify_cb; @@ -101,6 +102,7 @@ QEMUTimerList *timerlist_new(QEMUClockType type, timer_list->clock = clock; timer_list->notify_cb = cb; timer_list->notify_opaque = opaque; + qemu_mutex_init(&timer_list->active_timers_lock); QLIST_INSERT_HEAD(&clock->timerlists, timer_list, list); return timer_list; } @@ -111,6 +113,7 @@ void timerlist_free(QEMUTimerList *timer_list) if (timer_list->clock) { QLIST_REMOVE(timer_list, list); } + qemu_mutex_destroy(&timer_list->active_timers_lock); g_free(timer_list); } @@ -163,9 +166,17 @@ bool qemu_clock_has_timers(QEMUClockType type) bool timerlist_expired(QEMUTimerList *timer_list) { - return (timer_list->active_timers && - timer_list->active_timers->expire_time < - qemu_clock_get_ns(timer_list->clock->type)); + int64_t expire_time; + + qemu_mutex_lock(&timer_list->active_timers_lock); + if (!timer_list->active_timers) { + qemu_mutex_unlock(&timer_list->active_timers_lock); + return false; + } + expire_time = timer_list->active_timers->expire_time; + qemu_mutex_unlock(&timer_list->active_timers_lock); + + return expire_time < qemu_clock_get_ns(timer_list->clock->type); } bool qemu_clock_expired(QEMUClockType type) @@ -182,13 +193,25 @@ bool qemu_clock_expired(QEMUClockType type) int64_t timerlist_deadline_ns(QEMUTimerList *timer_list) { int64_t delta; + int64_t expire_time; - if (!timer_list->clock->enabled || !timer_list->active_timers) { + if (!timer_list->clock->enabled) { return -1; } - delta = timer_list->active_timers->expire_time - - qemu_clock_get_ns(timer_list->clock->type); + /* The active timers list may be modified before the caller uses our return + * value but ->notify_cb() is called when the deadline changes. Therefore + * the caller should notice the change and there is no race condition. + */ + qemu_mutex_lock(&timer_list->active_timers_lock); + if (!timer_list->active_timers) { + qemu_mutex_unlock(&timer_list->active_timers_lock); + return -1; + } + expire_time = timer_list->active_timers->expire_time; + qemu_mutex_unlock(&timer_list->active_timers_lock); + + delta = expire_time - qemu_clock_get_ns(timer_list->clock->type); if (delta <= 0) { return 0; @@ -296,12 +319,11 @@ void timer_free(QEMUTimer *ts) g_free(ts); } -/* stop a timer, but do not dealloc it */ -void timer_del(QEMUTimer *ts) +static void timer_del_locked(QEMUTimerList *timer_list, QEMUTimer *ts) { QEMUTimer **pt, *t; - pt = &ts->timer_list->active_timers; + pt = &timer_list->active_timers; for(;;) { t = *pt; if (!t) @@ -314,16 +336,28 @@ void timer_del(QEMUTimer *ts) } } +/* stop a timer, but do not dealloc it */ +void timer_del(QEMUTimer *ts) +{ + QEMUTimerList *timer_list = ts->timer_list; + + qemu_mutex_lock(&timer_list->active_timers_lock); + timer_del_locked(timer_list, ts); + qemu_mutex_unlock(&timer_list->active_timers_lock); +} + /* modify the current timer so that it will be fired when current_time >= expire_time. The corresponding callback will be called. */ void timer_mod_ns(QEMUTimer *ts, int64_t expire_time) { + QEMUTimerList *timer_list = ts->timer_list; QEMUTimer **pt, *t; - timer_del(ts); + qemu_mutex_lock(&timer_list->active_timers_lock); + timer_del_locked(timer_list, ts); /* add the timer in the sorted list */ - pt = &ts->timer_list->active_timers; + pt = &timer_list->active_timers; for(;;) { t = *pt; if (!timer_expired_ns(t, expire_time)) { @@ -334,12 +368,13 @@ void timer_mod_ns(QEMUTimer *ts, int64_t expire_time) ts->expire_time = expire_time; ts->next = *pt; *pt = ts; + qemu_mutex_unlock(&timer_list->active_timers_lock); /* Rearm if necessary */ - if (pt == &ts->timer_list->active_timers) { + if (pt == &timer_list->active_timers) { /* Interrupt execution to force deadline recalculation. */ - qemu_clock_warp(ts->timer_list->clock->type); - timerlist_notify(ts->timer_list); + qemu_clock_warp(timer_list->clock->type); + timerlist_notify(timer_list); } } @@ -350,13 +385,19 @@ void timer_mod(QEMUTimer *ts, int64_t expire_time) bool timer_pending(QEMUTimer *ts) { + QEMUTimerList *timer_list = ts->timer_list; QEMUTimer *t; - for (t = ts->timer_list->active_timers; t != NULL; t = t->next) { + bool found = false; + + qemu_mutex_lock(&timer_list->active_timers_lock); + for (t = timer_list->active_timers; t != NULL; t = t->next) { if (t == ts) { - return true; + found = true; + break; } } - return false; + qemu_mutex_unlock(&timer_list->active_timers_lock); + return found; } bool timer_expired(QEMUTimer *timer_head, int64_t current_time) @@ -369,23 +410,31 @@ bool timerlist_run_timers(QEMUTimerList *timer_list) QEMUTimer *ts; int64_t current_time; bool progress = false; - + QEMUTimerCB *cb; + void *opaque; + if (!timer_list->clock->enabled) { return progress; } current_time = qemu_clock_get_ns(timer_list->clock->type); for(;;) { + qemu_mutex_lock(&timer_list->active_timers_lock); ts = timer_list->active_timers; if (!timer_expired_ns(ts, current_time)) { + qemu_mutex_unlock(&timer_list->active_timers_lock); break; } + /* remove timer from the list before calling the callback */ timer_list->active_timers = ts->next; ts->next = NULL; + cb = ts->cb; + opaque = ts->opaque; + qemu_mutex_unlock(&timer_list->active_timers_lock); /* run the callback (the timer list can be modified) */ - ts->cb(ts->opaque); + cb(opaque); progress = true; } return progress; From 3db1ee7c2af2fbbfe259712e3ce74158bc667ad3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 12 Sep 2013 11:02:20 +0200 Subject: [PATCH 05/11] qemu-timer: do not take the lock in timer_pending We can deduce the result from expire_time, by making it always -1 if the timer is not in the active_timers list. We need to check against negative times passed to timer_mod_ns; clamping them to zero is not a problem because the only clock that has a zero value at VM startup is QEMU_CLOCK_VIRTUAL, and it is monotonic so it cannot be non-zero. QEMU_CLOCK_HOST, instead, is not monotonic but it cannot go to negative values unless the host time is seriously screwed up and points to the 1960s. Signed-off-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi --- qemu-timer.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/qemu-timer.c b/qemu-timer.c index e5047479f2..6b62e88669 100644 --- a/qemu-timer.c +++ b/qemu-timer.c @@ -312,6 +312,7 @@ void timer_init(QEMUTimer *ts, ts->cb = cb; ts->opaque = opaque; ts->scale = scale; + ts->expire_time = -1; } void timer_free(QEMUTimer *ts) @@ -323,6 +324,7 @@ static void timer_del_locked(QEMUTimerList *timer_list, QEMUTimer *ts) { QEMUTimer **pt, *t; + ts->expire_time = -1; pt = &timer_list->active_timers; for(;;) { t = *pt; @@ -365,7 +367,7 @@ void timer_mod_ns(QEMUTimer *ts, int64_t expire_time) } pt = &t->next; } - ts->expire_time = expire_time; + ts->expire_time = MAX(expire_time, 0); ts->next = *pt; *pt = ts; qemu_mutex_unlock(&timer_list->active_timers_lock); @@ -385,19 +387,7 @@ void timer_mod(QEMUTimer *ts, int64_t expire_time) bool timer_pending(QEMUTimer *ts) { - QEMUTimerList *timer_list = ts->timer_list; - QEMUTimer *t; - bool found = false; - - qemu_mutex_lock(&timer_list->active_timers_lock); - for (t = timer_list->active_timers; t != NULL; t = t->next) { - if (t == ts) { - found = true; - break; - } - } - qemu_mutex_unlock(&timer_list->active_timers_lock); - return found; + return ts->expire_time >= 0; } bool timer_expired(QEMUTimer *timer_head, int64_t current_time) @@ -429,6 +419,7 @@ bool timerlist_run_timers(QEMUTimerList *timer_list) /* remove timer from the list before calling the callback */ timer_list->active_timers = ts->next; ts->next = NULL; + ts->expire_time = -1; cb = ts->cb; opaque = ts->opaque; qemu_mutex_unlock(&timer_list->active_timers_lock); From 2fcd15eac3223b3907837e8d7f2b3829a16a4c45 Mon Sep 17 00:00:00 2001 From: Gabriel Kerneis Date: Tue, 17 Sep 2013 17:09:39 +0200 Subject: [PATCH 06/11] coroutine: add qemu_coroutine_yield benchmark Current coroutine performance benchmarks test only coroutine creation, either directly or in a nested way. This patch adds a benchmark to evaluate the performance of qemu_coroutine_yield. Signed-off-by: Gabriel Kerneis Signed-off-by: Stefan Hajnoczi --- tests/test-coroutine.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/test-coroutine.c b/tests/test-coroutine.c index 39be046ec7..2792191f82 100644 --- a/tests/test-coroutine.c +++ b/tests/test-coroutine.c @@ -202,6 +202,38 @@ static void perf_nesting(void) maxcycles, maxnesting, duration); } +/* + * Yield benchmark + */ + +static void coroutine_fn yield_loop(void *opaque) +{ + unsigned int *counter = opaque; + + while ((*counter) > 0) { + (*counter)--; + qemu_coroutine_yield(); + } +} + +static void perf_yield(void) +{ + unsigned int i, maxcycles; + double duration; + + maxcycles = 100000000; + i = maxcycles; + Coroutine *coroutine = qemu_coroutine_create(yield_loop); + + g_test_timer_start(); + while (i > 0) { + qemu_coroutine_enter(coroutine, &i); + } + duration = g_test_timer_elapsed(); + + g_test_message("Yield %u iterations: %f s\n", + maxcycles, duration); +} int main(int argc, char **argv) { @@ -214,6 +246,7 @@ int main(int argc, char **argv) if (g_test_perf()) { g_test_add_func("/perf/lifecycle", perf_lifecycle); g_test_add_func("/perf/nesting", perf_nesting); + g_test_add_func("/perf/yield", perf_yield); } return g_test_run(); } From a9031675b9f757eef0fe8c99284ec0133c032c32 Mon Sep 17 00:00:00 2001 From: Gabriel Kerneis Date: Tue, 17 Sep 2013 18:26:48 +0200 Subject: [PATCH 07/11] coroutine: fix /perf/nesting coroutine benchmark The /perf/nesting benchmark is broken because the counters are not reset after each iteration. Therefore, nesting is done only on the first iteration, and skipped on every other. This patch fixes the issue, and reduces the number of iterations to make it possible to run the benchmark in a reasonable amount of time. Signed-off-by: Gabriel Kerneis Signed-off-by: Stefan Hajnoczi --- tests/test-coroutine.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test-coroutine.c b/tests/test-coroutine.c index 2792191f82..15a885e882 100644 --- a/tests/test-coroutine.c +++ b/tests/test-coroutine.c @@ -182,17 +182,17 @@ static void perf_nesting(void) unsigned int i, maxcycles, maxnesting; double duration; - maxcycles = 100000000; + maxcycles = 10000; maxnesting = 1000; Coroutine *root; - NestData nd = { - .n_enter = 0, - .n_return = 0, - .max = maxnesting, - }; g_test_timer_start(); for (i = 0; i < maxcycles; i++) { + NestData nd = { + .n_enter = 0, + .n_return = 0, + .max = maxnesting, + }; root = qemu_coroutine_create(nest); qemu_coroutine_enter(root, &nd); } From 0f39ac9a07cc10278e37d87076b143008f28aa3b Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Thu, 19 Sep 2013 12:29:15 +0200 Subject: [PATCH 08/11] qcow2: Correct snapshots size for overlap check Using s->snapshots_size instead of snapshots_size for the metadata overlap check in qcow2_write_snapshots leads to the detection of an overlap with the main qcow2 image header when deleting the last snapshot, since s->snapshots_size has not yet been updated and is therefore non-zero. However, the offset returned by qcow2_alloc_clusters will be zero since snapshots_size is zero. Therefore, an overlap is detected albeit no such will occur. This patch fixes this by replacing s->snapshots_size by snapshots_size when calling qcow2_pre_write_overlap_check. Signed-off-by: Max Reitz Signed-off-by: Stefan Hajnoczi --- block/qcow2-snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 7d144205c3..5e8a7794f4 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -192,7 +192,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) /* The snapshot list position has not yet been updated, so these clusters * must indeed be completely free */ ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT, offset, - s->snapshots_size); + snapshots_size); if (ret < 0) { return ret; } From bcb9d66e8590151967e1dbe3724eec7ec71392e0 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Wed, 18 Sep 2013 19:14:14 +0800 Subject: [PATCH 09/11] block: don't lose data from last incomplete sector To read the last sector that is not aligned to sector boundary, current code for growable backends, since commit 893a8f6 "block: Produce zeros when protocols reading beyond end of file", drops the data and directly returns zeroes. That is incorrect. Signed-off-by: Fam Zheng Signed-off-by: Stefan Hajnoczi --- block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block.c b/block.c index e176c6f3bc..ea4956d6c7 100644 --- a/block.c +++ b/block.c @@ -2669,7 +2669,7 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, goto out; } - total_sectors = len >> BDRV_SECTOR_BITS; + total_sectors = (len + BDRV_SECTOR_SIZE - 1) >> BDRV_SECTOR_BITS; max_nb_sectors = MAX(0, total_sectors - sector_num); if (max_nb_sectors > 0) { ret = drv->bdrv_co_readv(bs, sector_num, From 1df6fa4bc6754a170cf511a78e2e6fef84eb5228 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 19 Sep 2013 18:48:53 +0200 Subject: [PATCH 10/11] blockdev: do not default cache.no-flush to true That's why all my VMs were so fast lately. :) This changed in 1.6.0 by mistake in patch 29c4e2b (blockdev: Split up 'cache' option, 2013-07-18). Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi --- blockdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blockdev.c b/blockdev.c index 80605a2bac..8aa66a949c 100644 --- a/blockdev.c +++ b/blockdev.c @@ -443,7 +443,7 @@ static DriveInfo *blockdev_init(QemuOpts *all_opts, if (qemu_opt_get_bool(opts, "cache.direct", false)) { bdrv_flags |= BDRV_O_NOCACHE; } - if (qemu_opt_get_bool(opts, "cache.no-flush", true)) { + if (qemu_opt_get_bool(opts, "cache.no-flush", false)) { bdrv_flags |= BDRV_O_NO_FLUSH; } From ef5bc96268ceec64769617dc53b0ac3a20ff351c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 20 Sep 2013 17:31:55 +0200 Subject: [PATCH 11/11] virtio-blk: do not relay a previous driver's WCE configuration to the current The following sequence happens: - the SeaBIOS virtio-blk driver does not support the WCE feature, which causes QEMU to disable writeback caching - the Linux virtio-blk driver resets the device, finds WCE is available but writeback caching is disabled; tells block layer to not send cache flush commands - the Linux virtio-blk driver sets the DRIVER_OK bit, which causes writeback caching to be re-enabled, but the Linux virtio-blk driver does not know of this side effect and cache flushes remain disabled The bug is at the third step. If the guest does know about CONFIG_WCE, QEMU should ignore the WCE feature's state. The guest will control the cache mode solely using configuration space. This change makes Linux do flushes correctly, but Linux will keep SeaBIOS's writethrough mode. Hence, whenever the guest is reset, the cache mode of the disk should be reset to whatever was specified in the "-drive" option. With this change, the Linux virtio-blk driver finds that writeback caching is enabled, and tells the block layer to send cache flush commands appropriately. Reported-by: Rusty Russell Signed-off-by: Stefan Hajnoczi --- hw/block/virtio-blk.c | 24 ++++++++++++++++++++++-- include/hw/virtio/virtio-blk.h | 1 + 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index e2f55cc946..49a23c33f7 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -460,9 +460,9 @@ static void virtio_blk_dma_restart_cb(void *opaque, int running, static void virtio_blk_reset(VirtIODevice *vdev) { -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE VirtIOBlock *s = VIRTIO_BLK(vdev); +#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE if (s->dataplane) { virtio_blk_data_plane_stop(s->dataplane); } @@ -473,6 +473,7 @@ static void virtio_blk_reset(VirtIODevice *vdev) * are per-device request lists. */ bdrv_drain_all(); + bdrv_set_enable_write_cache(s->bs, s->original_wce); } /* coalesce internal state, copy to pci i/o region 0 @@ -564,7 +565,25 @@ static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status) } features = vdev->guest_features; - bdrv_set_enable_write_cache(s->bs, !!(features & (1 << VIRTIO_BLK_F_WCE))); + + /* A guest that supports VIRTIO_BLK_F_CONFIG_WCE must be able to send + * cache flushes. Thus, the "auto writethrough" behavior is never + * necessary for guests that support the VIRTIO_BLK_F_CONFIG_WCE feature. + * Leaving it enabled would break the following sequence: + * + * Guest started with "-drive cache=writethrough" + * Guest sets status to 0 + * Guest sets DRIVER bit in status field + * Guest reads host features (WCE=0, CONFIG_WCE=1) + * Guest writes guest features (WCE=0, CONFIG_WCE=1) + * Guest writes 1 to the WCE configuration field (writeback mode) + * Guest sets DRIVER_OK bit in status field + * + * s->bs would erroneously be placed in writethrough mode. + */ + if (!(features & (1 << VIRTIO_BLK_F_CONFIG_WCE))) { + bdrv_set_enable_write_cache(s->bs, !!(features & (1 << VIRTIO_BLK_F_WCE))); + } } static void virtio_blk_save(QEMUFile *f, void *opaque) @@ -674,6 +693,7 @@ static int virtio_blk_device_init(VirtIODevice *vdev) } blkconf_serial(&blk->conf, &blk->serial); + s->original_wce = bdrv_enable_write_cache(blk->conf.bs); if (blkconf_geometry(&blk->conf, NULL, 65535, 255, 255) < 0) { return -1; } diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h index b87cf490b1..41885da1a0 100644 --- a/include/hw/virtio/virtio-blk.h +++ b/include/hw/virtio/virtio-blk.h @@ -123,6 +123,7 @@ typedef struct VirtIOBlock { BlockConf *conf; VirtIOBlkConf blk; unsigned short sector_mask; + bool original_wce; VMChangeStateEntry *change; #ifdef CONFIG_VIRTIO_BLK_DATA_PLANE Notifier migration_state_notifier;