From 975a0015ee380f49a3be744279a6a06ab97e960a Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 22 Aug 2013 11:29:02 +0200
Subject: [PATCH 01/11] libcacard: link against qemu-error.o for error_report()

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 libcacard/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libcacard/Makefile b/libcacard/Makefile
index 47827a0eb8..4d15da49b8 100644
--- a/libcacard/Makefile
+++ b/libcacard/Makefile
@@ -4,7 +4,8 @@ TOOLS += vscclient$(EXESUF)
 
 # objects linked into a shared library, built with libtool with -fPIC if required
 libcacard-obj-y = $(stub-obj-y) $(libcacard-y)
-libcacard-obj-y += util/osdep.o util/cutils.o util/qemu-timer-common.o util/error.o
+libcacard-obj-y += util/osdep.o util/cutils.o util/qemu-timer-common.o
+libcacard-obj-y += util/error.o util/qemu-error.o
 libcacard-obj-$(CONFIG_WIN32) += util/oslib-win32.o util/qemu-thread-win32.o
 libcacard-obj-$(CONFIG_POSIX) += util/oslib-posix.o util/qemu-thread-posix.o
 libcacard-obj-y += $(filter trace/%, $(util-obj-y))

From a5813077aac7865f69b7ee46a594f3705429f7cd Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 22 Aug 2013 11:29:03 +0200
Subject: [PATCH 02/11] osdep: warn if open(O_DIRECT) on fails with EINVAL

Print a warning when opening a file O_DIRECT fails with EINVAL.  This
saves users a lot of time trying to figure out the EINVAL error, which
is typical when attempting to open a file O_DIRECT on Linux tmpfs.

Reported-by: Deepak C Shetty <deepakcs@linux.vnet.ibm.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 util/osdep.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/util/osdep.c b/util/osdep.c
index 685c8ae889..62072b4be3 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -207,6 +207,13 @@ int qemu_open(const char *name, int flags, ...)
     }
 #endif
 
+#ifdef O_DIRECT
+    if (ret == -1 && errno == EINVAL && (flags & O_DIRECT)) {
+        error_report("file system may not support O_DIRECT");
+        errno = EINVAL; /* in case it was clobbered */
+    }
+#endif /* O_DIRECT */
+
     return ret;
 }
 

From da718ceb1730bfe6fea0178df979639b14a0646e Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 12 Sep 2013 11:02:18 +0200
Subject: [PATCH 03/11] qemu-timer: drop outdated signal safety comments

host_alarm_handler() is invoked from the signal processing thread
(currently the iothread).  Previously we did processing in a real signal
handler with signalfd and therefore needed signal-safe timer code.

Today host_alarm_handler() just marks the alarm timer as expired/pending
and notifies the main loop using qemu_notify_event().

Therefore these outdated comments about signal safety can be dropped.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 qemu-timer.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/qemu-timer.c b/qemu-timer.c
index 95ff47fef3..ed3fcb2190 100644
--- a/qemu-timer.c
+++ b/qemu-timer.c
@@ -301,8 +301,6 @@ void timer_del(QEMUTimer *ts)
 {
     QEMUTimer **pt, *t;
 
-    /* NOTE: this code must be signal safe because
-       timer_expired() can be called from a signal. */
     pt = &ts->timer_list->active_timers;
     for(;;) {
         t = *pt;
@@ -325,8 +323,6 @@ void timer_mod_ns(QEMUTimer *ts, int64_t expire_time)
     timer_del(ts);
 
     /* add the timer in the sorted list */
-    /* NOTE: this code must be signal safe because
-       timer_expired() can be called from a signal. */
     pt = &ts->timer_list->active_timers;
     for(;;) {
         t = *pt;

From 978f2205c791de0e02c8802a645bea657408abfd Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 12 Sep 2013 11:02:19 +0200
Subject: [PATCH 04/11] qemu-timer: make qemu_timer_mod_ns() and
 qemu_timer_del() thread-safe

Introduce QEMUTimerList->active_timers_lock to protect the linked list
of active timers.  This allows qemu_timer_mod_ns() to be called from any
thread.

Note that vm_clock is not thread-safe and its use of
qemu_clock_has_timers() works fine today but is also not thread-safe.

The purpose of this patch is to eventually let device models set or
cancel timers from a vcpu thread without holding the global mutex.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/timer.h | 17 +++++++++
 qemu-timer.c         | 87 ++++++++++++++++++++++++++++++++++----------
 2 files changed, 85 insertions(+), 19 deletions(-)

diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index e4934dd61b..b58903bef5 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -115,6 +115,10 @@ static inline int64_t qemu_clock_get_us(QEMUClockType type)
  * Determines whether a clock's default timer list
  * has timers attached
  *
+ * Note that this function should not be used when other threads also access
+ * the timer list.  The return value may be outdated by the time it is acted
+ * upon.
+ *
  * Returns: true if the clock's default timer list
  * has timers attached
  */
@@ -271,6 +275,10 @@ void timerlist_free(QEMUTimerList *timer_list);
  *
  * Determine whether a timer list has active timers
  *
+ * Note that this function should not be used when other threads also access
+ * the timer list.  The return value may be outdated by the time it is acted
+ * upon.
+ *
  * Returns: true if the timer list has timers.
  */
 bool timerlist_has_timers(QEMUTimerList *timer_list);
@@ -512,6 +520,9 @@ void timer_free(QEMUTimer *ts);
  * @ts: the timer
  *
  * Delete a timer from the active list.
+ *
+ * This function is thread-safe but the timer and its timer list must not be
+ * freed while this function is running.
  */
 void timer_del(QEMUTimer *ts);
 
@@ -521,6 +532,9 @@ void timer_del(QEMUTimer *ts);
  * @expire_time: the expiry time in nanoseconds
  *
  * Modify a timer to expire at @expire_time
+ *
+ * This function is thread-safe but the timer and its timer list must not be
+ * freed while this function is running.
  */
 void timer_mod_ns(QEMUTimer *ts, int64_t expire_time);
 
@@ -531,6 +545,9 @@ void timer_mod_ns(QEMUTimer *ts, int64_t expire_time);
  *
  * Modify a timer to expiry at @expire_time, taking into
  * account the scale associated with the timer.
+ *
+ * This function is thread-safe but the timer and its timer list must not be
+ * freed while this function is running.
  */
 void timer_mod(QEMUTimer *ts, int64_t expire_timer);
 
diff --git a/qemu-timer.c b/qemu-timer.c
index ed3fcb2190..e5047479f2 100644
--- a/qemu-timer.c
+++ b/qemu-timer.c
@@ -66,6 +66,7 @@ QEMUClock qemu_clocks[QEMU_CLOCK_MAX];
 
 struct QEMUTimerList {
     QEMUClock *clock;
+    QemuMutex active_timers_lock;
     QEMUTimer *active_timers;
     QLIST_ENTRY(QEMUTimerList) list;
     QEMUTimerListNotifyCB *notify_cb;
@@ -101,6 +102,7 @@ QEMUTimerList *timerlist_new(QEMUClockType type,
     timer_list->clock = clock;
     timer_list->notify_cb = cb;
     timer_list->notify_opaque = opaque;
+    qemu_mutex_init(&timer_list->active_timers_lock);
     QLIST_INSERT_HEAD(&clock->timerlists, timer_list, list);
     return timer_list;
 }
@@ -111,6 +113,7 @@ void timerlist_free(QEMUTimerList *timer_list)
     if (timer_list->clock) {
         QLIST_REMOVE(timer_list, list);
     }
+    qemu_mutex_destroy(&timer_list->active_timers_lock);
     g_free(timer_list);
 }
 
@@ -163,9 +166,17 @@ bool qemu_clock_has_timers(QEMUClockType type)
 
 bool timerlist_expired(QEMUTimerList *timer_list)
 {
-    return (timer_list->active_timers &&
-            timer_list->active_timers->expire_time <
-            qemu_clock_get_ns(timer_list->clock->type));
+    int64_t expire_time;
+
+    qemu_mutex_lock(&timer_list->active_timers_lock);
+    if (!timer_list->active_timers) {
+        qemu_mutex_unlock(&timer_list->active_timers_lock);
+        return false;
+    }
+    expire_time = timer_list->active_timers->expire_time;
+    qemu_mutex_unlock(&timer_list->active_timers_lock);
+
+    return expire_time < qemu_clock_get_ns(timer_list->clock->type);
 }
 
 bool qemu_clock_expired(QEMUClockType type)
@@ -182,13 +193,25 @@ bool qemu_clock_expired(QEMUClockType type)
 int64_t timerlist_deadline_ns(QEMUTimerList *timer_list)
 {
     int64_t delta;
+    int64_t expire_time;
 
-    if (!timer_list->clock->enabled || !timer_list->active_timers) {
+    if (!timer_list->clock->enabled) {
         return -1;
     }
 
-    delta = timer_list->active_timers->expire_time -
-        qemu_clock_get_ns(timer_list->clock->type);
+    /* The active timers list may be modified before the caller uses our return
+     * value but ->notify_cb() is called when the deadline changes.  Therefore
+     * the caller should notice the change and there is no race condition.
+     */
+    qemu_mutex_lock(&timer_list->active_timers_lock);
+    if (!timer_list->active_timers) {
+        qemu_mutex_unlock(&timer_list->active_timers_lock);
+        return -1;
+    }
+    expire_time = timer_list->active_timers->expire_time;
+    qemu_mutex_unlock(&timer_list->active_timers_lock);
+
+    delta = expire_time - qemu_clock_get_ns(timer_list->clock->type);
 
     if (delta <= 0) {
         return 0;
@@ -296,12 +319,11 @@ void timer_free(QEMUTimer *ts)
     g_free(ts);
 }
 
-/* stop a timer, but do not dealloc it */
-void timer_del(QEMUTimer *ts)
+static void timer_del_locked(QEMUTimerList *timer_list, QEMUTimer *ts)
 {
     QEMUTimer **pt, *t;
 
-    pt = &ts->timer_list->active_timers;
+    pt = &timer_list->active_timers;
     for(;;) {
         t = *pt;
         if (!t)
@@ -314,16 +336,28 @@ void timer_del(QEMUTimer *ts)
     }
 }
 
+/* stop a timer, but do not dealloc it */
+void timer_del(QEMUTimer *ts)
+{
+    QEMUTimerList *timer_list = ts->timer_list;
+
+    qemu_mutex_lock(&timer_list->active_timers_lock);
+    timer_del_locked(timer_list, ts);
+    qemu_mutex_unlock(&timer_list->active_timers_lock);
+}
+
 /* modify the current timer so that it will be fired when current_time
    >= expire_time. The corresponding callback will be called. */
 void timer_mod_ns(QEMUTimer *ts, int64_t expire_time)
 {
+    QEMUTimerList *timer_list = ts->timer_list;
     QEMUTimer **pt, *t;
 
-    timer_del(ts);
+    qemu_mutex_lock(&timer_list->active_timers_lock);
+    timer_del_locked(timer_list, ts);
 
     /* add the timer in the sorted list */
-    pt = &ts->timer_list->active_timers;
+    pt = &timer_list->active_timers;
     for(;;) {
         t = *pt;
         if (!timer_expired_ns(t, expire_time)) {
@@ -334,12 +368,13 @@ void timer_mod_ns(QEMUTimer *ts, int64_t expire_time)
     ts->expire_time = expire_time;
     ts->next = *pt;
     *pt = ts;
+    qemu_mutex_unlock(&timer_list->active_timers_lock);
 
     /* Rearm if necessary  */
-    if (pt == &ts->timer_list->active_timers) {
+    if (pt == &timer_list->active_timers) {
         /* Interrupt execution to force deadline recalculation.  */
-        qemu_clock_warp(ts->timer_list->clock->type);
-        timerlist_notify(ts->timer_list);
+        qemu_clock_warp(timer_list->clock->type);
+        timerlist_notify(timer_list);
     }
 }
 
@@ -350,13 +385,19 @@ void timer_mod(QEMUTimer *ts, int64_t expire_time)
 
 bool timer_pending(QEMUTimer *ts)
 {
+    QEMUTimerList *timer_list = ts->timer_list;
     QEMUTimer *t;
-    for (t = ts->timer_list->active_timers; t != NULL; t = t->next) {
+    bool found = false;
+
+    qemu_mutex_lock(&timer_list->active_timers_lock);
+    for (t = timer_list->active_timers; t != NULL; t = t->next) {
         if (t == ts) {
-            return true;
+            found = true;
+            break;
         }
     }
-    return false;
+    qemu_mutex_unlock(&timer_list->active_timers_lock);
+    return found;
 }
 
 bool timer_expired(QEMUTimer *timer_head, int64_t current_time)
@@ -369,23 +410,31 @@ bool timerlist_run_timers(QEMUTimerList *timer_list)
     QEMUTimer *ts;
     int64_t current_time;
     bool progress = false;
-   
+    QEMUTimerCB *cb;
+    void *opaque;
+
     if (!timer_list->clock->enabled) {
         return progress;
     }
 
     current_time = qemu_clock_get_ns(timer_list->clock->type);
     for(;;) {
+        qemu_mutex_lock(&timer_list->active_timers_lock);
         ts = timer_list->active_timers;
         if (!timer_expired_ns(ts, current_time)) {
+            qemu_mutex_unlock(&timer_list->active_timers_lock);
             break;
         }
+
         /* remove timer from the list before calling the callback */
         timer_list->active_timers = ts->next;
         ts->next = NULL;
+        cb = ts->cb;
+        opaque = ts->opaque;
+        qemu_mutex_unlock(&timer_list->active_timers_lock);
 
         /* run the callback (the timer list can be modified) */
-        ts->cb(ts->opaque);
+        cb(opaque);
         progress = true;
     }
     return progress;

From 3db1ee7c2af2fbbfe259712e3ce74158bc667ad3 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 12 Sep 2013 11:02:20 +0200
Subject: [PATCH 05/11] qemu-timer: do not take the lock in timer_pending

We can deduce the result from expire_time, by making it always -1 if
the timer is not in the active_timers list.  We need to check against
negative times passed to timer_mod_ns; clamping them to zero is not
a problem because the only clock that has a zero value at VM startup
is QEMU_CLOCK_VIRTUAL, and it is monotonic so it cannot be non-zero.
QEMU_CLOCK_HOST, instead, is not monotonic but it cannot go to negative
values unless the host time is seriously screwed up and points to
the 1960s.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 qemu-timer.c | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/qemu-timer.c b/qemu-timer.c
index e5047479f2..6b62e88669 100644
--- a/qemu-timer.c
+++ b/qemu-timer.c
@@ -312,6 +312,7 @@ void timer_init(QEMUTimer *ts,
     ts->cb = cb;
     ts->opaque = opaque;
     ts->scale = scale;
+    ts->expire_time = -1;
 }
 
 void timer_free(QEMUTimer *ts)
@@ -323,6 +324,7 @@ static void timer_del_locked(QEMUTimerList *timer_list, QEMUTimer *ts)
 {
     QEMUTimer **pt, *t;
 
+    ts->expire_time = -1;
     pt = &timer_list->active_timers;
     for(;;) {
         t = *pt;
@@ -365,7 +367,7 @@ void timer_mod_ns(QEMUTimer *ts, int64_t expire_time)
         }
         pt = &t->next;
     }
-    ts->expire_time = expire_time;
+    ts->expire_time = MAX(expire_time, 0);
     ts->next = *pt;
     *pt = ts;
     qemu_mutex_unlock(&timer_list->active_timers_lock);
@@ -385,19 +387,7 @@ void timer_mod(QEMUTimer *ts, int64_t expire_time)
 
 bool timer_pending(QEMUTimer *ts)
 {
-    QEMUTimerList *timer_list = ts->timer_list;
-    QEMUTimer *t;
-    bool found = false;
-
-    qemu_mutex_lock(&timer_list->active_timers_lock);
-    for (t = timer_list->active_timers; t != NULL; t = t->next) {
-        if (t == ts) {
-            found = true;
-            break;
-        }
-    }
-    qemu_mutex_unlock(&timer_list->active_timers_lock);
-    return found;
+    return ts->expire_time >= 0;
 }
 
 bool timer_expired(QEMUTimer *timer_head, int64_t current_time)
@@ -429,6 +419,7 @@ bool timerlist_run_timers(QEMUTimerList *timer_list)
         /* remove timer from the list before calling the callback */
         timer_list->active_timers = ts->next;
         ts->next = NULL;
+        ts->expire_time = -1;
         cb = ts->cb;
         opaque = ts->opaque;
         qemu_mutex_unlock(&timer_list->active_timers_lock);

From 2fcd15eac3223b3907837e8d7f2b3829a16a4c45 Mon Sep 17 00:00:00 2001
From: Gabriel Kerneis <gabriel@kerneis.info>
Date: Tue, 17 Sep 2013 17:09:39 +0200
Subject: [PATCH 06/11] coroutine: add qemu_coroutine_yield benchmark

Current coroutine performance benchmarks test only coroutine creation,
either directly or in a nested way. This patch adds a benchmark to
evaluate the performance of qemu_coroutine_yield.

Signed-off-by: Gabriel Kerneis <gabriel@kerneis.info>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/test-coroutine.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/tests/test-coroutine.c b/tests/test-coroutine.c
index 39be046ec7..2792191f82 100644
--- a/tests/test-coroutine.c
+++ b/tests/test-coroutine.c
@@ -202,6 +202,38 @@ static void perf_nesting(void)
         maxcycles, maxnesting, duration);
 }
 
+/*
+ * Yield benchmark
+ */
+
+static void coroutine_fn yield_loop(void *opaque)
+{
+    unsigned int *counter = opaque;
+
+    while ((*counter) > 0) {
+        (*counter)--;
+        qemu_coroutine_yield();
+    }
+}
+
+static void perf_yield(void)
+{
+    unsigned int i, maxcycles;
+    double duration;
+
+    maxcycles = 100000000;
+    i = maxcycles;
+    Coroutine *coroutine = qemu_coroutine_create(yield_loop);
+
+    g_test_timer_start();
+    while (i > 0) {
+        qemu_coroutine_enter(coroutine, &i);
+    }
+    duration = g_test_timer_elapsed();
+
+    g_test_message("Yield %u iterations: %f s\n",
+        maxcycles, duration);
+}
 
 int main(int argc, char **argv)
 {
@@ -214,6 +246,7 @@ int main(int argc, char **argv)
     if (g_test_perf()) {
         g_test_add_func("/perf/lifecycle", perf_lifecycle);
         g_test_add_func("/perf/nesting", perf_nesting);
+        g_test_add_func("/perf/yield", perf_yield);
     }
     return g_test_run();
 }

From a9031675b9f757eef0fe8c99284ec0133c032c32 Mon Sep 17 00:00:00 2001
From: Gabriel Kerneis <gabriel@kerneis.info>
Date: Tue, 17 Sep 2013 18:26:48 +0200
Subject: [PATCH 07/11] coroutine: fix /perf/nesting coroutine benchmark

The /perf/nesting benchmark is broken because the counters are
not reset after each iteration. Therefore, nesting is done only
on the first iteration, and skipped on every other.

This patch fixes the issue, and reduces the number of iterations
to make it possible to run the benchmark in a reasonable amount of
time.

Signed-off-by: Gabriel Kerneis <gabriel@kerneis.info>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/test-coroutine.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test-coroutine.c b/tests/test-coroutine.c
index 2792191f82..15a885e882 100644
--- a/tests/test-coroutine.c
+++ b/tests/test-coroutine.c
@@ -182,17 +182,17 @@ static void perf_nesting(void)
     unsigned int i, maxcycles, maxnesting;
     double duration;
 
-    maxcycles = 100000000;
+    maxcycles = 10000;
     maxnesting = 1000;
     Coroutine *root;
-    NestData nd = {
-        .n_enter  = 0,
-        .n_return = 0,
-        .max      = maxnesting,
-    };
 
     g_test_timer_start();
     for (i = 0; i < maxcycles; i++) {
+        NestData nd = {
+            .n_enter  = 0,
+            .n_return = 0,
+            .max      = maxnesting,
+        };
         root = qemu_coroutine_create(nest);
         qemu_coroutine_enter(root, &nd);
     }

From 0f39ac9a07cc10278e37d87076b143008f28aa3b Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 19 Sep 2013 12:29:15 +0200
Subject: [PATCH 08/11] qcow2: Correct snapshots size for overlap check

Using s->snapshots_size instead of snapshots_size for the metadata
overlap check in qcow2_write_snapshots leads to the detection of an
overlap with the main qcow2 image header when deleting the last
snapshot, since s->snapshots_size has not yet been updated and is
therefore non-zero. However, the offset returned by qcow2_alloc_clusters
will be zero since snapshots_size is zero. Therefore, an overlap is
detected albeit no such will occur.

This patch fixes this by replacing s->snapshots_size by snapshots_size
when calling qcow2_pre_write_overlap_check.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-snapshot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 7d144205c3..5e8a7794f4 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -192,7 +192,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
     /* The snapshot list position has not yet been updated, so these clusters
      * must indeed be completely free */
     ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT, offset,
-                                        s->snapshots_size);
+                                        snapshots_size);
     if (ret < 0) {
         return ret;
     }

From bcb9d66e8590151967e1dbe3724eec7ec71392e0 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Wed, 18 Sep 2013 19:14:14 +0800
Subject: [PATCH 09/11] block: don't lose data from last incomplete sector

To read the last sector that is not aligned to sector boundary, current
code for growable backends, since commit 893a8f6 "block: Produce zeros
when protocols reading beyond end of file", drops the data and directly
returns zeroes. That is incorrect.

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block.c b/block.c
index e176c6f3bc..ea4956d6c7 100644
--- a/block.c
+++ b/block.c
@@ -2669,7 +2669,7 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
             goto out;
         }
 
-        total_sectors = len >> BDRV_SECTOR_BITS;
+        total_sectors = (len + BDRV_SECTOR_SIZE - 1) >> BDRV_SECTOR_BITS;
         max_nb_sectors = MAX(0, total_sectors - sector_num);
         if (max_nb_sectors > 0) {
             ret = drv->bdrv_co_readv(bs, sector_num,

From 1df6fa4bc6754a170cf511a78e2e6fef84eb5228 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 19 Sep 2013 18:48:53 +0200
Subject: [PATCH 10/11] blockdev: do not default cache.no-flush to true

That's why all my VMs were so fast lately. :)

This changed in 1.6.0 by mistake in patch 29c4e2b (blockdev: Split up
'cache' option, 2013-07-18).

Cc: qemu-stable@nongnu.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 blockdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/blockdev.c b/blockdev.c
index 80605a2bac..8aa66a949c 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -443,7 +443,7 @@ static DriveInfo *blockdev_init(QemuOpts *all_opts,
     if (qemu_opt_get_bool(opts, "cache.direct", false)) {
         bdrv_flags |= BDRV_O_NOCACHE;
     }
-    if (qemu_opt_get_bool(opts, "cache.no-flush", true)) {
+    if (qemu_opt_get_bool(opts, "cache.no-flush", false)) {
         bdrv_flags |= BDRV_O_NO_FLUSH;
     }
 

From ef5bc96268ceec64769617dc53b0ac3a20ff351c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 20 Sep 2013 17:31:55 +0200
Subject: [PATCH 11/11] virtio-blk: do not relay a previous driver's WCE
 configuration to the current

The following sequence happens:
- the SeaBIOS virtio-blk driver does not support the WCE feature, which
causes QEMU to disable writeback caching

- the Linux virtio-blk driver resets the device, finds WCE is available
but writeback caching is disabled; tells block layer to not send cache
flush commands

- the Linux virtio-blk driver sets the DRIVER_OK bit, which causes
writeback caching to be re-enabled, but the Linux virtio-blk driver does
not know of this side effect and cache flushes remain disabled

The bug is at the third step.  If the guest does know about CONFIG_WCE,
QEMU should ignore the WCE feature's state.  The guest will control the
cache mode solely using configuration space.  This change makes Linux
do flushes correctly, but Linux will keep SeaBIOS's writethrough mode.

Hence, whenever the guest is reset, the cache mode of the disk should
be reset to whatever was specified in the "-drive" option.  With this
change, the Linux virtio-blk driver finds that writeback caching is
enabled, and tells the block layer to send cache flush commands
appropriately.

Reported-by: Rusty Russell <rusty@au1.ibm.com
Cc: qemu-stable@nongnu.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hw/block/virtio-blk.c          | 24 ++++++++++++++++++++++--
 include/hw/virtio/virtio-blk.h |  1 +
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index e2f55cc946..49a23c33f7 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -460,9 +460,9 @@ static void virtio_blk_dma_restart_cb(void *opaque, int running,
 
 static void virtio_blk_reset(VirtIODevice *vdev)
 {
-#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
     VirtIOBlock *s = VIRTIO_BLK(vdev);
 
+#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
     if (s->dataplane) {
         virtio_blk_data_plane_stop(s->dataplane);
     }
@@ -473,6 +473,7 @@ static void virtio_blk_reset(VirtIODevice *vdev)
      * are per-device request lists.
      */
     bdrv_drain_all();
+    bdrv_set_enable_write_cache(s->bs, s->original_wce);
 }
 
 /* coalesce internal state, copy to pci i/o region 0
@@ -564,7 +565,25 @@ static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status)
     }
 
     features = vdev->guest_features;
-    bdrv_set_enable_write_cache(s->bs, !!(features & (1 << VIRTIO_BLK_F_WCE)));
+
+    /* A guest that supports VIRTIO_BLK_F_CONFIG_WCE must be able to send
+     * cache flushes.  Thus, the "auto writethrough" behavior is never
+     * necessary for guests that support the VIRTIO_BLK_F_CONFIG_WCE feature.
+     * Leaving it enabled would break the following sequence:
+     *
+     *     Guest started with "-drive cache=writethrough"
+     *     Guest sets status to 0
+     *     Guest sets DRIVER bit in status field
+     *     Guest reads host features (WCE=0, CONFIG_WCE=1)
+     *     Guest writes guest features (WCE=0, CONFIG_WCE=1)
+     *     Guest writes 1 to the WCE configuration field (writeback mode)
+     *     Guest sets DRIVER_OK bit in status field
+     *
+     * s->bs would erroneously be placed in writethrough mode.
+     */
+    if (!(features & (1 << VIRTIO_BLK_F_CONFIG_WCE))) {
+        bdrv_set_enable_write_cache(s->bs, !!(features & (1 << VIRTIO_BLK_F_WCE)));
+    }
 }
 
 static void virtio_blk_save(QEMUFile *f, void *opaque)
@@ -674,6 +693,7 @@ static int virtio_blk_device_init(VirtIODevice *vdev)
     }
 
     blkconf_serial(&blk->conf, &blk->serial);
+    s->original_wce = bdrv_enable_write_cache(blk->conf.bs);
     if (blkconf_geometry(&blk->conf, NULL, 65535, 255, 255) < 0) {
         return -1;
     }
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index b87cf490b1..41885da1a0 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -123,6 +123,7 @@ typedef struct VirtIOBlock {
     BlockConf *conf;
     VirtIOBlkConf blk;
     unsigned short sector_mask;
+    bool original_wce;
     VMChangeStateEntry *change;
 #ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
     Notifier migration_state_notifier;