From 4f78a16fee462471416dc49b409d57b2071cf3d9 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 9 Jun 2017 13:29:36 +0200 Subject: [PATCH 01/60] commit: Fix completion with extra reference commit_complete() can't assume that after its block_job_completed() the job is actually immediately freed; someone else may still be holding references. In this case, the op blockers on the intermediate nodes make the graph reconfiguration in the completion code fail. Call block_job_remove_all_bdrv() manually so that we know for sure that any blockers on intermediate nodes are given up. Cc: qemu-stable@nongnu.org Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Max Reitz --- block/commit.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/commit.c b/block/commit.c index af6fa68cf3..8c09c3dbcd 100644 --- a/block/commit.c +++ b/block/commit.c @@ -119,6 +119,13 @@ static void commit_complete(BlockJob *job, void *opaque) } g_free(s->backing_file_str); blk_unref(s->top); + + /* If there is more than one reference to the job (e.g. if called from + * block_job_finish_sync()), block_job_completed() won't free it and + * therefore the blockers on the intermediate nodes remain. This would + * cause bdrv_set_backing_hd() to fail. */ + block_job_remove_all_bdrv(job); + block_job_completed(&s->common, ret); g_free(data); From ecaf8c8a6fdd2153f1652d1598c4e344cc961e3b Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 9 Jun 2017 13:32:48 +0200 Subject: [PATCH 02/60] qemu-iotests: Allow starting new qemu after cleanup After _cleanup_qemu(), test cases should be able to start the next qemu process and call _cleanup_qemu() for that one as well. For this to work cleanly, we need to improve the cleanup so that the second invocation doesn't try to kill the qemu instances from the first invocation a second time (which would result in error messages). 
Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Max Reitz --- tests/qemu-iotests/common.qemu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu index 7a78a00999..76ef298d3f 100644 --- a/tests/qemu-iotests/common.qemu +++ b/tests/qemu-iotests/common.qemu @@ -222,5 +222,8 @@ function _cleanup_qemu() rm -f "${QEMU_FIFO_IN}_${i}" "${QEMU_FIFO_OUT}_${i}" eval "exec ${QEMU_IN[$i]}<&-" # close file descriptors eval "exec ${QEMU_OUT[$i]}<&-" + + unset QEMU_IN[$i] + unset QEMU_OUT[$i] done } From 24575bfa8c05041db097d203c5506814db0fa110 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 9 Jun 2017 13:37:01 +0200 Subject: [PATCH 03/60] qemu-iotests: Test exiting qemu with running job When qemu is exited, all running jobs should be cancelled successfully. This adds a test for this for all types of block jobs that currently exist in qemu. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake --- tests/qemu-iotests/185 | 206 +++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/185.out | 59 +++++++++++ tests/qemu-iotests/group | 1 + 3 files changed, 266 insertions(+) create mode 100755 tests/qemu-iotests/185 create mode 100644 tests/qemu-iotests/185.out diff --git a/tests/qemu-iotests/185 b/tests/qemu-iotests/185 new file mode 100755 index 0000000000..0eda371f27 --- /dev/null +++ b/tests/qemu-iotests/185 @@ -0,0 +1,206 @@ +#!/bin/bash +# +# Test exiting qemu while jobs are still running +# +# Copyright (C) 2017 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +# creator +owner=kwolf@redhat.com + +seq=`basename $0` +echo "QA output created by $seq" + +here=`pwd` +status=1 # failure is the default! + +MIG_SOCKET="${TEST_DIR}/migrate" + +_cleanup() +{ + rm -f "${TEST_IMG}.mid" + rm -f "${TEST_IMG}.copy" + _cleanup_test_img + _cleanup_qemu +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter +. ./common.qemu + +_supported_fmt qcow2 +_supported_proto file +_supported_os Linux + +size=64M +TEST_IMG="${TEST_IMG}.base" _make_test_img $size + +echo +echo === Starting VM === +echo + +qemu_comm_method="qmp" + +_launch_qemu \ + -drive file="${TEST_IMG}.base",cache=$CACHEMODE,driver=$IMGFMT,id=disk +h=$QEMU_HANDLE +_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return' + +echo +echo === Creating backing chain === +echo + +_send_qemu_cmd $h \ + "{ 'execute': 'blockdev-snapshot-sync', + 'arguments': { 'device': 'disk', + 'snapshot-file': '$TEST_IMG.mid', + 'format': '$IMGFMT', + 'mode': 'absolute-paths' } }" \ + "return" + +_send_qemu_cmd $h \ + "{ 'execute': 'human-monitor-command', + 'arguments': { 'command-line': + 'qemu-io disk \"write 0 4M\"' } }" \ + "return" + +_send_qemu_cmd $h \ + "{ 'execute': 'blockdev-snapshot-sync', + 'arguments': { 'device': 'disk', + 'snapshot-file': '$TEST_IMG', + 'format': '$IMGFMT', + 'mode': 'absolute-paths' } }" \ + "return" + +echo +echo === Start commit job and exit qemu === +echo + +# Note that the reference output intentionally includes the 'offset' field in +# BLOCK_JOB_CANCELLED events for all of the following block jobs. They are +# predictable and any change in the offsets would hint at a bug in the job +# throttling code. +# +# In order to achieve these predictable offsets, all of the following tests +# use speed=65536. 
Each job will perform exactly one iteration before it has +# to sleep at least for a second, which is plenty of time for the 'quit' QMP +# command to be received (after receiving the command, the rest runs +# synchronously, so jobs can arbitrarily continue or complete). +# +# The buffer size for commit and streaming is 512k (waiting for 8 seconds after +# the first request), for active commit and mirror it's large enough to cover +# the full 4M, and for backup it's the qcow2 cluster size, which we know is +# 64k. As all of these are at least as large as the speed, we are sure that the +# offset doesn't advance after the first iteration before qemu exits. + +_send_qemu_cmd $h \ + "{ 'execute': 'block-commit', + 'arguments': { 'device': 'disk', + 'base':'$TEST_IMG.base', + 'top': '$TEST_IMG.mid', + 'speed': 65536 } }" \ + "return" + +_send_qemu_cmd $h "{ 'execute': 'quit' }" "return" +wait=1 _cleanup_qemu + +echo +echo === Start active commit job and exit qemu === +echo + +_launch_qemu \ + -drive file="${TEST_IMG}",cache=$CACHEMODE,driver=$IMGFMT,id=disk +h=$QEMU_HANDLE +_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return' + +_send_qemu_cmd $h \ + "{ 'execute': 'block-commit', + 'arguments': { 'device': 'disk', + 'base':'$TEST_IMG.base', + 'speed': 65536 } }" \ + "return" + +_send_qemu_cmd $h "{ 'execute': 'quit' }" "return" +wait=1 _cleanup_qemu + +echo +echo === Start mirror job and exit qemu === +echo + +_launch_qemu \ + -drive file="${TEST_IMG}",cache=$CACHEMODE,driver=$IMGFMT,id=disk +h=$QEMU_HANDLE +_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return' + +_send_qemu_cmd $h \ + "{ 'execute': 'drive-mirror', + 'arguments': { 'device': 'disk', + 'target': '$TEST_IMG.copy', + 'format': '$IMGFMT', + 'sync': 'full', + 'speed': 65536 } }" \ + "return" + +_send_qemu_cmd $h "{ 'execute': 'quit' }" "return" +wait=1 _cleanup_qemu + +echo +echo === Start backup job and exit qemu === +echo + +_launch_qemu \ + -drive 
file="${TEST_IMG}",cache=$CACHEMODE,driver=$IMGFMT,id=disk +h=$QEMU_HANDLE +_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return' + +_send_qemu_cmd $h \ + "{ 'execute': 'drive-backup', + 'arguments': { 'device': 'disk', + 'target': '$TEST_IMG.copy', + 'format': '$IMGFMT', + 'sync': 'full', + 'speed': 65536 } }" \ + "return" + +_send_qemu_cmd $h "{ 'execute': 'quit' }" "return" +wait=1 _cleanup_qemu + +echo +echo === Start streaming job and exit qemu === +echo + +_launch_qemu \ + -drive file="${TEST_IMG}",cache=$CACHEMODE,driver=$IMGFMT,id=disk +h=$QEMU_HANDLE +_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return' + +_send_qemu_cmd $h \ + "{ 'execute': 'block-stream', + 'arguments': { 'device': 'disk', + 'speed': 65536 } }" \ + "return" + +_send_qemu_cmd $h "{ 'execute': 'quit' }" "return" +wait=1 _cleanup_qemu + +_check_test_img + +# success, all done +echo "*** done" +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/185.out b/tests/qemu-iotests/185.out new file mode 100644 index 0000000000..45bc7cb1a5 --- /dev/null +++ b/tests/qemu-iotests/185.out @@ -0,0 +1,59 @@ +QA output created by 185 +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 + +=== Starting VM === + +{"return": {}} + +=== Creating backing chain === + +Formatting 'TEST_DIR/t.qcow2.mid', fmt=qcow2 size=67108864 backing_file=TEST_DIR/t.qcow2.base backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16 +{"return": {}} +wrote 4194304/4194304 bytes at offset 0 +4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +{"return": ""} +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 size=67108864 backing_file=TEST_DIR/t.qcow2.mid backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16 +{"return": {}} + +=== Start commit job and exit qemu === + +{"return": {}} +{"return": {}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}} +{"timestamp": {"seconds": 
TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 67108864, "offset": 524288, "speed": 65536, "type": "commit"}} + +=== Start active commit job and exit qemu === + +{"return": {}} +{"return": {}} +{"return": {}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 4194304, "offset": 4194304, "speed": 65536, "type": "commit"}} + +=== Start mirror job and exit qemu === + +{"return": {}} +Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 size=67108864 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16 +{"return": {}} +{"return": {}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 4194304, "offset": 4194304, "speed": 65536, "type": "mirror"}} + +=== Start backup job and exit qemu === + +{"return": {}} +Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 size=67108864 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16 +{"return": {}} +{"return": {}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 67108864, "offset": 65536, "speed": 65536, "type": "backup"}} + +=== Start streaming job and exit qemu === + +{"return": {}} +{"return": {}} +{"return": {}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 67108864, "offset": 
524288, "speed": 65536, "type": "stream"}} +No errors were found on the image. +*** done diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index a6acafffd7..318ae74b10 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -175,3 +175,4 @@ 181 rw auto migration 182 rw auto quick 183 rw auto migration +185 rw auto From dc88a467ec7214c3086094033daf2aba554337b1 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 22 May 2017 14:57:01 +0100 Subject: [PATCH 04/60] block: count bdrv_co_rw_vmstate() requests Call bdrv_inc/dec_in_flight() for vmstate reads/writes. This seems unnecessary at first glance because vmstate reads/writes are done synchronously while the guest is stopped. But we need the bdrv_wakeup() in bdrv_dec_in_flight() so the main loop sees request completion. Besides, it's cleaner to count vmstate reads/writes like ordinary read/write requests. The bdrv_wakeup() partially fixes a 'savevm' hang with -object iothread. Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Reviewed-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- block/io.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/block/io.c b/block/io.c index 91611ffb2a..684ea465c7 100644 --- a/block/io.c +++ b/block/io.c @@ -1980,17 +1980,24 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, bool is_read) { BlockDriver *drv = bs->drv; + int ret = -ENOTSUP; + + bdrv_inc_in_flight(bs); if (!drv) { - return -ENOMEDIUM; + ret = -ENOMEDIUM; } else if (drv->bdrv_load_vmstate) { - return is_read ? 
drv->bdrv_load_vmstate(bs, qiov, pos) - : drv->bdrv_save_vmstate(bs, qiov, pos); + if (is_read) { + ret = drv->bdrv_load_vmstate(bs, qiov, pos); + } else { + ret = drv->bdrv_save_vmstate(bs, qiov, pos); + } } else if (bs->file) { - return bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read); + ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read); } - return -ENOTSUP; + bdrv_dec_in_flight(bs); + return ret; } static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque) From ea17c9d20d7396351be5e14317354519ff53721d Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 22 May 2017 14:57:02 +0100 Subject: [PATCH 05/60] block: use BDRV_POLL_WHILE() in bdrv_rw_vmstate() Calling aio_poll() directly may have been fine previously, but this is the future, man! The difference between an aio_poll() loop and BDRV_POLL_WHILE() is that BDRV_POLL_WHILE() releases the AioContext around aio_poll(). This allows the IOThread to run fd handlers or BHs to complete the request. Failure to release the AioContext causes deadlocks. Using BDRV_POLL_WHILE() partially fixes a 'savevm' hang with -object iothread. 
Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Reviewed-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- block/io.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/block/io.c b/block/io.c index 684ea465c7..e158ae0174 100644 --- a/block/io.c +++ b/block/io.c @@ -2023,9 +2023,7 @@ bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data); bdrv_coroutine_enter(bs, co); - while (data.ret == -EINPROGRESS) { - aio_poll(bdrv_get_aio_context(bs), true); - } + BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS); return data.ret; } } From 17e2a4a47d46dc9c33d5946cbdc1ceb15e34b5ac Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 22 May 2017 14:57:03 +0100 Subject: [PATCH 06/60] migration: avoid recursive AioContext locking in save_vmstate() AioContext was designed to allow nested acquire/release calls. It uses a recursive mutex so callers don't need to worry about nesting...or so we thought. BDRV_POLL_WHILE() is used to wait for block I/O requests. It releases the AioContext temporarily around aio_poll(). This gives IOThreads a chance to acquire the AioContext to process I/O completions. It turns out that recursive locking and BDRV_POLL_WHILE() don't mix. BDRV_POLL_WHILE() only releases the AioContext once, so the IOThread will not be able to acquire the AioContext if it was acquired multiple times. Instead of trying to release AioContext n times in BDRV_POLL_WHILE(), this patch simply avoids nested locking in save_vmstate(). It's the simplest fix and we should step back to consider the big picture with all the recent changes to block layer threading. This patch is the final fix to solve 'savevm' hanging with -object iothread. 
Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Reviewed-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- migration/savevm.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/migration/savevm.c b/migration/savevm.c index 6bfd4893e0..5846d9c369 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2144,6 +2144,14 @@ int save_snapshot(const char *name, Error **errp) goto the_end; } + /* The bdrv_all_create_snapshot() call that follows acquires the AioContext + * for itself. BDRV_POLL_WHILE() does not support nested locking because + * it only releases the lock once. Therefore synchronous I/O will deadlock + * unless we release the AioContext before bdrv_all_create_snapshot(). + */ + aio_context_release(aio_context); + aio_context = NULL; + ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs); if (ret < 0) { error_setg(errp, "Error while creating snapshot on '%s'", @@ -2154,7 +2162,9 @@ int save_snapshot(const char *name, Error **errp) ret = 0; the_end: - aio_context_release(aio_context); + if (aio_context) { + aio_context_release(aio_context); + } if (saved_vm_running) { vm_start(); } From 8649f2f9b2d83b627199babc52b454db136e253c Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 22 May 2017 14:57:04 +0100 Subject: [PATCH 07/60] migration: use bdrv_drain_all_begin/end() instead bdrv_drain_all() blk/bdrv_drain_all() only takes effect for a single instant and then resumes block jobs, guest devices, and other external clients like the NBD server. This can be handy when performing a synchronous drain before terminating the program, for example. Monitor commands usually need to quiesce I/O across an entire code region so blk/bdrv_drain_all() is not suitable. They must use bdrv_drain_all_begin/end() to mark the region. This prevents new I/O requests from slipping in or worse - block jobs completing and modifying the graph. I audited other blk/bdrv_drain_all() callers but did not find anything that needs a similar fix. 
This patch fixes the savevm/loadvm commands. Although I haven't encountered a read world issue this makes the code safer. Suggested-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- migration/savevm.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/migration/savevm.c b/migration/savevm.c index 5846d9c369..b08df04d59 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2107,6 +2107,8 @@ int save_snapshot(const char *name, Error **errp) } vm_stop(RUN_STATE_SAVE_VM); + bdrv_drain_all_begin(); + aio_context_acquire(aio_context); memset(sn, 0, sizeof(*sn)); @@ -2165,6 +2167,9 @@ int save_snapshot(const char *name, Error **errp) if (aio_context) { aio_context_release(aio_context); } + + bdrv_drain_all_end(); + if (saved_vm_running) { vm_start(); } @@ -2273,20 +2278,21 @@ int load_snapshot(const char *name, Error **errp) } /* Flush all IO requests so they don't interfere with the new state. */ - bdrv_drain_all(); + bdrv_drain_all_begin(); ret = bdrv_all_goto_snapshot(name, &bs); if (ret < 0) { error_setg(errp, "Error %d while activating snapshot '%s' on '%s'", ret, name, bdrv_get_device_name(bs)); - return ret; + goto err_drain; } /* restore the VM state */ f = qemu_fopen_bdrv(bs_vm_state, 0); if (!f) { error_setg(errp, "Could not open VM state file"); - return -EINVAL; + ret = -EINVAL; + goto err_drain; } qemu_system_reset(SHUTDOWN_CAUSE_NONE); @@ -2296,6 +2302,8 @@ int load_snapshot(const char *name, Error **errp) ret = qemu_loadvm_state(f); aio_context_release(aio_context); + bdrv_drain_all_end(); + migration_incoming_state_destroy(); if (ret < 0) { error_setg(errp, "Error %d while loading VM state", ret); @@ -2303,6 +2311,10 @@ int load_snapshot(const char *name, Error **errp) } return 0; + +err_drain: + bdrv_drain_all_end(); + return ret; } void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev) From dfaca4641cfa1601242933ba99fb0156522cc350 Mon Sep 17 00:00:00 2001 
From: Kevin Wolf Date: Thu, 22 Sep 2016 16:53:24 +0200 Subject: [PATCH 08/60] doc: Document generic -blockdev options This adds documentation for the -blockdev options that apply to all nodes independent of the block driver used. All options that are shared by -blockdev and -drive are now explained in the section for -blockdev. The documentation of -drive mentions that all -blockdev options are accepted as well. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Max Reitz --- qemu-options.hx | 108 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 79 insertions(+), 29 deletions(-) diff --git a/qemu-options.hx b/qemu-options.hx index 30c4f9850f..db20866629 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -610,6 +610,53 @@ DEF("blockdev", HAS_ARG, QEMU_OPTION_blockdev, " [,read-only=on|off][,detect-zeroes=on|off|unmap]\n" " [,driver specific parameters...]\n" " configure a block backend\n", QEMU_ARCH_ALL) +STEXI +@item -blockdev @var{option}[,@var{option}[,@var{option}[,...]]] +@findex -blockdev + +Define a new block driver node. + +@table @option +@item Valid options for any block driver node: + +@table @code +@item driver +Specifies the block driver to use for the given node. +@item node-name +This defines the name of the block driver node by which it will be referenced +later. The name must be unique, i.e. it must not match the name of a different +block driver node, or (if you use @option{-drive} as well) the ID of a drive. + +If no node name is specified, it is automatically generated. The generated node +name is not intended to be predictable and changes between QEMU invocations. +For the top level, an explicit node name must be specified. +@item read-only +Open the node read-only. Guest write attempts will fail. +@item cache.direct +The host page cache can be avoided with @option{cache.direct=on}. This will +attempt to do disk IO directly to the guest's memory. QEMU may still perform an +internal copy of the data. 
+@item cache.no-flush +In case you don't care about data integrity over host failures, you can use +@option{cache.no-flush=on}. This option tells QEMU that it never needs to write +any data to the disk but can instead keep things in cache. If anything goes +wrong, like your host losing power, the disk storage getting disconnected +accidentally, etc. your image will most probably be rendered unusable. +@item discard=@var{discard} +@var{discard} is one of "ignore" (or "off") or "unmap" (or "on") and controls +whether @code{discard} (also known as @code{trim} or @code{unmap}) requests are +ignored or passed to the filesystem. Some machine types may not support +discard requests. +@item detect-zeroes=@var{detect-zeroes} +@var{detect-zeroes} is "off", "on" or "unmap" and enables the automatic +conversion of plain zero writes by the OS to driver specific optimized +zero write commands. You may even choose "unmap" if @var{discard} is set +to "unmap" to allow a zero write to be converted to an @code{unmap} operation. +@end table + +@end table + +ETEXI DEF("drive", HAS_ARG, QEMU_OPTION_drive, "-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n" @@ -630,7 +677,12 @@ STEXI @item -drive @var{option}[,@var{option}[,@var{option}[,...]]] @findex -drive -Define a new drive. Valid options are: +Define a new drive. This includes creating a block driver node (the backend) as +well as a guest device, and is mostly a shortcut for defining the corresponding +@option{-blockdev} and @option{-device} options. + +@option{-drive} accepts all options that are accepted by @option{-blockdev}. In +addition, it knows the following options: @table @option @item file=@var{file} @@ -657,11 +709,31 @@ These options have the same definition as they have in @option{-hdachs}. @var{snapshot} is "on" or "off" and controls snapshot mode for the given drive (see @option{-snapshot}). 
@item cache=@var{cache} -@var{cache} is "none", "writeback", "unsafe", "directsync" or "writethrough" and controls how the host cache is used to access block data. +@var{cache} is "none", "writeback", "unsafe", "directsync" or "writethrough" +and controls how the host cache is used to access block data. This is a +shortcut that sets the @option{cache.direct} and @option{cache.no-flush} +options (as in @option{-blockdev}), and additionally @option{cache.writeback}, +which provides a default for the @option{write-cache} option of block guest +devices (as in @option{-device}). The modes correspond to the following +settings: + +@c Our texi2pod.pl script doesn't support @multitable, so fall back to using +@c plain ASCII art (well, UTF-8 art really). This looks okay both in the manpage +@c and the HTML output. +@example +@ │ cache.writeback cache.direct cache.no-flush +─────────────┼───────────────────────────────────────────────── +writeback │ on off off +none │ on on off +writethrough │ off off off +directsync │ off on off +unsafe │ on off on +@end example + +The default mode is @option{cache=writeback}. + @item aio=@var{aio} @var{aio} is "threads", or "native" and selects between pthread based disk I/O and native Linux AIO. -@item discard=@var{discard} -@var{discard} is one of "ignore" (or "off") or "unmap" (or "on") and controls whether @dfn{discard} (also known as @dfn{trim} or @dfn{unmap}) requests are ignored or passed to the filesystem. Some machine types may not support discard requests. @item format=@var{format} Specify which disk @var{format} will be used rather than detecting the format. Can be used to specify format=raw to avoid interpreting @@ -676,16 +748,9 @@ Specify which @var{action} to take on write and read errors. Valid actions are: "report" (report the error to the guest), "enospc" (pause QEMU only if the host disk is full; report the error to the guest otherwise). The default setting is @option{werror=enospc} and @option{rerror=report}. 
-@item readonly -Open drive @option{file} as read-only. Guest write attempts will fail. @item copy-on-read=@var{copy-on-read} @var{copy-on-read} is "on" or "off" and enables whether to copy read backing file sectors into the image file. -@item detect-zeroes=@var{detect-zeroes} -@var{detect-zeroes} is "off", "on" or "unmap" and enables the automatic -conversion of plain zero writes by the OS to driver specific optimized -zero write commands. You may even choose "unmap" if @var{discard} is set -to "unmap" to allow a zero write to be converted to an UNMAP operation. @item bps=@var{b},bps_rd=@var{r},bps_wr=@var{w} Specify bandwidth throttling limits in bytes per second, either for all request types or for reads or writes only. Small values can lead to timeouts or hangs @@ -712,34 +777,19 @@ prevent guests from circumventing throttling limits by using many small disks instead of a single larger disk. @end table -By default, the @option{cache=writeback} mode is used. It will report data +By default, the @option{cache.writeback=on} mode is used. It will report data writes as completed as soon as the data is present in the host page cache. This is safe as long as your guest OS makes sure to correctly flush disk caches where needed. If your guest OS does not handle volatile disk write caches correctly and your host crashes or loses power, then the guest may experience data corruption. -For such guests, you should consider using @option{cache=writethrough}. This +For such guests, you should consider using @option{cache.writeback=off}. This means that the host page cache will be used to read and write data, but write notification will be sent to the guest only after QEMU has made sure to flush each write to the disk. Be aware that this has a major impact on performance. -The host page cache can be avoided entirely with @option{cache=none}. This will -attempt to do disk IO directly to the guest's memory. QEMU may still perform -an internal copy of the data. 
Note that this is considered a writeback mode and -the guest OS must handle the disk write cache correctly in order to avoid data -corruption on host crashes. - -The host page cache can be avoided while only sending write notifications to -the guest when the data has been flushed to the disk using -@option{cache=directsync}. - -In case you don't care about data integrity over host failures, use -@option{cache=unsafe}. This option tells QEMU that it never needs to write any -data to the disk but can instead keep things in cache. If anything goes wrong, -like your host losing power, the disk storage getting disconnected accidentally, -etc. your image will most probably be rendered unusable. When using -the @option{-snapshot} option, unsafe caching is always used. +When using the @option{-snapshot} option, unsafe caching is always used. Copy-on-read avoids accessing the same backing file sectors repeatedly and is useful when the backing file is over a slow network. By default copy-on-read From 370e8328d7c9e0959c31b3ea97f4a49252333307 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 22 Sep 2016 17:24:38 +0200 Subject: [PATCH 09/60] doc: Document driver-specific -blockdev options This documents the driver-specific options for the raw, qcow2 and file block drivers for the man page. For everything else, we refer to the QAPI documentation. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Max Reitz --- qemu-options.hx | 115 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 1 deletion(-) diff --git a/qemu-options.hx b/qemu-options.hx index db20866629..896ff177c3 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -614,7 +614,18 @@ STEXI @item -blockdev @var{option}[,@var{option}[,@var{option}[,...]]] @findex -blockdev -Define a new block driver node. +Define a new block driver node. Some of the options apply to all block drivers, +other options are only accepted for a specific block driver. 
See below for a +list of generic options and options for the most common block drivers. + +Options that expect a reference to another node (e.g. @code{file}) can be +given in two ways. Either you specify the node name of an already existing node +(file=@var{node-name}), or you define a new node inline, adding options +for the referenced node after a dot (file.filename=@var{path},file.aio=native). + +A block driver node created with @option{-blockdev} can be used for a guest +device by specifying its node name for the @code{drive} property in a +@option{-device} argument that defines a block device. @table @option @item Valid options for any block driver node: @@ -654,6 +665,108 @@ zero write commands. You may even choose "unmap" if @var{discard} is set to "unmap" to allow a zero write to be converted to an @code{unmap} operation. @end table +@item Driver-specific options for @code{file} + +This is the protocol-level block driver for accessing regular files. + +@table @code +@item filename +The path to the image file in the local filesystem +@item aio +Specifies the AIO backend (threads/native, default: threads) +@end table +Example: +@example +-blockdev driver=file,node-name=disk,filename=disk.img +@end example + +@item Driver-specific options for @code{raw} + +This is the image format block driver for raw images. It is usually +stacked on top of a protocol level block driver such as @code{file}. + +@table @code +@item file +Reference to or definition of the data source block driver node +(e.g. a @code{file} driver node) +@end table +Example 1: +@example +-blockdev driver=file,node-name=disk_file,filename=disk.img +-blockdev driver=raw,node-name=disk,file=disk_file +@end example +Example 2: +@example +-blockdev driver=raw,node-name=disk,file.driver=file,file.filename=disk.img +@end example + +@item Driver-specific options for @code{qcow2} + +This is the image format block driver for qcow2 images. 
It is usually +stacked on top of a protocol level block driver such as @code{file}. + +@table @code +@item file +Reference to or definition of the data source block driver node +(e.g. a @code{file} driver node) + +@item backing +Reference to or definition of the backing file block device (default is taken +from the image file). It is allowed to pass an empty string here in order to +disable the default backing file. + +@item lazy-refcounts +Whether to enable the lazy refcounts feature (on/off; default is taken from the +image file) + +@item cache-size +The maximum total size of the L2 table and refcount block caches in bytes +(default: 1048576 bytes or 8 clusters, whichever is larger) + +@item l2-cache-size +The maximum size of the L2 table cache in bytes +(default: 4/5 of the total cache size) + +@item refcount-cache-size +The maximum size of the refcount block cache in bytes +(default: 1/5 of the total cache size) + +@item cache-clean-interval +Clean unused entries in the L2 and refcount caches. The interval is in seconds. +The default value is 0 and it disables this feature. + +@item pass-discard-request +Whether discard requests to the qcow2 device should be forwarded to the data +source (on/off; default: on if discard=unmap is specified, off otherwise) + +@item pass-discard-snapshot +Whether discard requests for the data source should be issued when a snapshot +operation (e.g. deleting a snapshot) frees clusters in the qcow2 file (on/off; +default: on) + +@item pass-discard-other +Whether discard requests for the data source should be issued on other +occasions where a cluster gets freed (on/off; default: off) + +@item overlap-check +Which overlap checks to perform for writes to the image +(none/constant/cached/all; default: cached). For details or finer +granularity control refer to the QAPI documentation of @code{blockdev-add}. 
+@end table + +Example 1: +@example +-blockdev driver=file,node-name=my_file,filename=/tmp/disk.qcow2 +-blockdev driver=qcow2,node-name=hda,file=my_file,overlap-check=none,cache-size=16777216 +@end example +Example 2: +@example +-blockdev driver=qcow2,node-name=disk,file.driver=http,file.filename=http://example.com/image.qcow2 +@end example + +@item Driver-specific options for other drivers +Please refer to the QAPI documentation of the @code{blockdev-add} QMP command. + @end table ETEXI From 0d2fac8ede5623b5bb825bd002633cd65db5159e Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Wed, 14 Jun 2017 00:16:12 +0300 Subject: [PATCH 10/60] throttle: Update throttle-groups.c documentation There used to be throttle_timers_{detach,attach}_aio_context() calls in bdrv_set_aio_context(), but since 7ca7f0f6db1fedd28d490795d778cf239 they are now in blk_set_aio_context(). Signed-off-by: Alberto Garcia Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/throttle-groups.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/throttle-groups.c b/block/throttle-groups.c index a181cb1dee..da2b490c38 100644 --- a/block/throttle-groups.c +++ b/block/throttle-groups.c @@ -49,7 +49,7 @@ * Again, all this is handled internally and is mostly transparent to * the outside. The 'throttle_timers' field however has an additional * constraint because it may be temporarily invalid (see for example - * bdrv_set_aio_context()). Therefore in this file a thread will + * blk_set_aio_context()). Therefore in this file a thread will * access some other BlockBackend's timers only after verifying that * that BlockBackend has throttled requests in the queue. */ From 1575829d2aaced8ce6a5728d8e9fbbdee8f80885 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 15 Jun 2017 17:38:10 +0100 Subject: [PATCH 11/60] migration: hold AioContext lock for loadvm qemu_fclose() migration_incoming_state_destroy() uses qemu_fclose() on the vmstate file. 
Make sure to call it inside an AioContext acquire/release region. This fixes a 'qemu: qemu_mutex_unlock: Operation not permitted' abort in loadvm. This patch closes the vmstate file before ending the drained region. Previously we closed the vmstate file after ending the drained region. The order does not matter. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- migration/savevm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/savevm.c b/migration/savevm.c index b08df04d59..c7a49c93c5 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2300,11 +2300,11 @@ int load_snapshot(const char *name, Error **errp) aio_context_acquire(aio_context); ret = qemu_loadvm_state(f); + migration_incoming_state_destroy(); aio_context_release(aio_context); bdrv_drain_all_end(); - migration_incoming_state_destroy(); if (ret < 0) { error_setg(errp, "Error %d while loading VM state", ret); return ret; From 79645e0569e7a95f5c9bee67eb62b06daaed8099 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 15 Jun 2017 17:38:11 +0100 Subject: [PATCH 12/60] qemu-iotests: 068: extract _qemu() function Avoid duplicating the QEMU command-line. 
Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- tests/qemu-iotests/068 | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/qemu-iotests/068 b/tests/qemu-iotests/068 index 9c1687d01d..61936d5075 100755 --- a/tests/qemu-iotests/068 +++ b/tests/qemu-iotests/068 @@ -59,14 +59,17 @@ case "$QEMU_DEFAULT_MACHINE" in ;; esac +_qemu() +{ + $QEMU $platform_parm -nographic -monitor stdio -serial none -hda "$TEST_IMG" \ + "$@" |\ + _filter_qemu | _filter_hmp +} + # Give qemu some time to boot before saving the VM state -bash -c 'sleep 1; echo -e "savevm 0\nquit"' |\ - $QEMU $platform_parm -nographic -monitor stdio -serial none -hda "$TEST_IMG" |\ - _filter_qemu | _filter_hmp +bash -c 'sleep 1; echo -e "savevm 0\nquit"' | _qemu # Now try to continue from that VM state (this should just work) -echo quit |\ - $QEMU $platform_parm -nographic -monitor stdio -serial none -hda "$TEST_IMG" -loadvm 0 |\ - _filter_qemu | _filter_hmp +echo quit | _qemu -loadvm 0 # success, all done echo "*** done" From 5aaf590df4b075c18cf6f27a51900a126deb8634 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 15 Jun 2017 17:38:12 +0100 Subject: [PATCH 13/60] qemu-iotests: 068: use -drive/-device instead of -hda The legacy -hda option does not support -drive/-device parameters. They will be required by the next patch that extends this test case. 
Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- tests/qemu-iotests/068 | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/qemu-iotests/068 b/tests/qemu-iotests/068 index 61936d5075..7292643fdc 100755 --- a/tests/qemu-iotests/068 +++ b/tests/qemu-iotests/068 @@ -53,15 +53,20 @@ _make_test_img $IMG_SIZE case "$QEMU_DEFAULT_MACHINE" in s390-ccw-virtio) platform_parm="-no-shutdown" + hba=virtio-scsi-ccw ;; *) platform_parm="" + hba=virtio-scsi-pci ;; esac _qemu() { - $QEMU $platform_parm -nographic -monitor stdio -serial none -hda "$TEST_IMG" \ + $QEMU $platform_parm -nographic -monitor stdio -serial none \ + -drive if=none,id=drive0,file="$TEST_IMG",format="$IMGFMT" \ + -device $hba,id=hba0 \ + -device scsi-hd,drive=drive0 \ "$@" |\ _filter_qemu | _filter_hmp } From ea4f3cebc4e0224605ab9dd9724aa4e7768fe372 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 15 Jun 2017 17:38:13 +0100 Subject: [PATCH 14/60] qemu-iotests: 068: test iothread mode Perform the savevm/loadvm test with both iothread on and off. This covers the recently found savevm/loadvm hang when iothread is enabled. 
Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- tests/qemu-iotests/068 | 23 ++++++++++++++--------- tests/qemu-iotests/068.out | 11 ++++++++++- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/tests/qemu-iotests/068 b/tests/qemu-iotests/068 index 7292643fdc..3801b65b9d 100755 --- a/tests/qemu-iotests/068 +++ b/tests/qemu-iotests/068 @@ -45,11 +45,6 @@ _supported_os Linux IMGOPTS="compat=1.1" IMG_SIZE=128K -echo -echo "=== Saving and reloading a VM state to/from a qcow2 image ===" -echo -_make_test_img $IMG_SIZE - case "$QEMU_DEFAULT_MACHINE" in s390-ccw-virtio) platform_parm="-no-shutdown" @@ -71,10 +66,20 @@ _qemu() _filter_qemu | _filter_hmp } -# Give qemu some time to boot before saving the VM state -bash -c 'sleep 1; echo -e "savevm 0\nquit"' | _qemu -# Now try to continue from that VM state (this should just work) -echo quit | _qemu -loadvm 0 +for extra_args in \ + "" \ + "-object iothread,id=iothread0 -set device.hba0.iothread=iothread0"; do + echo + echo "=== Saving and reloading a VM state to/from a qcow2 image ($extra_args) ===" + echo + + _make_test_img $IMG_SIZE + + # Give qemu some time to boot before saving the VM state + bash -c 'sleep 1; echo -e "savevm 0\nquit"' | _qemu $extra_args + # Now try to continue from that VM state (this should just work) + echo quit | _qemu $extra_args -loadvm 0 +done # success, all done echo "*** done" diff --git a/tests/qemu-iotests/068.out b/tests/qemu-iotests/068.out index 0fa5340c22..aa063cf711 100644 --- a/tests/qemu-iotests/068.out +++ b/tests/qemu-iotests/068.out @@ -1,6 +1,15 @@ QA output created by 068 -=== Saving and reloading a VM state to/from a qcow2 image === +=== Saving and reloading a VM state to/from a qcow2 image () === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=131072 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) savevm 0 +(qemu) quit +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) quit + +=== Saving and reloading a VM state to/from a 
qcow2 image (-object iothread,id=iothread0 -set device.hba0.iothread=iothread0) === Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=131072 QEMU X.Y.Z monitor - type 'help' for more information From b2b2b67a0057407e19cfa3fdd9002db21ced8b01 Mon Sep 17 00:00:00 2001 From: Stephen Bates Date: Tue, 13 Jun 2017 04:08:35 -0600 Subject: [PATCH 15/60] nvme: Add support for Read Data and Write Data in CMBs. Add the ability for the NVMe model to support both the RDS and WDS modes in the Controller Memory Buffer. Although not currently supported in the upstreamed Linux kernel a fork with support exists [1] and user-space test programs that build on this also exist [2]. Useful for testing CMB functionality in preparation for real CMB enabled NVMe devices (coming soon). [1] https://github.com/sbates130272/linux-p2pmem [2] https://github.com/sbates130272/p2pmem-test Signed-off-by: Stephen Bates Reviewed-by: Logan Gunthorpe Reviewed-by: Keith Busch Signed-off-by: Kevin Wolf --- hw/block/nvme.c | 83 +++++++++++++++++++++++++++++++++---------------- hw/block/nvme.h | 1 + 2 files changed, 58 insertions(+), 26 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 381dc7c5fb..6071dc12d8 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -21,7 +21,7 @@ * cmb_size_mb= * * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at - * offset 0 in BAR2 and supports SQS only for now. + * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. 
*/ #include "qemu/osdep.h" @@ -93,8 +93,8 @@ static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq) } } -static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2, - uint32_t len, NvmeCtrl *n) +static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, + uint64_t prp2, uint32_t len, NvmeCtrl *n) { hwaddr trans_len = n->page_size - (prp1 % n->page_size); trans_len = MIN(len, trans_len); @@ -102,10 +102,15 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2, if (!prp1) { return NVME_INVALID_FIELD | NVME_DNR; + } else if (n->cmbsz && prp1 >= n->ctrl_mem.addr && + prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) { + qsg->nsg = 0; + qemu_iovec_init(iov, num_prps); + qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len); + } else { + pci_dma_sglist_init(qsg, &n->parent_obj, num_prps); + qemu_sglist_add(qsg, prp1, trans_len); } - - pci_dma_sglist_init(qsg, &n->parent_obj, num_prps); - qemu_sglist_add(qsg, prp1, trans_len); len -= trans_len; if (len) { if (!prp2) { @@ -118,7 +123,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2, nents = (len + n->page_size - 1) >> n->page_bits; prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t); - pci_dma_read(&n->parent_obj, prp2, (void *)prp_list, prp_trans); + nvme_addr_read(n, prp2, (void *)prp_list, prp_trans); while (len != 0) { uint64_t prp_ent = le64_to_cpu(prp_list[i]); @@ -130,7 +135,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2, i = 0; nents = (len + n->page_size - 1) >> n->page_bits; prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t); - pci_dma_read(&n->parent_obj, prp_ent, (void *)prp_list, + nvme_addr_read(n, prp_ent, (void *)prp_list, prp_trans); prp_ent = le64_to_cpu(prp_list[i]); } @@ -140,7 +145,11 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2, } trans_len = MIN(len, n->page_size); - qemu_sglist_add(qsg, prp_ent, 
trans_len); + if (qsg->nsg){ + qemu_sglist_add(qsg, prp_ent, trans_len); + } else { + qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len); + } len -= trans_len; i++; } @@ -148,7 +157,11 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2, if (prp2 & (n->page_size - 1)) { goto unmap; } - qemu_sglist_add(qsg, prp2, len); + if (qsg->nsg) { + qemu_sglist_add(qsg, prp2, len); + } else { + qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len); + } } } return NVME_SUCCESS; @@ -162,16 +175,24 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, uint64_t prp1, uint64_t prp2) { QEMUSGList qsg; + QEMUIOVector iov; + uint16_t status = NVME_SUCCESS; - if (nvme_map_prp(&qsg, prp1, prp2, len, n)) { + if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) { return NVME_INVALID_FIELD | NVME_DNR; } - if (dma_buf_read(ptr, len, &qsg)) { + if (qsg.nsg > 0) { + if (dma_buf_read(ptr, len, &qsg)) { + status = NVME_INVALID_FIELD | NVME_DNR; + } qemu_sglist_destroy(&qsg); - return NVME_INVALID_FIELD | NVME_DNR; + } else { + if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) { + status = NVME_INVALID_FIELD | NVME_DNR; + } + qemu_iovec_destroy(&iov); } - qemu_sglist_destroy(&qsg); - return NVME_SUCCESS; + return status; } static void nvme_post_cqes(void *opaque) @@ -285,20 +306,27 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, return NVME_LBA_RANGE | NVME_DNR; } - if (nvme_map_prp(&req->qsg, prp1, prp2, data_size, n)) { + if (nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, data_size, n)) { block_acct_invalid(blk_get_stats(n->conf.blk), acct); return NVME_INVALID_FIELD | NVME_DNR; } - assert((nlb << data_shift) == req->qsg.size); - - req->has_sg = true; dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct); - req->aiocb = is_write ? 
- dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE, - nvme_rw_cb, req) : - dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE, - nvme_rw_cb, req); + if (req->qsg.nsg > 0) { + req->has_sg = true; + req->aiocb = is_write ? + dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE, + nvme_rw_cb, req) : + dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE, + nvme_rw_cb, req); + } else { + req->has_sg = false; + req->aiocb = is_write ? + blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb, + req) : + blk_aio_preadv(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb, + req); + } return NVME_NO_COMPLETE; } @@ -987,11 +1015,14 @@ static int nvme_init(PCIDevice *pci_dev) NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1); NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0); NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0); - NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 0); - NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 0); + NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1); + NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1); NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */ NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->cmb_size_mb); + n->cmbloc = n->bar.cmbloc; + n->cmbsz = n->bar.cmbsz; + n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz)); memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n, "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz)); diff --git a/hw/block/nvme.h b/hw/block/nvme.h index b4961d2547..6aab338ff5 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -712,6 +712,7 @@ typedef struct NvmeRequest { NvmeCqe cqe; BlockAcctCookie acct; QEMUSGList qsg; + QEMUIOVector iov; QTAILQ_ENTRY(NvmeRequest)entry; } NvmeRequest; From 026ac1586bdbd184e24082aa2bbab1fa3c48456b Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 19 Jun 2017 16:40:02 +0300 Subject: [PATCH 16/60] qcow2: Remove unused Error variable in do_perform_cow() We are using the return value of qcow2_encrypt_sectors() to detect problems but we are throwing away the returned Error since we have no way to report it to 
the user. Therefore we can simply get rid of the local Error variable and pass NULL instead. Alternatively we could try to figure out a way to pass the original error instead of simply returning -EIO, but that would be more invasive, so let's keep the current approach. Signed-off-by: Alberto Garcia Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block/qcow2-cluster.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index d779ea19cf..d1c419f52b 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -440,16 +440,14 @@ static int coroutine_fn do_perform_cow(BlockDriverState *bs, } if (bs->encrypted) { - Error *err = NULL; int64_t sector = (src_cluster_offset + offset_in_cluster) >> BDRV_SECTOR_BITS; assert(s->cipher); assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0); assert((bytes & ~BDRV_SECTOR_MASK) == 0); if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base, - bytes >> BDRV_SECTOR_BITS, true, &err) < 0) { + bytes >> BDRV_SECTOR_BITS, true, NULL) < 0) { ret = -EIO; - error_free(err); goto out; } } From e034f5bcbc1139903b27c00bd832ee7c4b065810 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 19 Jun 2017 16:40:03 +0300 Subject: [PATCH 17/60] qcow2: Use unsigned int for both members of Qcow2COWRegion Qcow2COWRegion has two attributes: - The offset of the COW region from the start of the first cluster touched by the I/O request. Since it's always going to be positive and the maximum request size is at most INT_MAX, we can use a regular unsigned int to store this offset. - The size of the COW region in bytes. This is guaranteed to be >= 0, so we should use an unsigned type instead. In x86_64 this reduces the size of Qcow2COWRegion from 16 to 8 bytes. It will also help keep some assertions simpler now that we know that there are no negative numbers. The prototype of do_perform_cow() is also updated to reflect these changes. 
Signed-off-by: Alberto Garcia Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block/qcow2-cluster.c | 4 ++-- block/qcow2.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index d1c419f52b..a86c5a75a9 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -406,8 +406,8 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, static int coroutine_fn do_perform_cow(BlockDriverState *bs, uint64_t src_cluster_offset, uint64_t cluster_offset, - int offset_in_cluster, - int bytes) + unsigned offset_in_cluster, + unsigned bytes) { BDRVQcow2State *s = bs->opaque; QEMUIOVector qiov; diff --git a/block/qcow2.h b/block/qcow2.h index 1801dc30dc..c26ee0a33d 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -301,10 +301,10 @@ typedef struct Qcow2COWRegion { * Offset of the COW region in bytes from the start of the first cluster * touched by the request. */ - uint64_t offset; + unsigned offset; /** Number of bytes to copy */ - int nb_bytes; + unsigned nb_bytes; } Qcow2COWRegion; /** From 99450c6fb9ac85e2c8097156fed78b8ef1d610e0 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 19 Jun 2017 16:40:04 +0300 Subject: [PATCH 18/60] qcow2: Make perform_cow() call do_perform_cow() twice Instead of calling perform_cow() twice with a different COW region each time, call it just once and make perform_cow() handle both regions. This patch simply moves code around. The next one will do the actual reordering of the COW operations. 
Signed-off-by: Alberto Garcia Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block/qcow2-cluster.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index a86c5a75a9..4c03639a72 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -414,6 +414,10 @@ static int coroutine_fn do_perform_cow(BlockDriverState *bs, struct iovec iov; int ret; + if (bytes == 0) { + return 0; + } + iov.iov_len = bytes; iov.iov_base = qemu_try_blockalign(bs, iov.iov_len); if (iov.iov_base == NULL) { @@ -751,31 +755,40 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, return cluster_offset; } -static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r) +static int perform_cow(BlockDriverState *bs, QCowL2Meta *m) { BDRVQcow2State *s = bs->opaque; + Qcow2COWRegion *start = &m->cow_start; + Qcow2COWRegion *end = &m->cow_end; int ret; - if (r->nb_bytes == 0) { + if (start->nb_bytes == 0 && end->nb_bytes == 0) { return 0; } qemu_co_mutex_unlock(&s->lock); - ret = do_perform_cow(bs, m->offset, m->alloc_offset, r->offset, r->nb_bytes); - qemu_co_mutex_lock(&s->lock); - + ret = do_perform_cow(bs, m->offset, m->alloc_offset, + start->offset, start->nb_bytes); if (ret < 0) { - return ret; + goto fail; } + ret = do_perform_cow(bs, m->offset, m->alloc_offset, + end->offset, end->nb_bytes); + +fail: + qemu_co_mutex_lock(&s->lock); + /* * Before we update the L2 table to actually point to the new cluster, we * need to be sure that the refcounts have been increased and COW was * handled. 
*/ - qcow2_cache_depends_on_flush(s->l2_table_cache); + if (ret == 0) { + qcow2_cache_depends_on_flush(s->l2_table_cache); + } - return 0; + return ret; } int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) @@ -795,12 +808,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) } /* copy content of unmodified sectors */ - ret = perform_cow(bs, m, &m->cow_start); - if (ret < 0) { - goto err; - } - - ret = perform_cow(bs, m, &m->cow_end); + ret = perform_cow(bs, m); if (ret < 0) { goto err; } From 672f0f2c4b5ee521f5ea06eadaacf8dfa99474f4 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 19 Jun 2017 16:40:05 +0300 Subject: [PATCH 19/60] qcow2: Split do_perform_cow() into _read(), _encrypt() and _write() This patch splits do_perform_cow() into three separate functions to read, encrypt and write the COW regions. perform_cow() can now read both regions first, then encrypt them and finally write them to disk. The memory allocation is also done in this function now, using one single buffer large enough to hold both regions. 
Signed-off-by: Alberto Garcia Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block/qcow2-cluster.c | 117 +++++++++++++++++++++++++++++++----------- 1 file changed, 87 insertions(+), 30 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 4c03639a72..3c9ace8a96 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -403,34 +403,26 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, return 0; } -static int coroutine_fn do_perform_cow(BlockDriverState *bs, - uint64_t src_cluster_offset, - uint64_t cluster_offset, - unsigned offset_in_cluster, - unsigned bytes) +static int coroutine_fn do_perform_cow_read(BlockDriverState *bs, + uint64_t src_cluster_offset, + unsigned offset_in_cluster, + uint8_t *buffer, + unsigned bytes) { - BDRVQcow2State *s = bs->opaque; QEMUIOVector qiov; - struct iovec iov; + struct iovec iov = { .iov_base = buffer, .iov_len = bytes }; int ret; if (bytes == 0) { return 0; } - iov.iov_len = bytes; - iov.iov_base = qemu_try_blockalign(bs, iov.iov_len); - if (iov.iov_base == NULL) { - return -ENOMEM; - } - qemu_iovec_init_external(&qiov, &iov, 1); BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); if (!bs->drv) { - ret = -ENOMEDIUM; - goto out; + return -ENOMEDIUM; } /* Call .bdrv_co_readv() directly instead of using the public block-layer @@ -440,39 +432,63 @@ static int coroutine_fn do_perform_cow(BlockDriverState *bs, ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster, bytes, &qiov, 0); if (ret < 0) { - goto out; + return ret; } - if (bs->encrypted) { + return 0; +} + +static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs, + uint64_t src_cluster_offset, + unsigned offset_in_cluster, + uint8_t *buffer, + unsigned bytes) +{ + if (bytes && bs->encrypted) { + BDRVQcow2State *s = bs->opaque; int64_t sector = (src_cluster_offset + offset_in_cluster) >> BDRV_SECTOR_BITS; assert(s->cipher); assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0); assert((bytes & 
~BDRV_SECTOR_MASK) == 0); - if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base, + if (qcow2_encrypt_sectors(s, sector, buffer, buffer, bytes >> BDRV_SECTOR_BITS, true, NULL) < 0) { - ret = -EIO; - goto out; + return false; } } + return true; +} + +static int coroutine_fn do_perform_cow_write(BlockDriverState *bs, + uint64_t cluster_offset, + unsigned offset_in_cluster, + uint8_t *buffer, + unsigned bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { .iov_base = buffer, .iov_len = bytes }; + int ret; + + if (bytes == 0) { + return 0; + } + + qemu_iovec_init_external(&qiov, &iov, 1); ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset + offset_in_cluster, bytes); if (ret < 0) { - goto out; + return ret; } BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster, bytes, &qiov, 0); if (ret < 0) { - goto out; + return ret; } - ret = 0; -out: - qemu_vfree(iov.iov_base); - return ret; + return 0; } @@ -760,22 +776,62 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m) BDRVQcow2State *s = bs->opaque; Qcow2COWRegion *start = &m->cow_start; Qcow2COWRegion *end = &m->cow_end; + unsigned buffer_size; + uint8_t *start_buffer, *end_buffer; int ret; + assert(start->nb_bytes <= UINT_MAX - end->nb_bytes); + if (start->nb_bytes == 0 && end->nb_bytes == 0) { return 0; } + /* Reserve a buffer large enough to store the data from both the + * start and end COW regions. 
Add some padding in the middle if + * necessary to make sure that the end region is optimally aligned */ + buffer_size = QEMU_ALIGN_UP(start->nb_bytes, bdrv_opt_mem_align(bs)) + + end->nb_bytes; + start_buffer = qemu_try_blockalign(bs, buffer_size); + if (start_buffer == NULL) { + return -ENOMEM; + } + /* The part of the buffer where the end region is located */ + end_buffer = start_buffer + buffer_size - end->nb_bytes; + qemu_co_mutex_unlock(&s->lock); - ret = do_perform_cow(bs, m->offset, m->alloc_offset, - start->offset, start->nb_bytes); + /* First we read the existing data from both COW regions */ + ret = do_perform_cow_read(bs, m->offset, start->offset, + start_buffer, start->nb_bytes); if (ret < 0) { goto fail; } - ret = do_perform_cow(bs, m->offset, m->alloc_offset, - end->offset, end->nb_bytes); + ret = do_perform_cow_read(bs, m->offset, end->offset, + end_buffer, end->nb_bytes); + if (ret < 0) { + goto fail; + } + /* Encrypt the data if necessary before writing it */ + if (bs->encrypted) { + if (!do_perform_cow_encrypt(bs, m->offset, start->offset, + start_buffer, start->nb_bytes) || + !do_perform_cow_encrypt(bs, m->offset, end->offset, + end_buffer, end->nb_bytes)) { + ret = -EIO; + goto fail; + } + } + + /* And now we can write everything */ + ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, + start_buffer, start->nb_bytes); + if (ret < 0) { + goto fail; + } + + ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, + end_buffer, end->nb_bytes); fail: qemu_co_mutex_lock(&s->lock); @@ -788,6 +844,7 @@ fail: qcow2_cache_depends_on_flush(s->l2_table_cache); } + qemu_vfree(start_buffer); return ret; } From b3cf1c7cf8714af96def49668a267fa4075242ca Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 19 Jun 2017 16:40:06 +0300 Subject: [PATCH 20/60] qcow2: Allow reading both COW regions with only one request Reading both COW regions requires two separate requests, but it's perfectly possible to merge them and perform only one. 
This generally improves performance, particularly on rotating disk drives. The downside is that the data in the middle region is read but discarded. This patch takes a conservative approach and only merges reads when the size of the middle region is <= 16KB. Signed-off-by: Alberto Garcia Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block/qcow2-cluster.c | 51 ++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 3c9ace8a96..20fb531932 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -777,20 +777,38 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m) Qcow2COWRegion *start = &m->cow_start; Qcow2COWRegion *end = &m->cow_end; unsigned buffer_size; + unsigned data_bytes = end->offset - (start->offset + start->nb_bytes); + bool merge_reads; uint8_t *start_buffer, *end_buffer; int ret; assert(start->nb_bytes <= UINT_MAX - end->nb_bytes); + assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes); + assert(start->offset + start->nb_bytes <= end->offset); if (start->nb_bytes == 0 && end->nb_bytes == 0) { return 0; } - /* Reserve a buffer large enough to store the data from both the - * start and end COW regions. Add some padding in the middle if - * necessary to make sure that the end region is optimally aligned */ - buffer_size = QEMU_ALIGN_UP(start->nb_bytes, bdrv_opt_mem_align(bs)) + - end->nb_bytes; + /* If we have to read both the start and end COW regions and the + * middle region is not too large then perform just one read + * operation */ + merge_reads = start->nb_bytes && end->nb_bytes && data_bytes <= 16384; + if (merge_reads) { + buffer_size = start->nb_bytes + data_bytes + end->nb_bytes; + } else { + /* If we have to do two reads, add some padding in the middle + * if necessary to make sure that the end region is optimally + * aligned. 
*/ + size_t align = bdrv_opt_mem_align(bs); + assert(align > 0 && align <= UINT_MAX); + assert(QEMU_ALIGN_UP(start->nb_bytes, align) <= + UINT_MAX - end->nb_bytes); + buffer_size = QEMU_ALIGN_UP(start->nb_bytes, align) + end->nb_bytes; + } + + /* Reserve a buffer large enough to store all the data that we're + * going to read */ start_buffer = qemu_try_blockalign(bs, buffer_size); if (start_buffer == NULL) { return -ENOMEM; @@ -799,15 +817,22 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m) end_buffer = start_buffer + buffer_size - end->nb_bytes; qemu_co_mutex_unlock(&s->lock); - /* First we read the existing data from both COW regions */ - ret = do_perform_cow_read(bs, m->offset, start->offset, - start_buffer, start->nb_bytes); - if (ret < 0) { - goto fail; - } + /* First we read the existing data from both COW regions. We + * either read the whole region in one go, or the start and end + * regions separately. */ + if (merge_reads) { + ret = do_perform_cow_read(bs, m->offset, start->offset, + start_buffer, buffer_size); + } else { + ret = do_perform_cow_read(bs, m->offset, start->offset, + start_buffer, start->nb_bytes); + if (ret < 0) { + goto fail; + } - ret = do_perform_cow_read(bs, m->offset, end->offset, - end_buffer, end->nb_bytes); + ret = do_perform_cow_read(bs, m->offset, end->offset, + end_buffer, end->nb_bytes); + } if (ret < 0) { goto fail; } From 86b862c431ae5effa80a095c9c989a5a9976ead1 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 19 Jun 2017 16:40:07 +0300 Subject: [PATCH 21/60] qcow2: Pass a QEMUIOVector to do_perform_cow_{read,write}() Instead of passing a single buffer pointer to do_perform_cow_write(), pass a QEMUIOVector. This will allow us to merge the write requests for the COW regions and the actual data into a single one. Although do_perform_cow_read() does not strictly need to change its API, we're doing it here as well for consistency. 
Signed-off-by: Alberto Garcia Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block/qcow2-cluster.c | 51 ++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 20fb531932..3ac26d6bf7 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -406,19 +406,14 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, static int coroutine_fn do_perform_cow_read(BlockDriverState *bs, uint64_t src_cluster_offset, unsigned offset_in_cluster, - uint8_t *buffer, - unsigned bytes) + QEMUIOVector *qiov) { - QEMUIOVector qiov; - struct iovec iov = { .iov_base = buffer, .iov_len = bytes }; int ret; - if (bytes == 0) { + if (qiov->size == 0) { return 0; } - qemu_iovec_init_external(&qiov, &iov, 1); - BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); if (!bs->drv) { @@ -430,7 +425,7 @@ static int coroutine_fn do_perform_cow_read(BlockDriverState *bs, * which can lead to deadlock when block layer copy-on-read is enabled. 
*/ ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster, - bytes, &qiov, 0); + qiov->size, qiov, 0); if (ret < 0) { return ret; } @@ -462,28 +457,23 @@ static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs, static int coroutine_fn do_perform_cow_write(BlockDriverState *bs, uint64_t cluster_offset, unsigned offset_in_cluster, - uint8_t *buffer, - unsigned bytes) + QEMUIOVector *qiov) { - QEMUIOVector qiov; - struct iovec iov = { .iov_base = buffer, .iov_len = bytes }; int ret; - if (bytes == 0) { + if (qiov->size == 0) { return 0; } - qemu_iovec_init_external(&qiov, &iov, 1); - ret = qcow2_pre_write_overlap_check(bs, 0, - cluster_offset + offset_in_cluster, bytes); + cluster_offset + offset_in_cluster, qiov->size); if (ret < 0) { return ret; } BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster, - bytes, &qiov, 0); + qiov->size, qiov, 0); if (ret < 0) { return ret; } @@ -780,6 +770,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m) unsigned data_bytes = end->offset - (start->offset + start->nb_bytes); bool merge_reads; uint8_t *start_buffer, *end_buffer; + QEMUIOVector qiov; int ret; assert(start->nb_bytes <= UINT_MAX - end->nb_bytes); @@ -816,22 +807,25 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m) /* The part of the buffer where the end region is located */ end_buffer = start_buffer + buffer_size - end->nb_bytes; + qemu_iovec_init(&qiov, 1); + qemu_co_mutex_unlock(&s->lock); /* First we read the existing data from both COW regions. We * either read the whole region in one go, or the start and end * regions separately. 
*/ if (merge_reads) { - ret = do_perform_cow_read(bs, m->offset, start->offset, - start_buffer, buffer_size); + qemu_iovec_add(&qiov, start_buffer, buffer_size); + ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov); } else { - ret = do_perform_cow_read(bs, m->offset, start->offset, - start_buffer, start->nb_bytes); + qemu_iovec_add(&qiov, start_buffer, start->nb_bytes); + ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov); if (ret < 0) { goto fail; } - ret = do_perform_cow_read(bs, m->offset, end->offset, - end_buffer, end->nb_bytes); + qemu_iovec_reset(&qiov); + qemu_iovec_add(&qiov, end_buffer, end->nb_bytes); + ret = do_perform_cow_read(bs, m->offset, end->offset, &qiov); } if (ret < 0) { goto fail; @@ -849,14 +843,16 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m) } /* And now we can write everything */ - ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, - start_buffer, start->nb_bytes); + qemu_iovec_reset(&qiov); + qemu_iovec_add(&qiov, start_buffer, start->nb_bytes); + ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov); if (ret < 0) { goto fail; } - ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, - end_buffer, end->nb_bytes); + qemu_iovec_reset(&qiov); + qemu_iovec_add(&qiov, end_buffer, end->nb_bytes); + ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov); fail: qemu_co_mutex_lock(&s->lock); @@ -870,6 +866,7 @@ fail: } qemu_vfree(start_buffer); + qemu_iovec_destroy(&qiov); return ret; } From ee22a9d86921310672aa8775489217f3e2f5e1c6 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 19 Jun 2017 16:40:08 +0300 Subject: [PATCH 22/60] qcow2: Merge the writing of the COW regions with the guest data If the guest tries to write data that results on the allocation of a new cluster, instead of writing the guest data first and then the data from the COW regions, write everything together using one single I/O operation. 
This can improve the write performance by 25% or more, depending on several factors such as the media type, the cluster size and the I/O request size. Signed-off-by: Alberto Garcia Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block/qcow2-cluster.c | 40 ++++++++++++++++++++------- block/qcow2.c | 64 ++++++++++++++++++++++++++++++++++++------- block/qcow2.h | 7 +++++ 3 files changed, 91 insertions(+), 20 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 3ac26d6bf7..01f210187c 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -776,6 +776,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m) assert(start->nb_bytes <= UINT_MAX - end->nb_bytes); assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes); assert(start->offset + start->nb_bytes <= end->offset); + assert(!m->data_qiov || m->data_qiov->size == data_bytes); if (start->nb_bytes == 0 && end->nb_bytes == 0) { return 0; @@ -807,7 +808,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m) /* The part of the buffer where the end region is located */ end_buffer = start_buffer + buffer_size - end->nb_bytes; - qemu_iovec_init(&qiov, 1); + qemu_iovec_init(&qiov, 2 + (m->data_qiov ? m->data_qiov->niov : 0)); qemu_co_mutex_unlock(&s->lock); /* First we read the existing data from both COW regions. We @@ -842,17 +843,36 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m) } } - /* And now we can write everything */ - qemu_iovec_reset(&qiov); - qemu_iovec_add(&qiov, start_buffer, start->nb_bytes); - ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov); - if (ret < 0) { - goto fail; + /* And now we can write everything. 
If we have the guest data we + * can write everything in one single operation */ + if (m->data_qiov) { + qemu_iovec_reset(&qiov); + if (start->nb_bytes) { + qemu_iovec_add(&qiov, start_buffer, start->nb_bytes); + } + qemu_iovec_concat(&qiov, m->data_qiov, 0, data_bytes); + if (end->nb_bytes) { + qemu_iovec_add(&qiov, end_buffer, end->nb_bytes); + } + /* NOTE: we have a write_aio blkdebug event here followed by + * a cow_write one in do_perform_cow_write(), but there's only + * one single I/O operation */ + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); + ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov); + } else { + /* If there's no guest data then write both COW regions separately */ + qemu_iovec_reset(&qiov); + qemu_iovec_add(&qiov, start_buffer, start->nb_bytes); + ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov); + if (ret < 0) { + goto fail; + } + + qemu_iovec_reset(&qiov); + qemu_iovec_add(&qiov, end_buffer, end->nb_bytes); + ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov); } - qemu_iovec_reset(&qiov); - qemu_iovec_add(&qiov, end_buffer, end->nb_bytes); - ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov); fail: qemu_co_mutex_lock(&s->lock); diff --git a/block/qcow2.c b/block/qcow2.c index b3ba5daa93..328b1d4fb5 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1575,6 +1575,44 @@ fail: return ret; } +/* Check if it's possible to merge a write request with the writing of + * the data from the COW regions */ +static bool merge_cow(uint64_t offset, unsigned bytes, + QEMUIOVector *hd_qiov, QCowL2Meta *l2meta) +{ + QCowL2Meta *m; + + for (m = l2meta; m != NULL; m = m->next) { + /* If both COW regions are empty then there's nothing to merge */ + if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) { + continue; + } + + /* The data (middle) region must be immediately after the + * start region */ + if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) { + continue; + } + + /* The 
end region must be immediately after the data (middle) + * region */ + if (m->offset + m->cow_end.offset != offset + bytes) { + continue; + } + + /* Make sure that adding both COW regions to the QEMUIOVector + * does not exceed IOV_MAX */ + if (hd_qiov->niov > IOV_MAX - 2) { + continue; + } + + m->data_qiov = hd_qiov; + return true; + } + + return false; +} + static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) @@ -1657,16 +1695,22 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, goto fail; } - qemu_co_mutex_unlock(&s->lock); - BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); - trace_qcow2_writev_data(qemu_coroutine_self(), - cluster_offset + offset_in_cluster); - ret = bdrv_co_pwritev(bs->file, - cluster_offset + offset_in_cluster, - cur_bytes, &hd_qiov, 0); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - goto fail; + /* If we need to do COW, check if it's possible to merge the + * writing of the guest data together with that of the COW regions. + * If it's not possible (or not necessary) then write the + * guest data now. */ + if (!merge_cow(offset, cur_bytes, &hd_qiov, l2meta)) { + qemu_co_mutex_unlock(&s->lock); + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); + trace_qcow2_writev_data(qemu_coroutine_self(), + cluster_offset + offset_in_cluster); + ret = bdrv_co_pwritev(bs->file, + cluster_offset + offset_in_cluster, + cur_bytes, &hd_qiov, 0); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } } while (l2meta != NULL) { diff --git a/block/qcow2.h b/block/qcow2.h index c26ee0a33d..87b15eb4aa 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -343,6 +343,13 @@ typedef struct QCowL2Meta */ Qcow2COWRegion cow_end; + /** + * The I/O vector with the data from the actual guest write request. + * If non-NULL, this is meant to be merged together with the data + * from @cow_start and @cow_end into one single write operation. 
+ */ + QEMUIOVector *data_qiov; + /** Pointer to next L2Meta of the same write request */ struct QCowL2Meta *next; From 24990c5b959c3a24d76ccf96303c1f70556f1dd2 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Tue, 20 Jun 2017 16:01:36 +0300 Subject: [PATCH 23/60] qcow2: Use offset_into_cluster() and offset_to_l2_index() We already have functions for doing these calculations, so let's use them instead of doing everything by hand. This makes the code a bit more readable. Signed-off-by: Alberto Garcia Signed-off-by: Kevin Wolf --- block/qcow2-cluster.c | 4 ++-- block/qcow2.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 01f210187c..3d341fd9cb 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -556,7 +556,7 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, /* find the cluster offset for the given disk offset */ - l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + l2_index = offset_to_l2_index(s, offset); *cluster_offset = be64_to_cpu(l2_table[l2_index]); nb_clusters = size_to_clusters(s, bytes_needed); @@ -693,7 +693,7 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset, /* find the cluster offset for the given disk offset */ - l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + l2_index = offset_to_l2_index(s, offset); *new_l2_table = l2_table; *new_l2_index = l2_index; diff --git a/block/qcow2.c b/block/qcow2.c index 328b1d4fb5..088ffe1673 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -356,7 +356,7 @@ static int validate_table_offset(BlockDriverState *bs, uint64_t offset, } /* Tables must be cluster aligned */ - if (offset & (s->cluster_size - 1)) { + if (offset_into_cluster(s, offset) != 0) { return -EINVAL; } From 3b7cd9fd8fc2d34da6a42f49421e3549918adf58 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 16 Nov 2016 17:31:14 +0100 Subject: [PATCH 24/60] qed: Use bottom half to resume waiting requests The qed 
driver serialises allocating write requests. When the active allocation is finished, the AIO callback is called, but after this, the next allocating request is immediately processed instead of leaving the coroutine. Resuming another allocation request in the same request coroutine means that the request now runs in the wrong coroutine. The following is one of the possible effects of this: The completed request will generally reenter its request coroutine in a bottom half, expecting that it completes the request in bdrv_driver_pwritev(). However, if the second request actually yielded before leaving the coroutine, the reused request coroutine is in an entirely different place and is reentered prematurely. Not a good idea. Let's make sure that we exit the coroutine after completing the first request by resuming the next allocating request only with a bottom half. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi --- block/qed.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/block/qed.c b/block/qed.c index 8d899fd479..a837a28655 100644 --- a/block/qed.c +++ b/block/qed.c @@ -967,6 +967,11 @@ static void qed_aio_complete_bh(void *opaque) qed_release(s); } +static void qed_resume_alloc_bh(void *opaque) +{ + qed_aio_start_io(opaque); +} + static void qed_aio_complete(QEDAIOCB *acb, int ret) { BDRVQEDState *s = acb_to_s(acb); @@ -995,10 +1000,12 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret) * requests multiple times but rather finish one at a time completely. 
*/ if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { + QEDAIOCB *next_acb; QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next); - acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); - if (acb) { - qed_aio_start_io(acb); + next_acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); + if (next_acb) { + aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs), + qed_resume_alloc_bh, next_acb); } else if (s->header.features & QED_F_NEED_CHECK) { qed_start_need_check_timer(s); } From 11273076e96829a48b2773327d6488f1d61901a2 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 14 Nov 2016 14:56:32 +0100 Subject: [PATCH 25/60] qed: Make qed_read_table() synchronous Note that this code is generally not running in coroutine context, so this is an actual blocking synchronous operation. We'll fix this in a moment. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi --- block/qed-table.c | 58 ++++++++++++++++------------------------------- 1 file changed, 19 insertions(+), 39 deletions(-) diff --git a/block/qed-table.c b/block/qed-table.c index b12c298a8a..f3305380f3 100644 --- a/block/qed-table.c +++ b/block/qed-table.c @@ -18,59 +18,39 @@ #include "qed.h" #include "qemu/bswap.h" -typedef struct { - GenericCB gencb; - BDRVQEDState *s; - QEDTable *table; - - struct iovec iov; - QEMUIOVector qiov; -} QEDReadTableCB; - -static void qed_read_table_cb(void *opaque, int ret) +static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, + BlockCompletionFunc *cb, void *opaque) { - QEDReadTableCB *read_table_cb = opaque; - QEDTable *table = read_table_cb->table; - BDRVQEDState *s = read_table_cb->s; - int noffsets = read_table_cb->qiov.size / sizeof(uint64_t); - int i; + QEMUIOVector qiov; + int noffsets; + int i, ret; - /* Handle I/O error */ - if (ret) { + struct iovec iov = { + .iov_base = table->offsets, + .iov_len = s->header.cluster_size * s->header.table_size, + }; + qemu_iovec_init_external(&qiov, &iov, 1); + + 
trace_qed_read_table(s, offset, table); + + ret = bdrv_preadv(s->bs->file, offset, &qiov); + if (ret < 0) { goto out; } /* Byteswap offsets */ qed_acquire(s); + noffsets = qiov.size / sizeof(uint64_t); for (i = 0; i < noffsets; i++) { table->offsets[i] = le64_to_cpu(table->offsets[i]); } qed_release(s); + ret = 0; out: /* Completion */ - trace_qed_read_table_cb(s, read_table_cb->table, ret); - gencb_complete(&read_table_cb->gencb, ret); -} - -static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, - BlockCompletionFunc *cb, void *opaque) -{ - QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb), - cb, opaque); - QEMUIOVector *qiov = &read_table_cb->qiov; - - trace_qed_read_table(s, offset, table); - - read_table_cb->s = s; - read_table_cb->table = table; - read_table_cb->iov.iov_base = table->offsets, - read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size, - - qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); - bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov, - qiov->size / BDRV_SECTOR_SIZE, - qed_read_table_cb, read_table_cb); + trace_qed_read_table_cb(s, table, ret); + cb(opaque, ret); } typedef struct { From f6513529c67509264fc602c4a537ec24bcb7b26f Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 14 Nov 2016 16:08:44 +0100 Subject: [PATCH 26/60] qed: Remove callback from qed_read_table() Instead of passing the return value to a callback, return it to the caller so that the callback can be inlined there. 
Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi --- block/qed-table.c | 79 +++++++++++++++-------------------------------- 1 file changed, 25 insertions(+), 54 deletions(-) diff --git a/block/qed-table.c b/block/qed-table.c index f3305380f3..427000318d 100644 --- a/block/qed-table.c +++ b/block/qed-table.c @@ -18,8 +18,7 @@ #include "qed.h" #include "qemu/bswap.h" -static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, - BlockCompletionFunc *cb, void *opaque) +static int qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table) { QEMUIOVector qiov; int noffsets; @@ -50,7 +49,7 @@ static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, out: /* Completion */ trace_qed_read_table_cb(s, table, ret); - cb(opaque, ret); + return ret; } typedef struct { @@ -156,13 +155,7 @@ static void qed_sync_cb(void *opaque, int ret) int qed_read_l1_table_sync(BDRVQEDState *s) { - int ret = -EINPROGRESS; - - qed_read_table(s, s->header.l1_table_offset, - s->l1_table, qed_sync_cb, &ret); - BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS); - - return ret; + return qed_read_table(s, s->header.l1_table_offset, s->l1_table); } void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, @@ -184,46 +177,10 @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, return ret; } -typedef struct { - GenericCB gencb; - BDRVQEDState *s; - uint64_t l2_offset; - QEDRequest *request; -} QEDReadL2TableCB; - -static void qed_read_l2_table_cb(void *opaque, int ret) -{ - QEDReadL2TableCB *read_l2_table_cb = opaque; - QEDRequest *request = read_l2_table_cb->request; - BDRVQEDState *s = read_l2_table_cb->s; - CachedL2Table *l2_table = request->l2_table; - uint64_t l2_offset = read_l2_table_cb->l2_offset; - - qed_acquire(s); - if (ret) { - /* can't trust loaded L2 table anymore */ - qed_unref_l2_cache_entry(l2_table); - request->l2_table = NULL; - } else { - l2_table->offset = l2_offset; - - 
qed_commit_l2_cache_entry(&s->l2_cache, l2_table); - - /* This is guaranteed to succeed because we just committed the entry - * to the cache. - */ - request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); - assert(request->l2_table != NULL); - } - qed_release(s); - - gencb_complete(&read_l2_table_cb->gencb, ret); -} - void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, BlockCompletionFunc *cb, void *opaque) { - QEDReadL2TableCB *read_l2_table_cb; + int ret; qed_unref_l2_cache_entry(request->l2_table); @@ -237,14 +194,28 @@ void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); request->l2_table->table = qed_alloc_table(s); - read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque); - read_l2_table_cb->s = s; - read_l2_table_cb->l2_offset = offset; - read_l2_table_cb->request = request; - BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD); - qed_read_table(s, offset, request->l2_table->table, - qed_read_l2_table_cb, read_l2_table_cb); + ret = qed_read_table(s, offset, request->l2_table->table); + + qed_acquire(s); + if (ret) { + /* can't trust loaded L2 table anymore */ + qed_unref_l2_cache_entry(request->l2_table); + request->l2_table = NULL; + } else { + request->l2_table->offset = offset; + + qed_commit_l2_cache_entry(&s->l2_cache, request->l2_table); + + /* This is guaranteed to succeed because we just committed the entry + * to the cache. 
+ */ + request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset); + assert(request->l2_table != NULL); + } + qed_release(s); + + cb(opaque, ret); } int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset) From a8165d2d6619a29b99451fd0a310d64693d86c3f Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 14 Nov 2016 16:26:14 +0100 Subject: [PATCH 27/60] qed: Remove callback from qed_read_l2_table() Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi --- block/qed-cluster.c | 94 +++++++++++++++------------------------------ block/qed-table.c | 15 ++------ block/qed.h | 3 +- 3 files changed, 36 insertions(+), 76 deletions(-) diff --git a/block/qed-cluster.c b/block/qed-cluster.c index 8f5da74c4d..d2799446b3 100644 --- a/block/qed-cluster.c +++ b/block/qed-cluster.c @@ -61,59 +61,6 @@ static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s, return i - index; } -typedef struct { - BDRVQEDState *s; - uint64_t pos; - size_t len; - - QEDRequest *request; - - /* User callback */ - QEDFindClusterFunc *cb; - void *opaque; -} QEDFindClusterCB; - -static void qed_find_cluster_cb(void *opaque, int ret) -{ - QEDFindClusterCB *find_cluster_cb = opaque; - BDRVQEDState *s = find_cluster_cb->s; - QEDRequest *request = find_cluster_cb->request; - uint64_t offset = 0; - size_t len = 0; - unsigned int index; - unsigned int n; - - qed_acquire(s); - if (ret) { - goto out; - } - - index = qed_l2_index(s, find_cluster_cb->pos); - n = qed_bytes_to_clusters(s, - qed_offset_into_cluster(s, find_cluster_cb->pos) + - find_cluster_cb->len); - n = qed_count_contiguous_clusters(s, request->l2_table->table, - index, n, &offset); - - if (qed_offset_is_unalloc_cluster(offset)) { - ret = QED_CLUSTER_L2; - } else if (qed_offset_is_zero_cluster(offset)) { - ret = QED_CLUSTER_ZERO; - } else if (qed_check_cluster_offset(s, offset)) { - ret = QED_CLUSTER_FOUND; - } else { - ret = -EINVAL; - } - - len = MIN(find_cluster_cb->len, n * 
s->header.cluster_size - - qed_offset_into_cluster(s, find_cluster_cb->pos)); - -out: - find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len); - qed_release(s); - g_free(find_cluster_cb); -} - /** * Find the offset of a data cluster * @@ -137,8 +84,11 @@ out: void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, size_t len, QEDFindClusterFunc *cb, void *opaque) { - QEDFindClusterCB *find_cluster_cb; uint64_t l2_offset; + uint64_t offset = 0; + unsigned int index; + unsigned int n; + int ret; /* Limit length to L2 boundary. Requests are broken up at the L2 boundary * so that a request acts on one L2 table at a time. @@ -155,14 +105,32 @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, return; } - find_cluster_cb = g_malloc(sizeof(*find_cluster_cb)); - find_cluster_cb->s = s; - find_cluster_cb->pos = pos; - find_cluster_cb->len = len; - find_cluster_cb->cb = cb; - find_cluster_cb->opaque = opaque; - find_cluster_cb->request = request; + ret = qed_read_l2_table(s, request, l2_offset); + qed_acquire(s); + if (ret) { + goto out; + } - qed_read_l2_table(s, request, l2_offset, - qed_find_cluster_cb, find_cluster_cb); + index = qed_l2_index(s, pos); + n = qed_bytes_to_clusters(s, + qed_offset_into_cluster(s, pos) + len); + n = qed_count_contiguous_clusters(s, request->l2_table->table, + index, n, &offset); + + if (qed_offset_is_unalloc_cluster(offset)) { + ret = QED_CLUSTER_L2; + } else if (qed_offset_is_zero_cluster(offset)) { + ret = QED_CLUSTER_ZERO; + } else if (qed_check_cluster_offset(s, offset)) { + ret = QED_CLUSTER_FOUND; + } else { + ret = -EINVAL; + } + + len = MIN(len, + n * s->header.cluster_size - qed_offset_into_cluster(s, pos)); + +out: + cb(opaque, ret, offset, len); + qed_release(s); } diff --git a/block/qed-table.c b/block/qed-table.c index 427000318d..ffecbeadb4 100644 --- a/block/qed-table.c +++ b/block/qed-table.c @@ -177,8 +177,7 @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, 
return ret; } -void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, - BlockCompletionFunc *cb, void *opaque) +int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset) { int ret; @@ -187,8 +186,7 @@ void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, /* Check for cached L2 entry */ request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset); if (request->l2_table) { - cb(opaque, 0); - return; + return 0; } request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); @@ -215,17 +213,12 @@ void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, } qed_release(s); - cb(opaque, ret); + return ret; } int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset) { - int ret = -EINPROGRESS; - - qed_read_l2_table(s, request, offset, qed_sync_cb, &ret); - BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS); - - return ret; + return qed_read_l2_table(s, request, offset); } void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, diff --git a/block/qed.h b/block/qed.h index ce8c314089..c71505855d 100644 --- a/block/qed.h +++ b/block/qed.h @@ -237,8 +237,7 @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, unsigned int n); int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset); -void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, - BlockCompletionFunc *cb, void *opaque); +int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset); void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, unsigned int index, unsigned int n, bool flush, BlockCompletionFunc *cb, void *opaque); From 0f21b7a1b7163dddfe7900bd3da7b4cf9568b536 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 14 Nov 2016 16:56:10 +0100 Subject: [PATCH 28/60] qed: Remove callback from qed_find_cluster() Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed-cluster.c | 39 
++++++++++++++++++++++----------------- block/qed.c | 24 +++++++++++------------- block/qed.h | 4 ++-- 3 files changed, 35 insertions(+), 32 deletions(-) diff --git a/block/qed-cluster.c b/block/qed-cluster.c index d2799446b3..88dc979d8c 100644 --- a/block/qed-cluster.c +++ b/block/qed-cluster.c @@ -67,22 +67,27 @@ static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s, * @s: QED state * @request: L2 cache entry * @pos: Byte position in device - * @len: Number of bytes - * @cb: Completion function - * @opaque: User data for completion function + * @len: Number of bytes (may be shortened on return) + * @img_offset: Contains offset in the image file on success * * This function translates a position in the block device to an offset in the - * image file. It invokes the cb completion callback to report back the - * translated offset or unallocated range in the image file. + * image file. The translated offset or unallocated range in the image file is + * reported back in *img_offset and *len. * * If the L2 table exists, request->l2_table points to the L2 table cache entry * and the caller must free the reference when they are finished. The cache * entry is exposed in this way to avoid callers having to read the L2 table * again later during request processing. If request->l2_table is non-NULL it * will be unreferenced before taking on the new cache entry. + * + * On success QED_CLUSTER_FOUND is returned and img_offset/len are a contiguous + * range in the image file. + * + * On failure QED_CLUSTER_L2 or QED_CLUSTER_L1 is returned for missing L2 or L1 + * table offset, respectively. len is number of contiguous unallocated bytes. 
*/ -void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, - size_t len, QEDFindClusterFunc *cb, void *opaque) +int qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, + size_t *len, uint64_t *img_offset) { uint64_t l2_offset; uint64_t offset = 0; @@ -93,16 +98,16 @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, /* Limit length to L2 boundary. Requests are broken up at the L2 boundary * so that a request acts on one L2 table at a time. */ - len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos); + *len = MIN(*len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos); l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)]; if (qed_offset_is_unalloc_cluster(l2_offset)) { - cb(opaque, QED_CLUSTER_L1, 0, len); - return; + *img_offset = 0; + return QED_CLUSTER_L1; } if (!qed_check_table_offset(s, l2_offset)) { - cb(opaque, -EINVAL, 0, 0); - return; + *img_offset = *len = 0; + return -EINVAL; } ret = qed_read_l2_table(s, request, l2_offset); @@ -112,8 +117,7 @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, } index = qed_l2_index(s, pos); - n = qed_bytes_to_clusters(s, - qed_offset_into_cluster(s, pos) + len); + n = qed_bytes_to_clusters(s, qed_offset_into_cluster(s, pos) + *len); n = qed_count_contiguous_clusters(s, request->l2_table->table, index, n, &offset); @@ -127,10 +131,11 @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, ret = -EINVAL; } - len = MIN(len, - n * s->header.cluster_size - qed_offset_into_cluster(s, pos)); + *len = MIN(*len, + n * s->header.cluster_size - qed_offset_into_cluster(s, pos)); out: - cb(opaque, ret, offset, len); + *img_offset = offset; qed_release(s); + return ret; } diff --git a/block/qed.c b/block/qed.c index a837a28655..290cbcd6f2 100644 --- a/block/qed.c +++ b/block/qed.c @@ -776,14 +776,14 @@ static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs, .file = file, }; QEDRequest request = 
{ .l2_table = NULL }; + uint64_t offset; + int ret; - qed_find_cluster(s, &request, cb.pos, len, qed_is_allocated_cb, &cb); + ret = qed_find_cluster(s, &request, cb.pos, &len, &offset); + qed_is_allocated_cb(&cb, ret, offset, len); - /* Now sleep if the callback wasn't invoked immediately */ - while (cb.status == BDRV_BLOCK_OFFSET_MASK) { - cb.co = qemu_coroutine_self(); - qemu_coroutine_yield(); - } + /* The callback was invoked immediately */ + assert(cb.status != BDRV_BLOCK_OFFSET_MASK); qed_unref_l2_cache_entry(request.l2_table); @@ -1306,8 +1306,6 @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) * or -errno * @offset: Cluster offset in bytes * @len: Length in bytes - * - * Callback from qed_find_cluster(). */ static void qed_aio_write_data(void *opaque, int ret, uint64_t offset, size_t len) @@ -1343,8 +1341,6 @@ static void qed_aio_write_data(void *opaque, int ret, * or -errno * @offset: Cluster offset in bytes * @len: Length in bytes - * - * Callback from qed_find_cluster(). */ static void qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len) @@ -1393,6 +1389,8 @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret) BDRVQEDState *s = acb_to_s(acb); QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ? 
qed_aio_write_data : qed_aio_read_data; + uint64_t offset; + size_t len; trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size); @@ -1419,9 +1417,9 @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret) } /* Find next cluster and start I/O */ - qed_find_cluster(s, &acb->request, - acb->cur_pos, acb->end_pos - acb->cur_pos, - io_fn, acb); + len = acb->end_pos - acb->cur_pos; + ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset); + io_fn(acb, ret, offset, len); } static BlockAIOCB *qed_aio_setup(BlockDriverState *bs, diff --git a/block/qed.h b/block/qed.h index c71505855d..6ab57025e1 100644 --- a/block/qed.h +++ b/block/qed.h @@ -247,8 +247,8 @@ int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, /** * Cluster functions */ -void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, - size_t len, QEDFindClusterFunc *cb, void *opaque); +int qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, + size_t *len, uint64_t *img_offset); /** * Consistency check From e85c5281426364f9c499cd4b71ec6dea20c5b4b2 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 14 Nov 2016 14:56:32 +0100 Subject: [PATCH 29/60] qed: Make qed_read_backing_file() synchronous Note that this code is generally not running in coroutine context, so this is an actual blocking synchronous operation. We'll fix this in a moment. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi --- block/qed.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/block/qed.c b/block/qed.c index 290cbcd6f2..1105f19de6 100644 --- a/block/qed.c +++ b/block/qed.c @@ -808,13 +808,13 @@ static BDRVQEDState *acb_to_s(QEDAIOCB *acb) * This function reads qiov->size bytes starting at pos from the backing file. * If there is no backing file then zeroes are read. 
*/ -static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, - QEMUIOVector *qiov, - QEMUIOVector **backing_qiov, - BlockCompletionFunc *cb, void *opaque) +static int qed_read_backing_file(BDRVQEDState *s, uint64_t pos, + QEMUIOVector *qiov, + QEMUIOVector **backing_qiov) { uint64_t backing_length = 0; size_t size; + int ret; /* If there is a backing file, get its length. Treat the absence of a * backing file like a zero length backing file. @@ -822,8 +822,7 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, if (s->bs->backing) { int64_t l = bdrv_getlength(s->bs->backing->bs); if (l < 0) { - cb(opaque, l); - return; + return l; } backing_length = l; } @@ -836,8 +835,7 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, /* Complete now if there are no backing file sectors to read */ if (pos >= backing_length) { - cb(opaque, 0); - return; + return 0; } /* If the read straddles the end of the backing file, shorten it */ @@ -849,8 +847,11 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, qemu_iovec_concat(*backing_qiov, qiov, 0, size); BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); - bdrv_aio_readv(s->bs->backing, pos / BDRV_SECTOR_SIZE, - *backing_qiov, size / BDRV_SECTOR_SIZE, cb, opaque); + ret = bdrv_preadv(s->bs->backing, pos, *backing_qiov); + if (ret < 0) { + return ret; + } + return 0; } typedef struct { @@ -907,6 +908,7 @@ static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, void *opaque) { CopyFromBackingFileCB *copy_cb; + int ret; /* Skip copy entirely if there is no work to do */ if (len == 0) { @@ -922,8 +924,9 @@ static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, copy_cb->iov.iov_len = len; qemu_iovec_init_external(©_cb->qiov, ©_cb->iov, 1); - qed_read_backing_file(s, pos, ©_cb->qiov, ©_cb->backing_qiov, - qed_copy_from_backing_file_write, copy_cb); + ret = qed_read_backing_file(s, pos, ©_cb->qiov, + ©_cb->backing_qiov); + 
qed_copy_from_backing_file_write(copy_cb, ret); } /** @@ -1366,8 +1369,9 @@ static void qed_aio_read_data(void *opaque, int ret, qed_aio_start_io(acb); return; } else if (ret != QED_CLUSTER_FOUND) { - qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, - &acb->backing_qiov, qed_aio_next_io_cb, acb); + ret = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, + &acb->backing_qiov); + qed_aio_next_io(acb, ret); return; } From 0f7aa24d2ccfa8c6a1894c50e46c1f300d7e6db0 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 14 Nov 2016 14:56:32 +0100 Subject: [PATCH 30/60] qed: Make qed_copy_from_backing_file() synchronous Note that this code is generally not running in coroutine context, so this is an actual blocking synchronous operation. We'll fix this in a moment. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi --- block/qed.c | 78 ++++++++++++++++++++--------------------------------- 1 file changed, 29 insertions(+), 49 deletions(-) diff --git a/block/qed.c b/block/qed.c index 1105f19de6..af53b8ff54 100644 --- a/block/qed.c +++ b/block/qed.c @@ -854,44 +854,6 @@ static int qed_read_backing_file(BDRVQEDState *s, uint64_t pos, return 0; } -typedef struct { - GenericCB gencb; - BDRVQEDState *s; - QEMUIOVector qiov; - QEMUIOVector *backing_qiov; - struct iovec iov; - uint64_t offset; -} CopyFromBackingFileCB; - -static void qed_copy_from_backing_file_cb(void *opaque, int ret) -{ - CopyFromBackingFileCB *copy_cb = opaque; - qemu_vfree(copy_cb->iov.iov_base); - gencb_complete(©_cb->gencb, ret); -} - -static void qed_copy_from_backing_file_write(void *opaque, int ret) -{ - CopyFromBackingFileCB *copy_cb = opaque; - BDRVQEDState *s = copy_cb->s; - - if (copy_cb->backing_qiov) { - qemu_iovec_destroy(copy_cb->backing_qiov); - g_free(copy_cb->backing_qiov); - copy_cb->backing_qiov = NULL; - } - - if (ret) { - qed_copy_from_backing_file_cb(copy_cb, ret); - return; - } - - BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE); - 
bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE, - ©_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE, - qed_copy_from_backing_file_cb, copy_cb); -} - /** * Copy data from backing file into the image * @@ -907,7 +869,9 @@ static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, BlockCompletionFunc *cb, void *opaque) { - CopyFromBackingFileCB *copy_cb; + QEMUIOVector qiov; + QEMUIOVector *backing_qiov = NULL; + struct iovec iov; int ret; /* Skip copy entirely if there is no work to do */ @@ -916,17 +880,33 @@ static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, return; } - copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque); - copy_cb->s = s; - copy_cb->offset = offset; - copy_cb->backing_qiov = NULL; - copy_cb->iov.iov_base = qemu_blockalign(s->bs, len); - copy_cb->iov.iov_len = len; - qemu_iovec_init_external(©_cb->qiov, ©_cb->iov, 1); + iov = (struct iovec) { + .iov_base = qemu_blockalign(s->bs, len), + .iov_len = len, + }; + qemu_iovec_init_external(&qiov, &iov, 1); - ret = qed_read_backing_file(s, pos, ©_cb->qiov, - ©_cb->backing_qiov); - qed_copy_from_backing_file_write(copy_cb, ret); + ret = qed_read_backing_file(s, pos, &qiov, &backing_qiov); + + if (backing_qiov) { + qemu_iovec_destroy(backing_qiov); + g_free(backing_qiov); + backing_qiov = NULL; + } + + if (ret) { + goto out; + } + + BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE); + ret = bdrv_pwritev(s->bs->file, offset, &qiov); + if (ret < 0) { + goto out; + } + ret = 0; +out: + qemu_vfree(iov.iov_base); + cb(opaque, ret); } /** From b4ac32f34f24b832e51fe092988e1d05396510eb Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 15 Nov 2016 11:14:01 +0100 Subject: [PATCH 31/60] qed: Remove callback from qed_copy_from_backing_file() With this change, qed_aio_write_prefill() and qed_aio_write_postfill() collapse into a single function. This is reflected by a rename of the combined function to qed_aio_write_cow(). 
Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi --- block/qed.c | 57 +++++++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 34 deletions(-) diff --git a/block/qed.c b/block/qed.c index af53b8ff54..658b31ba85 100644 --- a/block/qed.c +++ b/block/qed.c @@ -861,13 +861,9 @@ static int qed_read_backing_file(BDRVQEDState *s, uint64_t pos, * @pos: Byte position in device * @len: Number of bytes * @offset: Byte offset in image file - * @cb: Completion function - * @opaque: User data for completion function */ -static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, - uint64_t len, uint64_t offset, - BlockCompletionFunc *cb, - void *opaque) +static int qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, + uint64_t len, uint64_t offset) { QEMUIOVector qiov; QEMUIOVector *backing_qiov = NULL; @@ -876,8 +872,7 @@ static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, /* Skip copy entirely if there is no work to do */ if (len == 0) { - cb(opaque, 0); - return; + return 0; } iov = (struct iovec) { @@ -906,7 +901,7 @@ static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, ret = 0; out: qemu_vfree(iov.iov_base); - cb(opaque, ret); + return ret; } /** @@ -1133,42 +1128,36 @@ static void qed_aio_write_main(void *opaque, int ret) } /** - * Populate back untouched region of new data cluster + * Populate untouched regions of new data cluster */ -static void qed_aio_write_postfill(void *opaque, int ret) +static void qed_aio_write_cow(void *opaque, int ret) { QEDAIOCB *acb = opaque; BDRVQEDState *s = acb_to_s(acb); - uint64_t start = acb->cur_pos + acb->cur_qiov.size; - uint64_t len = - qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start; - uint64_t offset = acb->cur_cluster + - qed_offset_into_cluster(s, acb->cur_pos) + - acb->cur_qiov.size; + uint64_t start, len, offset; + /* Populate front untouched region of new data cluster */ + start = 
qed_start_of_cluster(s, acb->cur_pos); + len = qed_offset_into_cluster(s, acb->cur_pos); + + trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster); + ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster); if (ret) { qed_aio_complete(acb, ret); return; } + /* Populate back untouched region of new data cluster */ + start = acb->cur_pos + acb->cur_qiov.size; + len = qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start; + offset = acb->cur_cluster + + qed_offset_into_cluster(s, acb->cur_pos) + + acb->cur_qiov.size; + trace_qed_aio_write_postfill(s, acb, start, len, offset); - qed_copy_from_backing_file(s, start, len, offset, - qed_aio_write_main, acb); -} + ret = qed_copy_from_backing_file(s, start, len, offset); -/** - * Populate front untouched region of new data cluster - */ -static void qed_aio_write_prefill(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - uint64_t start = qed_start_of_cluster(s, acb->cur_pos); - uint64_t len = qed_offset_into_cluster(s, acb->cur_pos); - - trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster); - qed_copy_from_backing_file(s, start, len, acb->cur_cluster, - qed_aio_write_postfill, acb); + qed_aio_write_main(acb, ret); } /** @@ -1236,7 +1225,7 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) cb = qed_aio_write_zero_cluster; } else { - cb = qed_aio_write_prefill; + cb = qed_aio_write_cow; acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters); } From 7076309aefaa480fd8b48eaa9aa73c394cf0ae7a Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 14 Nov 2016 14:56:32 +0100 Subject: [PATCH 32/60] qed: Make qed_write_header() synchronous Note that this code is generally not running in coroutine context, so this is an actual blocking synchronous operation. We'll fix this in a moment. 
Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi --- block/qed.c | 72 ++++++++++++++++++++--------------------------------- 1 file changed, 27 insertions(+), 45 deletions(-) diff --git a/block/qed.c b/block/qed.c index 658b31ba85..2665efc71c 100644 --- a/block/qed.c +++ b/block/qed.c @@ -92,41 +92,6 @@ int qed_write_header_sync(BDRVQEDState *s) return 0; } -typedef struct { - GenericCB gencb; - BDRVQEDState *s; - struct iovec iov; - QEMUIOVector qiov; - int nsectors; - uint8_t *buf; -} QEDWriteHeaderCB; - -static void qed_write_header_cb(void *opaque, int ret) -{ - QEDWriteHeaderCB *write_header_cb = opaque; - - qemu_vfree(write_header_cb->buf); - gencb_complete(write_header_cb, ret); -} - -static void qed_write_header_read_cb(void *opaque, int ret) -{ - QEDWriteHeaderCB *write_header_cb = opaque; - BDRVQEDState *s = write_header_cb->s; - - if (ret) { - qed_write_header_cb(write_header_cb, ret); - return; - } - - /* Update header */ - qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf); - - bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov, - write_header_cb->nsectors, qed_write_header_cb, - write_header_cb); -} - /** * Update header in-place (does not rewrite backing filename or other strings) * @@ -144,18 +109,35 @@ static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb, int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE); size_t len = nsectors * BDRV_SECTOR_SIZE; - QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb), - cb, opaque); + uint8_t *buf; + struct iovec iov; + QEMUIOVector qiov; + int ret; - write_header_cb->s = s; - write_header_cb->nsectors = nsectors; - write_header_cb->buf = qemu_blockalign(s->bs, len); - write_header_cb->iov.iov_base = write_header_cb->buf; - write_header_cb->iov.iov_len = len; - qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1); + buf = qemu_blockalign(s->bs, len); + iov = (struct iovec) { + .iov_base = buf, 
+ .iov_len = len, + }; + qemu_iovec_init_external(&qiov, &iov, 1); - bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors, - qed_write_header_read_cb, write_header_cb); + ret = bdrv_preadv(s->bs->file, 0, &qiov); + if (ret < 0) { + goto out; + } + + /* Update header */ + qed_header_cpu_to_le(&s->header, (QEDHeader *) buf); + + ret = bdrv_pwritev(s->bs->file, 0, &qiov); + if (ret < 0) { + goto out; + } + + ret = 0; +out: + qemu_vfree(buf); + cb(opaque, ret); } static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size) From f13d712bb233d1d2d05dc8c70bd868adc0b5aaec Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 15 Nov 2016 11:14:01 +0100 Subject: [PATCH 33/60] qed: Remove callback from qed_write_header() Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/block/qed.c b/block/qed.c index 2665efc71c..95f1050f8b 100644 --- a/block/qed.c +++ b/block/qed.c @@ -98,8 +98,7 @@ int qed_write_header_sync(BDRVQEDState *s) * This function only updates known header fields in-place and does not affect * extra data after the QED header. 
*/ -static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb, - void *opaque) +static int qed_write_header(BDRVQEDState *s) { /* We must write full sectors for O_DIRECT but cannot necessarily generate * the data following the header if an unrecognized compat feature is @@ -137,7 +136,7 @@ static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb, ret = 0; out: qemu_vfree(buf); - cb(opaque, ret); + return ret; } static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size) @@ -289,21 +288,6 @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s) } } -static void qed_finish_clear_need_check(void *opaque, int ret) -{ - /* Do nothing */ -} - -static void qed_flush_after_clear_need_check(void *opaque, int ret) -{ - BDRVQEDState *s = opaque; - - bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s); - - /* No need to wait until flush completes */ - qed_unplug_allocating_write_reqs(s); -} - static void qed_clear_need_check(void *opaque, int ret) { BDRVQEDState *s = opaque; @@ -314,7 +298,13 @@ static void qed_clear_need_check(void *opaque, int ret) } s->header.features &= ~QED_F_NEED_CHECK; - qed_write_header(s, qed_flush_after_clear_need_check, s); + ret = qed_write_header(s); + (void) ret; + + qed_unplug_allocating_write_reqs(s); + + ret = bdrv_flush(s->bs); + (void) ret; } static void qed_need_check_timer_cb(void *opaque) @@ -1179,6 +1169,7 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) { BDRVQEDState *s = acb_to_s(acb); BlockCompletionFunc *cb; + int ret; /* Cancel timer when the first allocating request comes in */ if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) { @@ -1213,7 +1204,8 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) if (qed_should_set_need_check(s)) { s->header.features |= QED_F_NEED_CHECK; - qed_write_header(s, cb, acb); + ret = qed_write_header(s); + cb(acb, ret); } else { cb(acb, 0); } From 602b57fba48e3fcbda82112275c86d0f1873bbd3 Mon Sep 17 00:00:00 2001 From: Kevin Wolf 
Date: Mon, 14 Nov 2016 14:56:32 +0100 Subject: [PATCH 34/60] qed: Make qed_write_table() synchronous Note that this code is generally not running in coroutine context, so this is an actual blocking synchronous operation. We'll fix this in a moment. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed-table.c | 84 +++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 54 deletions(-) diff --git a/block/qed-table.c b/block/qed-table.c index ffecbeadb4..0cc93a73d6 100644 --- a/block/qed-table.c +++ b/block/qed-table.c @@ -52,46 +52,6 @@ out: return ret; } -typedef struct { - GenericCB gencb; - BDRVQEDState *s; - QEDTable *orig_table; - QEDTable *table; - bool flush; /* flush after write? */ - - struct iovec iov; - QEMUIOVector qiov; -} QEDWriteTableCB; - -static void qed_write_table_cb(void *opaque, int ret) -{ - QEDWriteTableCB *write_table_cb = opaque; - BDRVQEDState *s = write_table_cb->s; - - trace_qed_write_table_cb(s, - write_table_cb->orig_table, - write_table_cb->flush, - ret); - - if (ret) { - goto out; - } - - if (write_table_cb->flush) { - /* We still need to flush first */ - write_table_cb->flush = false; - qed_acquire(s); - bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb, - write_table_cb); - qed_release(s); - return; - } - -out: - qemu_vfree(write_table_cb->table); - gencb_complete(&write_table_cb->gencb, ret); -} - /** * Write out an updated part or all of a table * @@ -108,10 +68,13 @@ static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, unsigned int index, unsigned int n, bool flush, BlockCompletionFunc *cb, void *opaque) { - QEDWriteTableCB *write_table_cb; unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1; unsigned int start, end, i; + QEDTable *new_table; + struct iovec iov; + QEMUIOVector qiov; size_t len_bytes; + int ret; trace_qed_write_table(s, offset, table, index, n); @@ -121,28 +84,41 @@ static void qed_write_table(BDRVQEDState *s, uint64_t offset, 
QEDTable *table, len_bytes = (end - start) * sizeof(uint64_t); - write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque); - write_table_cb->s = s; - write_table_cb->orig_table = table; - write_table_cb->flush = flush; - write_table_cb->table = qemu_blockalign(s->bs, len_bytes); - write_table_cb->iov.iov_base = write_table_cb->table->offsets; - write_table_cb->iov.iov_len = len_bytes; - qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1); + new_table = qemu_blockalign(s->bs, len_bytes); + iov = (struct iovec) { + .iov_base = new_table->offsets, + .iov_len = len_bytes, + }; + qemu_iovec_init_external(&qiov, &iov, 1); /* Byteswap table */ for (i = start; i < end; i++) { uint64_t le_offset = cpu_to_le64(table->offsets[i]); - write_table_cb->table->offsets[i - start] = le_offset; + new_table->offsets[i - start] = le_offset; } /* Adjust for offset into table */ offset += start * sizeof(uint64_t); - bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, - &write_table_cb->qiov, - write_table_cb->qiov.size / BDRV_SECTOR_SIZE, - qed_write_table_cb, write_table_cb); + ret = bdrv_pwritev(s->bs->file, offset, &qiov); + trace_qed_write_table_cb(s, table, flush, ret); + if (ret < 0) { + goto out; + } + + if (flush) { + qed_acquire(s); + ret = bdrv_flush(s->bs); + qed_release(s); + if (ret < 0) { + goto out; + } + } + + ret = 0; +out: + qemu_vfree(new_table); + cb(opaque, ret); } /** From 29470d11bf310de58e05ceadd61f25e6ed9ea8de Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 18 Nov 2016 17:16:24 +0100 Subject: [PATCH 35/60] qed: Remove GenericCB The GenericCB infrastructure isn't used any more. Remove it. 
Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/Makefile.objs | 2 +- block/qed-gencb.c | 33 --------------------------------- block/qed.h | 11 ----------- 3 files changed, 1 insertion(+), 45 deletions(-) delete mode 100644 block/qed-gencb.c diff --git a/block/Makefile.objs b/block/Makefile.objs index ea955302c8..f9368b52b8 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -1,6 +1,6 @@ block-obj-y += raw-format.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o dmg.o block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o -block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o +block-obj-y += qed.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o block-obj-y += quorum.o diff --git a/block/qed-gencb.c b/block/qed-gencb.c deleted file mode 100644 index faf8ecc840..0000000000 --- a/block/qed-gencb.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * QEMU Enhanced Disk Format - * - * Copyright IBM, Corp. 2010 - * - * Authors: - * Stefan Hajnoczi - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. 
- * - */ - -#include "qemu/osdep.h" -#include "qed.h" - -void *gencb_alloc(size_t len, BlockCompletionFunc *cb, void *opaque) -{ - GenericCB *gencb = g_malloc(len); - gencb->cb = cb; - gencb->opaque = opaque; - return gencb; -} - -void gencb_complete(void *opaque, int ret) -{ - GenericCB *gencb = opaque; - BlockCompletionFunc *cb = gencb->cb; - void *user_opaque = gencb->opaque; - - g_free(gencb); - cb(user_opaque, ret); -} diff --git a/block/qed.h b/block/qed.h index 6ab57025e1..46843c463b 100644 --- a/block/qed.h +++ b/block/qed.h @@ -201,17 +201,6 @@ typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t l void qed_acquire(BDRVQEDState *s); void qed_release(BDRVQEDState *s); -/** - * Generic callback for chaining async callbacks - */ -typedef struct { - BlockCompletionFunc *cb; - void *opaque; -} GenericCB; - -void *gencb_alloc(size_t len, BlockCompletionFunc *cb, void *opaque); -void gencb_complete(void *opaque, int ret); - /** * Header functions */ From 453e53e2a1128b85a03af7fd597292c9b6f8a9a0 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 15 Nov 2016 11:14:01 +0100 Subject: [PATCH 36/60] qed: Remove callback from qed_write_table() Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed-table.c | 47 ++++++++++++----------------------------------- block/qed.c | 12 +++++++----- block/qed.h | 8 +++----- 3 files changed, 22 insertions(+), 45 deletions(-) diff --git a/block/qed-table.c b/block/qed-table.c index 0cc93a73d6..ebee2c50f0 100644 --- a/block/qed-table.c +++ b/block/qed-table.c @@ -61,12 +61,9 @@ out: * @index: Index of first element * @n: Number of elements * @flush: Whether or not to sync to disk - * @cb: Completion function - * @opaque: Argument for completion function */ -static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, - unsigned int index, unsigned int n, bool flush, - BlockCompletionFunc *cb, void *opaque) +static int qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable 
*table, + unsigned int index, unsigned int n, bool flush) { unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1; unsigned int start, end, i; @@ -118,15 +115,7 @@ static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, ret = 0; out: qemu_vfree(new_table); - cb(opaque, ret); -} - -/** - * Propagate return value from async callback - */ -static void qed_sync_cb(void *opaque, int ret) -{ - *(int *)opaque = ret; + return ret; } int qed_read_l1_table_sync(BDRVQEDState *s) @@ -134,23 +123,17 @@ int qed_read_l1_table_sync(BDRVQEDState *s) return qed_read_table(s, s->header.l1_table_offset, s->l1_table); } -void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, - BlockCompletionFunc *cb, void *opaque) +int qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n) { BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE); - qed_write_table(s, s->header.l1_table_offset, - s->l1_table, index, n, false, cb, opaque); + return qed_write_table(s, s->header.l1_table_offset, + s->l1_table, index, n, false); } int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, unsigned int n) { - int ret = -EINPROGRESS; - - qed_write_l1_table(s, index, n, qed_sync_cb, &ret); - BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS); - - return ret; + return qed_write_l1_table(s, index, n); } int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset) @@ -197,22 +180,16 @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset return qed_read_l2_table(s, request, offset); } -void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, - unsigned int index, unsigned int n, bool flush, - BlockCompletionFunc *cb, void *opaque) +int qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush) { BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE); - qed_write_table(s, request->l2_table->offset, - request->l2_table->table, index, n, flush, cb, opaque); + return 
qed_write_table(s, request->l2_table->offset, + request->l2_table->table, index, n, flush); } int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, unsigned int index, unsigned int n, bool flush) { - int ret = -EINPROGRESS; - - qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret); - BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS); - - return ret; + return qed_write_l2_table(s, request, index, n, flush); } diff --git a/block/qed.c b/block/qed.c index 95f1050f8b..8c493bb393 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1000,7 +1000,8 @@ static void qed_aio_write_l1_update(void *opaque, int ret) index = qed_l1_index(s, acb->cur_pos); s->l1_table->offsets[index] = acb->request.l2_table->offset; - qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb); + ret = qed_write_l1_table(s, index, 1); + qed_commit_l2_update(acb, ret); } /** @@ -1027,12 +1028,13 @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset) if (need_alloc) { /* Write out the whole new L2 table */ - qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true, - qed_aio_write_l1_update, acb); + ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true); + qed_aio_write_l1_update(acb, ret); } else { /* Write out only the updated part of the L2 table */ - qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false, - qed_aio_next_io_cb, acb); + ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, + false); + qed_aio_next_io(acb, ret); } return; diff --git a/block/qed.h b/block/qed.h index 46843c463b..51443fa2e0 100644 --- a/block/qed.h +++ b/block/qed.h @@ -220,16 +220,14 @@ void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table); * Table I/O functions */ int qed_read_l1_table_sync(BDRVQEDState *s); -void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, - BlockCompletionFunc *cb, void *opaque); +int qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n); 
int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, unsigned int n); int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset); int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset); -void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, - unsigned int index, unsigned int n, bool flush, - BlockCompletionFunc *cb, void *opaque); +int qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush); int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, unsigned int index, unsigned int n, bool flush); From 3e248cdcd907df82da63f89905e2e1bd20d44ab6 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 14 Nov 2016 14:56:32 +0100 Subject: [PATCH 37/60] qed: Make qed_aio_read_data() synchronous Note that this code is generally not running in coroutine context, so this is an actual blocking synchronous operation. We'll fix this in a moment. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/block/qed.c b/block/qed.c index 8c493bb393..cfebbaed23 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1321,9 +1321,11 @@ static void qed_aio_read_data(void *opaque, int ret, } BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE, - &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, - qed_aio_next_io_cb, acb); + ret = bdrv_preadv(bs->file, offset, &acb->cur_qiov); + if (ret < 0) { + goto err; + } + qed_aio_next_io(acb, 0); return; err: From a4d8f1aee13955026c7ff4414cabad625749a613 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 14 Nov 2016 14:56:32 +0100 Subject: [PATCH 38/60] qed: Make qed_aio_write_main() synchronous Note that this code is generally not running in coroutine context, so this is an actual blocking synchronous operation. We'll fix this in a moment. 
Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed.c | 67 ++++++++++++++++++----------------------------------- 1 file changed, 22 insertions(+), 45 deletions(-) diff --git a/block/qed.c b/block/qed.c index cfebbaed23..d164b0e6f2 100644 --- a/block/qed.c +++ b/block/qed.c @@ -260,13 +260,6 @@ static void qed_aio_start_io(QEDAIOCB *acb) qed_aio_next_io(acb, 0); } -static void qed_aio_next_io_cb(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - - qed_aio_next_io(acb, ret); -} - static void qed_plug_allocating_write_reqs(BDRVQEDState *s) { assert(!s->allocating_write_reqs_plugged); @@ -1042,31 +1035,6 @@ err: qed_aio_complete(acb, ret); } -static void qed_aio_write_l2_update_cb(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - qed_aio_write_l2_update(acb, ret, acb->cur_cluster); -} - -/** - * Flush new data clusters before updating the L2 table - * - * This flush is necessary when a backing file is in use. A crash during an - * allocating write could result in empty clusters in the image. If the write - * only touched a subregion of the cluster, then backing image sectors have - * been lost in the untouched region. The solution is to flush after writing a - * new data cluster and before updating the L2 table. 
- */ -static void qed_aio_write_flush_before_l2_update(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - - if (!bdrv_aio_flush(s->bs->file->bs, qed_aio_write_l2_update_cb, opaque)) { - qed_aio_complete(acb, -EIO); - } -} - /** * Write data to the image file */ @@ -1076,7 +1044,6 @@ static void qed_aio_write_main(void *opaque, int ret) BDRVQEDState *s = acb_to_s(acb); uint64_t offset = acb->cur_cluster + qed_offset_into_cluster(s, acb->cur_pos); - BlockCompletionFunc *next_fn; trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size); @@ -1085,20 +1052,30 @@ static void qed_aio_write_main(void *opaque, int ret) return; } - if (acb->find_cluster_ret == QED_CLUSTER_FOUND) { - next_fn = qed_aio_next_io_cb; - } else { - if (s->bs->backing) { - next_fn = qed_aio_write_flush_before_l2_update; - } else { - next_fn = qed_aio_write_l2_update_cb; - } + BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); + ret = bdrv_pwritev(s->bs->file, offset, &acb->cur_qiov); + if (ret >= 0) { + ret = 0; } - BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); - bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, - &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, - next_fn, acb); + if (acb->find_cluster_ret == QED_CLUSTER_FOUND) { + qed_aio_next_io(acb, ret); + } else { + if (s->bs->backing) { + /* + * Flush new data clusters before updating the L2 table + * + * This flush is necessary when a backing file is in use. A crash + * during an allocating write could result in empty clusters in the + * image. If the write only touched a subregion of the cluster, + * then backing image sectors have been lost in the untouched + * region. The solution is to flush after writing a new data + * cluster and before updating the L2 table. 
+ */ + ret = bdrv_flush(s->bs->file->bs); + } + qed_aio_write_l2_update(acb, ret, acb->cur_cluster); + } } /** From fae25ac7bd6d64724c415027262a532531decc48 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 17 Nov 2016 12:51:21 +0100 Subject: [PATCH 39/60] qed: Inline qed_commit_l2_update() qed_commit_l2_update() is unconditionally called at the end of qed_aio_write_l1_update(). Inline it. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/block/qed.c b/block/qed.c index d164b0e6f2..5462faa84a 100644 --- a/block/qed.c +++ b/block/qed.c @@ -956,15 +956,27 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret) } /** - * Commit the current L2 table to the cache + * Update L1 table with new L2 table offset and write it out */ -static void qed_commit_l2_update(void *opaque, int ret) +static void qed_aio_write_l1_update(void *opaque, int ret) { QEDAIOCB *acb = opaque; BDRVQEDState *s = acb_to_s(acb); CachedL2Table *l2_table = acb->request.l2_table; uint64_t l2_offset = l2_table->offset; + int index; + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + index = qed_l1_index(s, acb->cur_pos); + s->l1_table->offsets[index] = l2_table->offset; + + ret = qed_write_l1_table(s, index, 1); + + /* Commit the current L2 table to the cache */ qed_commit_l2_cache_entry(&s->l2_cache, l2_table); /* This is guaranteed to succeed because we just committed the entry to the @@ -976,26 +988,6 @@ static void qed_commit_l2_update(void *opaque, int ret) qed_aio_next_io(acb, ret); } -/** - * Update L1 table with new L2 table offset and write it out - */ -static void qed_aio_write_l1_update(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - int index; - - if (ret) { - qed_aio_complete(acb, ret); - return; - } - - index = qed_l1_index(s, acb->cur_pos); - s->l1_table->offsets[index] = acb->request.l2_table->offset; - - ret = 
qed_write_l1_table(s, index, 1); - qed_commit_l2_update(acb, ret); -} /** * Update L2 table with new cluster offsets and write them out From fb18de21e01b4c37cd4aa074bb65a0c441e01fb3 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 17 Nov 2016 15:40:41 +0100 Subject: [PATCH 40/60] qed: Add return value to qed_aio_write_l1_update() Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but just return an error code and let the caller handle it. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/block/qed.c b/block/qed.c index 5462faa84a..e43827f0ec 100644 --- a/block/qed.c +++ b/block/qed.c @@ -958,18 +958,12 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret) /** * Update L1 table with new L2 table offset and write it out */ -static void qed_aio_write_l1_update(void *opaque, int ret) +static int qed_aio_write_l1_update(QEDAIOCB *acb) { - QEDAIOCB *acb = opaque; BDRVQEDState *s = acb_to_s(acb); CachedL2Table *l2_table = acb->request.l2_table; uint64_t l2_offset = l2_table->offset; - int index; - - if (ret) { - qed_aio_complete(acb, ret); - return; - } + int index, ret; index = qed_l1_index(s, acb->cur_pos); s->l1_table->offsets[index] = l2_table->offset; @@ -985,7 +979,7 @@ static void qed_aio_write_l1_update(void *opaque, int ret) acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); assert(acb->request.l2_table != NULL); - qed_aio_next_io(acb, ret); + return ret; } @@ -1014,7 +1008,12 @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset) if (need_alloc) { /* Write out the whole new L2 table */ ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true); - qed_aio_write_l1_update(acb, ret); + if (ret) { + goto err; + } + ret = qed_aio_write_l1_update(acb); + qed_aio_next_io(acb, ret); + } else { /* Write out only the updated part of the L2 table */ ret = qed_write_l2_table(s, 
&acb->request, index, acb->cur_nclusters, From 88d2dd72bc071d2a20a759338ce349039b013baa Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 17 Nov 2016 15:40:41 +0100 Subject: [PATCH 41/60] qed: Add return value to qed_aio_write_l2_update() Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but just return an error code and let the caller handle it. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/block/qed.c b/block/qed.c index e43827f0ec..3cda01f242 100644 --- a/block/qed.c +++ b/block/qed.c @@ -986,15 +986,11 @@ static int qed_aio_write_l1_update(QEDAIOCB *acb) /** * Update L2 table with new cluster offsets and write them out */ -static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset) +static int qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset) { BDRVQEDState *s = acb_to_s(acb); bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1; - int index; - - if (ret) { - goto err; - } + int index, ret; if (need_alloc) { qed_unref_l2_cache_entry(acb->request.l2_table); @@ -1009,21 +1005,18 @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset) /* Write out the whole new L2 table */ ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true); if (ret) { - goto err; + return ret; } - ret = qed_aio_write_l1_update(acb); - qed_aio_next_io(acb, ret); - + return qed_aio_write_l1_update(acb); } else { /* Write out only the updated part of the L2 table */ ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false); - qed_aio_next_io(acb, ret); + if (ret) { + return ret; + } } - return; - -err: - qed_aio_complete(acb, ret); + return 0; } /** @@ -1065,8 +1058,19 @@ static void qed_aio_write_main(void *opaque, int ret) */ ret = bdrv_flush(s->bs->file->bs); } - qed_aio_write_l2_update(acb, ret, acb->cur_cluster); + if (ret) { + goto err; + } + ret = 
qed_aio_write_l2_update(acb, acb->cur_cluster); + if (ret) { + goto err; + } + qed_aio_next_io(acb, 0); } + return; + +err: + qed_aio_complete(acb, ret); } /** @@ -1124,7 +1128,12 @@ static void qed_aio_write_zero_cluster(void *opaque, int ret) return; } - qed_aio_write_l2_update(acb, 0, 1); + ret = qed_aio_write_l2_update(acb, 1); + if (ret < 0) { + qed_aio_complete(acb, ret); + return; + } + qed_aio_next_io(acb, 0); } /** From eaf0bc56f528611c2b13450b0ab0972eadf52a8e Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 17 Nov 2016 15:40:41 +0100 Subject: [PATCH 42/60] qed: Add return value to qed_aio_write_main() Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but just return an error code and let the caller handle it. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed.c | 55 +++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/block/qed.c b/block/qed.c index 3cda01f242..a4b13f8c1e 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1022,29 +1022,22 @@ static int qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset) /** * Write data to the image file */ -static void qed_aio_write_main(void *opaque, int ret) +static int qed_aio_write_main(QEDAIOCB *acb) { - QEDAIOCB *acb = opaque; BDRVQEDState *s = acb_to_s(acb); uint64_t offset = acb->cur_cluster + qed_offset_into_cluster(s, acb->cur_pos); + int ret; - trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size); - - if (ret) { - qed_aio_complete(acb, ret); - return; - } + trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size); BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); ret = bdrv_pwritev(s->bs->file, offset, &acb->cur_qiov); - if (ret >= 0) { - ret = 0; + if (ret < 0) { + return ret; } - if (acb->find_cluster_ret == QED_CLUSTER_FOUND) { - qed_aio_next_io(acb, ret); - } else { + if (acb->find_cluster_ret != QED_CLUSTER_FOUND) { if (s->bs->backing) { /* * Flush new data clusters before updating the L2 
table @@ -1057,20 +1050,16 @@ static void qed_aio_write_main(void *opaque, int ret) * cluster and before updating the L2 table. */ ret = bdrv_flush(s->bs->file->bs); - } - if (ret) { - goto err; + if (ret < 0) { + return ret; + } } ret = qed_aio_write_l2_update(acb, acb->cur_cluster); - if (ret) { - goto err; + if (ret < 0) { + return ret; } - qed_aio_next_io(acb, 0); } - return; - -err: - qed_aio_complete(acb, ret); + return 0; } /** @@ -1102,8 +1091,17 @@ static void qed_aio_write_cow(void *opaque, int ret) trace_qed_aio_write_postfill(s, acb, start, len, offset); ret = qed_copy_from_backing_file(s, start, len, offset); + if (ret) { + qed_aio_complete(acb, ret); + return; + } - qed_aio_write_main(acb, ret); + ret = qed_aio_write_main(acb); + if (ret < 0) { + qed_aio_complete(acb, ret); + return; + } + qed_aio_next_io(acb, 0); } /** @@ -1201,6 +1199,8 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) */ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) { + int ret; + /* Allocate buffer for zero writes */ if (acb->flags & QED_AIOCB_ZERO) { struct iovec *iov = acb->qiov->iov; @@ -1220,7 +1220,12 @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); /* Do the actual write */ - qed_aio_write_main(acb, 0); + ret = qed_aio_write_main(acb); + if (ret < 0) { + qed_aio_complete(acb, ret); + return; + } + qed_aio_next_io(acb, 0); } /** From a101341aa07237fa85907b1dcafd97add47a3875 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 17 Nov 2016 15:40:41 +0100 Subject: [PATCH 43/60] qed: Add return value to qed_aio_write_cow() Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but just return an error code and let the caller handle it. While refactoring qed_aio_write_alloc() to accomodate the change, qed_aio_write_zero_cluster() ended up with a single line, so I chose to inline that line and remove the function completely. 
Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed.c | 60 ++++++++++++++++++++--------------------------------- 1 file changed, 22 insertions(+), 38 deletions(-) diff --git a/block/qed.c b/block/qed.c index a4b13f8c1e..84864e01c3 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1065,11 +1065,11 @@ static int qed_aio_write_main(QEDAIOCB *acb) /** * Populate untouched regions of new data cluster */ -static void qed_aio_write_cow(void *opaque, int ret) +static int qed_aio_write_cow(QEDAIOCB *acb) { - QEDAIOCB *acb = opaque; BDRVQEDState *s = acb_to_s(acb); uint64_t start, len, offset; + int ret; /* Populate front untouched region of new data cluster */ start = qed_start_of_cluster(s, acb->cur_pos); @@ -1077,9 +1077,8 @@ static void qed_aio_write_cow(void *opaque, int ret) trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster); ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster); - if (ret) { - qed_aio_complete(acb, ret); - return; + if (ret < 0) { + return ret; } /* Populate back untouched region of new data cluster */ @@ -1091,17 +1090,11 @@ static void qed_aio_write_cow(void *opaque, int ret) trace_qed_aio_write_postfill(s, acb, start, len, offset); ret = qed_copy_from_backing_file(s, start, len, offset); - if (ret) { - qed_aio_complete(acb, ret); - return; + if (ret < 0) { + return ret; } - ret = qed_aio_write_main(acb); - if (ret < 0) { - qed_aio_complete(acb, ret); - return; - } - qed_aio_next_io(acb, 0); + return qed_aio_write_main(acb); } /** @@ -1117,23 +1110,6 @@ static bool qed_should_set_need_check(BDRVQEDState *s) return !(s->header.features & QED_F_NEED_CHECK); } -static void qed_aio_write_zero_cluster(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - - if (ret) { - qed_aio_complete(acb, ret); - return; - } - - ret = qed_aio_write_l2_update(acb, 1); - if (ret < 0) { - qed_aio_complete(acb, ret); - return; - } - qed_aio_next_io(acb, 0); -} - /** * Write new data cluster * @@ -1145,7 +1121,6 @@ static void 
qed_aio_write_zero_cluster(void *opaque, int ret) static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) { BDRVQEDState *s = acb_to_s(acb); - BlockCompletionFunc *cb; int ret; /* Cancel timer when the first allocating request comes in */ @@ -1172,20 +1147,29 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) qed_aio_start_io(acb); return; } - - cb = qed_aio_write_zero_cluster; } else { - cb = qed_aio_write_cow; acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters); } if (qed_should_set_need_check(s)) { s->header.features |= QED_F_NEED_CHECK; ret = qed_write_header(s); - cb(acb, ret); - } else { - cb(acb, 0); + if (ret < 0) { + qed_aio_complete(acb, ret); + return; + } } + + if (acb->flags & QED_AIOCB_ZERO) { + ret = qed_aio_write_l2_update(acb, 1); + } else { + ret = qed_aio_write_cow(acb); + } + if (ret < 0) { + qed_aio_complete(acb, ret); + return; + } + qed_aio_next_io(acb, 0); } /** From d6daddcdeb2c0c7d443cb039e798a1671dafdd0d Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 17 Nov 2016 15:40:41 +0100 Subject: [PATCH 44/60] qed: Add return value to qed_aio_write_inplace/alloc() Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but just return an error code and let the caller handle it. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed.c | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/block/qed.c b/block/qed.c index 84864e01c3..4c8ba4a92f 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1118,7 +1118,7 @@ static bool qed_should_set_need_check(BDRVQEDState *s) * * This path is taken when writing to previously unallocated clusters. 
*/ -static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) +static int qed_aio_write_alloc(QEDAIOCB *acb, size_t len) { BDRVQEDState *s = acb_to_s(acb); int ret; @@ -1134,7 +1134,7 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) } if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) || s->allocating_write_reqs_plugged) { - return; /* wait for existing request to finish */ + return -EINPROGRESS; /* wait for existing request to finish */ } acb->cur_nclusters = qed_bytes_to_clusters(s, @@ -1144,8 +1144,7 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) if (acb->flags & QED_AIOCB_ZERO) { /* Skip ahead if the clusters are already zero */ if (acb->find_cluster_ret == QED_CLUSTER_ZERO) { - qed_aio_start_io(acb); - return; + return 0; } } else { acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters); @@ -1155,8 +1154,7 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) s->header.features |= QED_F_NEED_CHECK; ret = qed_write_header(s); if (ret < 0) { - qed_aio_complete(acb, ret); - return; + return ret; } } @@ -1166,10 +1164,9 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) ret = qed_aio_write_cow(acb); } if (ret < 0) { - qed_aio_complete(acb, ret); - return; + return ret; } - qed_aio_next_io(acb, 0); + return 0; } /** @@ -1181,10 +1178,8 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) * * This path is taken when writing to already allocated clusters. 
*/ -static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) +static int qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) { - int ret; - /* Allocate buffer for zero writes */ if (acb->flags & QED_AIOCB_ZERO) { struct iovec *iov = acb->qiov->iov; @@ -1192,8 +1187,7 @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) if (!iov->iov_base) { iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len); if (iov->iov_base == NULL) { - qed_aio_complete(acb, -ENOMEM); - return; + return -ENOMEM; } memset(iov->iov_base, 0, iov->iov_len); } @@ -1204,12 +1198,7 @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); /* Do the actual write */ - ret = qed_aio_write_main(acb); - if (ret < 0) { - qed_aio_complete(acb, ret); - return; - } - qed_aio_next_io(acb, 0); + return qed_aio_write_main(acb); } /** @@ -1232,19 +1221,27 @@ static void qed_aio_write_data(void *opaque, int ret, switch (ret) { case QED_CLUSTER_FOUND: - qed_aio_write_inplace(acb, offset, len); + ret = qed_aio_write_inplace(acb, offset, len); break; case QED_CLUSTER_L2: case QED_CLUSTER_L1: case QED_CLUSTER_ZERO: - qed_aio_write_alloc(acb, len); + ret = qed_aio_write_alloc(acb, len); break; default: - qed_aio_complete(acb, ret); + assert(ret < 0); break; } + + if (ret < 0) { + if (ret != -EINPROGRESS) { + qed_aio_complete(acb, ret); + } + return; + } + qed_aio_next_io(acb, 0); } /** From 0596be7e6a39da44e2dcba74a97bb8b89cb71bdd Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 17 Nov 2016 15:40:41 +0100 Subject: [PATCH 45/60] qed: Add return value to qed_aio_read/write_data() Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but just return an error code and let the caller handle it. 
Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed.c | 72 +++++++++++++++++++++++------------------------------ block/qed.h | 21 ---------------- 2 files changed, 31 insertions(+), 62 deletions(-) diff --git a/block/qed.c b/block/qed.c index 4c8ba4a92f..6f83831abe 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1205,13 +1205,12 @@ static int qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) * Write data cluster * * @opaque: Write request - * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, - * or -errno + * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1 * @offset: Cluster offset in bytes * @len: Length in bytes */ -static void qed_aio_write_data(void *opaque, int ret, - uint64_t offset, size_t len) +static int qed_aio_write_data(void *opaque, int ret, + uint64_t offset, size_t len) { QEDAIOCB *acb = opaque; @@ -1221,40 +1220,27 @@ static void qed_aio_write_data(void *opaque, int ret, switch (ret) { case QED_CLUSTER_FOUND: - ret = qed_aio_write_inplace(acb, offset, len); - break; + return qed_aio_write_inplace(acb, offset, len); case QED_CLUSTER_L2: case QED_CLUSTER_L1: case QED_CLUSTER_ZERO: - ret = qed_aio_write_alloc(acb, len); - break; + return qed_aio_write_alloc(acb, len); default: - assert(ret < 0); - break; + g_assert_not_reached(); } - - if (ret < 0) { - if (ret != -EINPROGRESS) { - qed_aio_complete(acb, ret); - } - return; - } - qed_aio_next_io(acb, 0); } /** * Read data cluster * * @opaque: Read request - * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, - * or -errno + * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1 * @offset: Cluster offset in bytes * @len: Length in bytes */ -static void qed_aio_read_data(void *opaque, int ret, - uint64_t offset, size_t len) +static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len) { QEDAIOCB *acb = opaque; BDRVQEDState *s = acb_to_s(acb); @@ -1265,34 +1251,23 @@ static void qed_aio_read_data(void *opaque, int ret, 
trace_qed_aio_read_data(s, acb, ret, offset, len); - if (ret < 0) { - goto err; - } - qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); /* Handle zero cluster and backing file reads */ if (ret == QED_CLUSTER_ZERO) { qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size); - qed_aio_start_io(acb); - return; + return 0; } else if (ret != QED_CLUSTER_FOUND) { - ret = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, - &acb->backing_qiov); - qed_aio_next_io(acb, ret); - return; + return qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, + &acb->backing_qiov); } BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); ret = bdrv_preadv(bs->file, offset, &acb->cur_qiov); if (ret < 0) { - goto err; + return ret; } - qed_aio_next_io(acb, 0); - return; - -err: - qed_aio_complete(acb, ret); + return 0; } /** @@ -1301,8 +1276,6 @@ err: static void qed_aio_next_io(QEDAIOCB *acb, int ret) { BDRVQEDState *s = acb_to_s(acb); - QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ? 
- qed_aio_write_data : qed_aio_read_data; uint64_t offset; size_t len; @@ -1333,7 +1306,24 @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret) /* Find next cluster and start I/O */ len = acb->end_pos - acb->cur_pos; ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset); - io_fn(acb, ret, offset, len); + if (ret < 0) { + qed_aio_complete(acb, ret); + return; + } + + if (acb->flags & QED_AIOCB_WRITE) { + ret = qed_aio_write_data(acb, ret, offset, len); + } else { + ret = qed_aio_read_data(acb, ret, offset, len); + } + + if (ret < 0) { + if (ret != -EINPROGRESS) { + qed_aio_complete(acb, ret); + } + return; + } + qed_aio_next_io(acb, 0); } static BlockAIOCB *qed_aio_setup(BlockDriverState *bs, diff --git a/block/qed.h b/block/qed.h index 51443fa2e0..8644fed3a7 100644 --- a/block/qed.h +++ b/block/qed.h @@ -177,27 +177,6 @@ enum { QED_CLUSTER_L1, /* cluster missing in L1 */ }; -/** - * qed_find_cluster() completion callback - * - * @opaque: User data for completion callback - * @ret: QED_CLUSTER_FOUND Success - * QED_CLUSTER_L2 Data cluster unallocated in L2 - * QED_CLUSTER_L1 L2 unallocated in L1 - * -errno POSIX error occurred - * @offset: Data cluster offset - * @len: Contiguous bytes starting from cluster offset - * - * This function is invoked when qed_find_cluster() completes. - * - * On success ret is QED_CLUSTER_FOUND and offset/len are a contiguous range - * in the image file. - * - * On failure ret is QED_CLUSTER_L2 or QED_CLUSTER_L1 for missing L2 or L1 - * table offset, respectively. len is number of contiguous unallocated bytes. - */ -typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len); - void qed_acquire(BDRVQEDState *s); void qed_release(BDRVQEDState *s); From dddf8db10b47d34d9d469ffe45e000170666ecdd Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 18 Nov 2016 13:40:13 +0100 Subject: [PATCH 46/60] qed: Remove ret argument from qed_aio_next_io() All callers pass ret = 0, so we can just remove it. 
Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/block/qed.c b/block/qed.c index 6f83831abe..db80987dc3 100644 --- a/block/qed.c +++ b/block/qed.c @@ -253,11 +253,11 @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) return l2_table; } -static void qed_aio_next_io(QEDAIOCB *acb, int ret); +static void qed_aio_next_io(QEDAIOCB *acb); static void qed_aio_start_io(QEDAIOCB *acb) { - qed_aio_next_io(acb, 0); + qed_aio_next_io(acb); } static void qed_plug_allocating_write_reqs(BDRVQEDState *s) @@ -1273,13 +1273,14 @@ static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len) /** * Begin next I/O or complete the request */ -static void qed_aio_next_io(QEDAIOCB *acb, int ret) +static void qed_aio_next_io(QEDAIOCB *acb) { BDRVQEDState *s = acb_to_s(acb); uint64_t offset; size_t len; + int ret; - trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size); + trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size); if (acb->backing_qiov) { qemu_iovec_destroy(acb->backing_qiov); @@ -1287,12 +1288,6 @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret) acb->backing_qiov = NULL; } - /* Handle I/O error */ - if (ret) { - qed_aio_complete(acb, ret); - return; - } - acb->qiov_offset += acb->cur_qiov.size; acb->cur_pos += acb->cur_qiov.size; qemu_iovec_reset(&acb->cur_qiov); @@ -1323,7 +1318,7 @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret) } return; } - qed_aio_next_io(acb, 0); + qed_aio_next_io(acb); } static BlockAIOCB *qed_aio_setup(BlockDriverState *bs, From 018598747c775394471ce4a341a1ce225a1738dc Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 18 Nov 2016 14:16:42 +0100 Subject: [PATCH 47/60] qed: Remove recursion in qed_aio_next_io() Instead of calling itself recursively as the last thing, just convert qed_aio_next_io() into a loop. 
This patch is best reviewed with 'git show -w' because most of it is just whitespace changes. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed.c | 73 +++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/block/qed.c b/block/qed.c index db80987dc3..e7621693e2 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1280,45 +1280,46 @@ static void qed_aio_next_io(QEDAIOCB *acb) size_t len; int ret; - trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size); + while (1) { + trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size); - if (acb->backing_qiov) { - qemu_iovec_destroy(acb->backing_qiov); - g_free(acb->backing_qiov); - acb->backing_qiov = NULL; - } - - acb->qiov_offset += acb->cur_qiov.size; - acb->cur_pos += acb->cur_qiov.size; - qemu_iovec_reset(&acb->cur_qiov); - - /* Complete request */ - if (acb->cur_pos >= acb->end_pos) { - qed_aio_complete(acb, 0); - return; - } - - /* Find next cluster and start I/O */ - len = acb->end_pos - acb->cur_pos; - ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset); - if (ret < 0) { - qed_aio_complete(acb, ret); - return; - } - - if (acb->flags & QED_AIOCB_WRITE) { - ret = qed_aio_write_data(acb, ret, offset, len); - } else { - ret = qed_aio_read_data(acb, ret, offset, len); - } - - if (ret < 0) { - if (ret != -EINPROGRESS) { - qed_aio_complete(acb, ret); + if (acb->backing_qiov) { + qemu_iovec_destroy(acb->backing_qiov); + g_free(acb->backing_qiov); + acb->backing_qiov = NULL; + } + + acb->qiov_offset += acb->cur_qiov.size; + acb->cur_pos += acb->cur_qiov.size; + qemu_iovec_reset(&acb->cur_qiov); + + /* Complete request */ + if (acb->cur_pos >= acb->end_pos) { + qed_aio_complete(acb, 0); + return; + } + + /* Find next cluster and start I/O */ + len = acb->end_pos - acb->cur_pos; + ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset); + if (ret < 0) { + qed_aio_complete(acb, ret); + return; + } + 
+ if (acb->flags & QED_AIOCB_WRITE) { + ret = qed_aio_write_data(acb, ret, offset, len); + } else { + ret = qed_aio_read_data(acb, ret, offset, len); + } + + if (ret < 0) { + if (ret != -EINPROGRESS) { + qed_aio_complete(acb, ret); + } + return; } - return; } - qed_aio_next_io(acb); } static BlockAIOCB *qed_aio_setup(BlockDriverState *bs, From 89f89709c7c66edd95c2288eae7ec4006256348a Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 14 Nov 2016 14:20:00 +0100 Subject: [PATCH 48/60] qed: Implement .bdrv_co_readv/writev Most of the qed code is now synchronous and matches the coroutine model. One notable exception is the serialisation between requests which can still schedule a callback. Before we can replace this with coroutine locks, let's convert the driver's external interfaces to the coroutine versions. We need to be careful to handle both requests that call the completion callback directly from the calling coroutine (i.e. fully synchronous code) and requests that involve some callback, so that we need to yield and wait for the completion callback coming from outside the coroutine. 
Signed-off-by: Kevin Wolf Reviewed-by: Manos Pitsidianakis Reviewed-by: Stefan Hajnoczi --- block/qed.c | 109 +++++++++++++++++++++++----------------------------- 1 file changed, 48 insertions(+), 61 deletions(-) diff --git a/block/qed.c b/block/qed.c index e7621693e2..a5111fd711 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1322,16 +1322,32 @@ static void qed_aio_next_io(QEDAIOCB *acb) } } -static BlockAIOCB *qed_aio_setup(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, - void *opaque, int flags) -{ - QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque); +typedef struct QEDRequestCo { + Coroutine *co; + bool done; + int ret; +} QEDRequestCo; - trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, - opaque, flags); +static void qed_co_request_cb(void *opaque, int ret) +{ + QEDRequestCo *co = opaque; + + co->done = true; + co->ret = ret; + qemu_coroutine_enter_if_inactive(co->co); +} + +static int coroutine_fn qed_co_request(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + int flags) +{ + QEDRequestCo co = { + .co = qemu_coroutine_self(), + .done = false, + }; + QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, qed_co_request_cb, &co); + + trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, &co, flags); acb->flags = flags; acb->qiov = qiov; @@ -1344,43 +1360,26 @@ static BlockAIOCB *qed_aio_setup(BlockDriverState *bs, /* Start request */ qed_aio_start_io(acb); - return &acb->common; -} -static BlockAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) -{ - return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); -} - -static BlockAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) -{ - return qed_aio_setup(bs, sector_num, qiov, nb_sectors, 
cb, - opaque, QED_AIOCB_WRITE); -} - -typedef struct { - Coroutine *co; - int ret; - bool done; -} QEDWriteZeroesCB; - -static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret) -{ - QEDWriteZeroesCB *cb = opaque; - - cb->done = true; - cb->ret = ret; - if (cb->co) { - aio_co_wake(cb->co); + if (!co.done) { + qemu_coroutine_yield(); } + + return co.ret; +} + +static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *qiov) +{ + return qed_co_request(bs, sector_num, qiov, nb_sectors, 0); +} + +static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *qiov) +{ + return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE); } static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, @@ -1388,9 +1387,7 @@ static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, int count, BdrvRequestFlags flags) { - BlockAIOCB *blockacb; BDRVQEDState *s = bs->opaque; - QEDWriteZeroesCB cb = { .done = false }; QEMUIOVector qiov; struct iovec iov; @@ -1407,19 +1404,9 @@ static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, iov.iov_len = count; qemu_iovec_init_external(&qiov, &iov, 1); - blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov, - count >> BDRV_SECTOR_BITS, - qed_co_pwrite_zeroes_cb, &cb, - QED_AIOCB_WRITE | QED_AIOCB_ZERO); - if (!blockacb) { - return -EIO; - } - if (!cb.done) { - cb.co = qemu_coroutine_self(); - qemu_coroutine_yield(); - } - assert(cb.done); - return cb.ret; + return qed_co_request(bs, offset >> BDRV_SECTOR_BITS, &qiov, + count >> BDRV_SECTOR_BITS, + QED_AIOCB_WRITE | QED_AIOCB_ZERO); } static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset, Error **errp) @@ -1615,8 +1602,8 @@ static BlockDriver bdrv_qed = { .bdrv_create = bdrv_qed_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_get_block_status = bdrv_qed_co_get_block_status, - 
.bdrv_aio_readv = bdrv_qed_aio_readv, - .bdrv_aio_writev = bdrv_qed_aio_writev, + .bdrv_co_readv = bdrv_qed_co_readv, + .bdrv_co_writev = bdrv_qed_co_writev, .bdrv_co_pwrite_zeroes = bdrv_qed_co_pwrite_zeroes, .bdrv_truncate = bdrv_qed_truncate, .bdrv_getlength = bdrv_qed_getlength, From 0806c3b5dd1aced4c50eda65e9ecc9cfab4ee58e Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 18 Nov 2016 15:32:17 +0100 Subject: [PATCH 49/60] qed: Use CoQueue for serialising allocations Now that we're running in coroutine context, the ad-hoc serialisation code (which drops a request that has to wait out of coroutine context) can be replaced by a CoQueue. This means that when we resume a serialised request, it is running in coroutine context again and its I/O isn't blocking any more. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed.c | 49 +++++++++++++++++-------------------------------- block/qed.h | 3 ++- 2 files changed, 19 insertions(+), 33 deletions(-) diff --git a/block/qed.c b/block/qed.c index a5111fd711..cd3ef55699 100644 --- a/block/qed.c +++ b/block/qed.c @@ -269,16 +269,10 @@ static void qed_plug_allocating_write_reqs(BDRVQEDState *s) static void qed_unplug_allocating_write_reqs(BDRVQEDState *s) { - QEDAIOCB *acb; - assert(s->allocating_write_reqs_plugged); s->allocating_write_reqs_plugged = false; - - acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); - if (acb) { - qed_aio_start_io(acb); - } + qemu_co_enter_next(&s->allocating_write_reqs); } static void qed_clear_need_check(void *opaque, int ret) @@ -305,7 +299,7 @@ static void qed_need_check_timer_cb(void *opaque) BDRVQEDState *s = opaque; /* The timer should only fire when allocating writes have drained */ - assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs)); + assert(!s->allocating_acb); trace_qed_need_check_timer_cb(s); @@ -388,7 +382,7 @@ static int bdrv_qed_do_open(BlockDriverState *bs, QDict *options, int flags, int ret; s->bs = bs; - QSIMPLEQ_INIT(&s->allocating_write_reqs); + 
qemu_co_queue_init(&s->allocating_write_reqs); ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header)); if (ret < 0) { @@ -910,11 +904,6 @@ static void qed_aio_complete_bh(void *opaque) qed_release(s); } -static void qed_resume_alloc_bh(void *opaque) -{ - qed_aio_start_io(opaque); -} - static void qed_aio_complete(QEDAIOCB *acb, int ret) { BDRVQEDState *s = acb_to_s(acb); @@ -942,13 +931,10 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret) * next request in the queue. This ensures that we don't cycle through * requests multiple times but rather finish one at a time completely. */ - if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { - QEDAIOCB *next_acb; - QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next); - next_acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); - if (next_acb) { - aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs), - qed_resume_alloc_bh, next_acb); + if (acb == s->allocating_acb) { + s->allocating_acb = NULL; + if (!qemu_co_queue_empty(&s->allocating_write_reqs)) { + qemu_co_enter_next(&s->allocating_write_reqs); } else if (s->header.features & QED_F_NEED_CHECK) { qed_start_need_check_timer(s); } @@ -1124,17 +1110,18 @@ static int qed_aio_write_alloc(QEDAIOCB *acb, size_t len) int ret; /* Cancel timer when the first allocating request comes in */ - if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) { + if (s->allocating_acb == NULL) { qed_cancel_need_check_timer(s); } /* Freeze this request if another allocating write is in progress */ - if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { - QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next); - } - if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) || - s->allocating_write_reqs_plugged) { - return -EINPROGRESS; /* wait for existing request to finish */ + if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) { + if (s->allocating_acb != NULL) { + qemu_co_queue_wait(&s->allocating_write_reqs, NULL); + assert(s->allocating_acb == NULL); + } + 
s->allocating_acb = acb; + return -EAGAIN; /* start over with looking up table entries */ } acb->cur_nclusters = qed_bytes_to_clusters(s, @@ -1313,10 +1300,8 @@ static void qed_aio_next_io(QEDAIOCB *acb) ret = qed_aio_read_data(acb, ret, offset, len); } - if (ret < 0) { - if (ret != -EINPROGRESS) { - qed_aio_complete(acb, ret); - } + if (ret < 0 && ret != -EAGAIN) { + qed_aio_complete(acb, ret); return; } } diff --git a/block/qed.h b/block/qed.h index 8644fed3a7..37558e425d 100644 --- a/block/qed.h +++ b/block/qed.h @@ -163,7 +163,8 @@ typedef struct { uint32_t l2_mask; /* Allocating write request queue */ - QSIMPLEQ_HEAD(, QEDAIOCB) allocating_write_reqs; + QEDAIOCB *allocating_acb; + CoQueue allocating_write_reqs; bool allocating_write_reqs_plugged; /* Periodic flush and clear need check flag */ From 48cc565e767d1cb4965150d258ebd15a1b3de488 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 18 Nov 2016 14:47:36 +0100 Subject: [PATCH 50/60] qed: Simplify request handling Now that we process a request in the same coroutine from beginning to end and don't drop out of it any more, we can look like a proper coroutine-based driver and simply call qed_aio_next_io() and get a return value from it instead of spawning an additional coroutine that reenters the parent when it's done. 
Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed.c | 101 +++++++++++----------------------------------------- block/qed.h | 3 +- 2 files changed, 22 insertions(+), 82 deletions(-) diff --git a/block/qed.c b/block/qed.c index cd3ef55699..e53f6b5abf 100644 --- a/block/qed.c +++ b/block/qed.c @@ -21,10 +21,6 @@ #include "qapi/qmp/qerror.h" #include "sysemu/block-backend.h" -static const AIOCBInfo qed_aiocb_info = { - .aiocb_size = sizeof(QEDAIOCB), -}; - static int bdrv_qed_probe(const uint8_t *buf, int buf_size, const char *filename) { @@ -253,13 +249,6 @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) return l2_table; } -static void qed_aio_next_io(QEDAIOCB *acb); - -static void qed_aio_start_io(QEDAIOCB *acb) -{ - qed_aio_next_io(acb); -} - static void qed_plug_allocating_write_reqs(BDRVQEDState *s) { assert(!s->allocating_write_reqs_plugged); @@ -751,7 +740,7 @@ static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs, static BDRVQEDState *acb_to_s(QEDAIOCB *acb) { - return acb->common.bs->opaque; + return acb->bs->opaque; } /** @@ -888,28 +877,10 @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index, } } -static void qed_aio_complete_bh(void *opaque) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - BlockCompletionFunc *cb = acb->common.cb; - void *user_opaque = acb->common.opaque; - int ret = acb->bh_ret; - - qemu_aio_unref(acb); - - /* Invoke callback */ - qed_acquire(s); - cb(user_opaque, ret); - qed_release(s); -} - -static void qed_aio_complete(QEDAIOCB *acb, int ret) +static void qed_aio_complete(QEDAIOCB *acb) { BDRVQEDState *s = acb_to_s(acb); - trace_qed_aio_complete(s, acb, ret); - /* Free resources */ qemu_iovec_destroy(&acb->cur_qiov); qed_unref_l2_cache_entry(acb->request.l2_table); @@ -920,11 +891,6 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret) acb->qiov->iov[0].iov_base = NULL; } - /* Arrange for a bh to invoke the completion function */ - 
acb->bh_ret = ret; - aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs), - qed_aio_complete_bh, acb); - /* Start next allocating write request waiting behind this one. Note that * requests enqueue themselves when they first hit an unallocated cluster * but they wait until the entire request is finished before waking up the @@ -1172,7 +1138,7 @@ static int qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) struct iovec *iov = acb->qiov->iov; if (!iov->iov_base) { - iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len); + iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len); if (iov->iov_base == NULL) { return -ENOMEM; } @@ -1231,7 +1197,7 @@ static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len) { QEDAIOCB *acb = opaque; BDRVQEDState *s = acb_to_s(acb); - BlockDriverState *bs = acb->common.bs; + BlockDriverState *bs = acb->bs; /* Adjust offset into cluster */ offset += qed_offset_into_cluster(s, acb->cur_pos); @@ -1260,7 +1226,7 @@ static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len) /** * Begin next I/O or complete the request */ -static void qed_aio_next_io(QEDAIOCB *acb) +static int qed_aio_next_io(QEDAIOCB *acb) { BDRVQEDState *s = acb_to_s(acb); uint64_t offset; @@ -1282,16 +1248,15 @@ static void qed_aio_next_io(QEDAIOCB *acb) /* Complete request */ if (acb->cur_pos >= acb->end_pos) { - qed_aio_complete(acb, 0); - return; + ret = 0; + break; } /* Find next cluster and start I/O */ len = acb->end_pos - acb->cur_pos; ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset); if (ret < 0) { - qed_aio_complete(acb, ret); - return; + break; } if (acb->flags & QED_AIOCB_WRITE) { @@ -1301,56 +1266,32 @@ static void qed_aio_next_io(QEDAIOCB *acb) } if (ret < 0 && ret != -EAGAIN) { - qed_aio_complete(acb, ret); - return; + break; } } -} -typedef struct QEDRequestCo { - Coroutine *co; - bool done; - int ret; -} QEDRequestCo; - -static void qed_co_request_cb(void 
*opaque, int ret) -{ - QEDRequestCo *co = opaque; - - co->done = true; - co->ret = ret; - qemu_coroutine_enter_if_inactive(co->co); + trace_qed_aio_complete(s, acb, ret); + qed_aio_complete(acb); + return ret; } static int coroutine_fn qed_co_request(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, int flags) { - QEDRequestCo co = { - .co = qemu_coroutine_self(), - .done = false, + QEDAIOCB acb = { + .bs = bs, + .cur_pos = (uint64_t) sector_num * BDRV_SECTOR_SIZE, + .end_pos = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE, + .qiov = qiov, + .flags = flags, }; - QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, qed_co_request_cb, &co); + qemu_iovec_init(&acb.cur_qiov, qiov->niov); - trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, &co, flags); - - acb->flags = flags; - acb->qiov = qiov; - acb->qiov_offset = 0; - acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; - acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE; - acb->backing_qiov = NULL; - acb->request.l2_table = NULL; - qemu_iovec_init(&acb->cur_qiov, qiov->niov); + trace_qed_aio_setup(bs->opaque, &acb, sector_num, nb_sectors, NULL, flags); /* Start request */ - qed_aio_start_io(acb); - - if (!co.done) { - qemu_coroutine_yield(); - } - - return co.ret; + return qed_aio_next_io(&acb); } static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs, diff --git a/block/qed.h b/block/qed.h index 37558e425d..fb80943c2d 100644 --- a/block/qed.h +++ b/block/qed.h @@ -129,8 +129,7 @@ enum { }; typedef struct QEDAIOCB { - BlockAIOCB common; - int bh_ret; /* final return status for completion bh */ + BlockDriverState *bs; QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */ int flags; /* QED_AIOCB_* bits ORed together */ uint64_t end_pos; /* request end on block device, in bytes */ From c0e8f98927929b65ffbfb320b7a1c79e0e620006 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 18 Nov 2016 16:04:59 +0100 Subject: [PATCH 51/60] qed: Use a coroutine for need_check_timer 
This fixes the last place where we degraded from AIO to actual blocking synchronous I/O requests. Putting it into a coroutine means that instead of blocking, the coroutine simply yields while doing I/O. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/block/qed.c b/block/qed.c index e53f6b5abf..eac8c2f19a 100644 --- a/block/qed.c +++ b/block/qed.c @@ -264,28 +264,10 @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s) qemu_co_enter_next(&s->allocating_write_reqs); } -static void qed_clear_need_check(void *opaque, int ret) -{ - BDRVQEDState *s = opaque; - - if (ret) { - qed_unplug_allocating_write_reqs(s); - return; - } - - s->header.features &= ~QED_F_NEED_CHECK; - ret = qed_write_header(s); - (void) ret; - - qed_unplug_allocating_write_reqs(s); - - ret = bdrv_flush(s->bs); - (void) ret; -} - -static void qed_need_check_timer_cb(void *opaque) +static void qed_need_check_timer_entry(void *opaque) { BDRVQEDState *s = opaque; + int ret; /* The timer should only fire when allocating writes have drained */ assert(!s->allocating_acb); @@ -296,8 +278,27 @@ static void qed_need_check_timer_cb(void *opaque) qed_plug_allocating_write_reqs(s); /* Ensure writes are on disk before clearing flag */ - bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s); + ret = bdrv_co_flush(s->bs->file->bs); qed_release(s); + if (ret < 0) { + qed_unplug_allocating_write_reqs(s); + return; + } + + s->header.features &= ~QED_F_NEED_CHECK; + ret = qed_write_header(s); + (void) ret; + + qed_unplug_allocating_write_reqs(s); + + ret = bdrv_co_flush(s->bs); + (void) ret; +} + +static void qed_need_check_timer_cb(void *opaque) +{ + Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque); + qemu_coroutine_enter(co); } void qed_acquire(BDRVQEDState *s) From 87f0d88261d120e428cd23305775026556e6cce6 Mon Sep 17 00:00:00 2001 From: Kevin Wolf 
Date: Mon, 12 Jun 2017 11:12:41 +0200 Subject: [PATCH 52/60] qed: Add coroutine_fn to I/O path functions Now that we stay in coroutine context for the whole request when doing reads or writes, we can add coroutine_fn annotations to many functions that can do I/O or yield directly. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qed-cluster.c | 5 +++-- block/qed.c | 44 ++++++++++++++++++++++++-------------------- block/qed.h | 5 +++-- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/block/qed-cluster.c b/block/qed-cluster.c index 88dc979d8c..d8d6e66a0f 100644 --- a/block/qed-cluster.c +++ b/block/qed-cluster.c @@ -86,8 +86,9 @@ static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s, * On failure QED_CLUSTER_L2 or QED_CLUSTER_L1 is returned for missing L2 or L1 * table offset, respectively. len is number of contiguous unallocated bytes. */ -int qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, - size_t *len, uint64_t *img_offset) +int coroutine_fn qed_find_cluster(BDRVQEDState *s, QEDRequest *request, + uint64_t pos, size_t *len, + uint64_t *img_offset) { uint64_t l2_offset; uint64_t offset = 0; diff --git a/block/qed.c b/block/qed.c index eac8c2f19a..48f2b0eb52 100644 --- a/block/qed.c +++ b/block/qed.c @@ -94,7 +94,7 @@ int qed_write_header_sync(BDRVQEDState *s) * This function only updates known header fields in-place and does not affect * extra data after the QED header. 
*/ -static int qed_write_header(BDRVQEDState *s) +static int coroutine_fn qed_write_header(BDRVQEDState *s) { /* We must write full sectors for O_DIRECT but cannot necessarily generate * the data following the header if an unrecognized compat feature is @@ -264,7 +264,7 @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s) qemu_co_enter_next(&s->allocating_write_reqs); } -static void qed_need_check_timer_entry(void *opaque) +static void coroutine_fn qed_need_check_timer_entry(void *opaque) { BDRVQEDState *s = opaque; int ret; @@ -757,9 +757,9 @@ static BDRVQEDState *acb_to_s(QEDAIOCB *acb) * This function reads qiov->size bytes starting at pos from the backing file. * If there is no backing file then zeroes are read. */ -static int qed_read_backing_file(BDRVQEDState *s, uint64_t pos, - QEMUIOVector *qiov, - QEMUIOVector **backing_qiov) +static int coroutine_fn qed_read_backing_file(BDRVQEDState *s, uint64_t pos, + QEMUIOVector *qiov, + QEMUIOVector **backing_qiov) { uint64_t backing_length = 0; size_t size; @@ -811,8 +811,9 @@ static int qed_read_backing_file(BDRVQEDState *s, uint64_t pos, * @len: Number of bytes * @offset: Byte offset in image file */ -static int qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, - uint64_t len, uint64_t offset) +static int coroutine_fn qed_copy_from_backing_file(BDRVQEDState *s, + uint64_t pos, uint64_t len, + uint64_t offset) { QEMUIOVector qiov; QEMUIOVector *backing_qiov = NULL; @@ -865,8 +866,9 @@ out: * The cluster offset may be an allocated byte offset in the image file, the * zero cluster marker, or the unallocated cluster marker. 
*/ -static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index, - unsigned int n, uint64_t cluster) +static void coroutine_fn qed_update_l2_table(BDRVQEDState *s, QEDTable *table, + int index, unsigned int n, + uint64_t cluster) { int i; for (i = index; i < index + n; i++) { @@ -878,7 +880,7 @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index, } } -static void qed_aio_complete(QEDAIOCB *acb) +static void coroutine_fn qed_aio_complete(QEDAIOCB *acb) { BDRVQEDState *s = acb_to_s(acb); @@ -911,7 +913,7 @@ static void qed_aio_complete(QEDAIOCB *acb) /** * Update L1 table with new L2 table offset and write it out */ -static int qed_aio_write_l1_update(QEDAIOCB *acb) +static int coroutine_fn qed_aio_write_l1_update(QEDAIOCB *acb) { BDRVQEDState *s = acb_to_s(acb); CachedL2Table *l2_table = acb->request.l2_table; @@ -939,7 +941,7 @@ static int qed_aio_write_l1_update(QEDAIOCB *acb) /** * Update L2 table with new cluster offsets and write them out */ -static int qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset) +static int coroutine_fn qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset) { BDRVQEDState *s = acb_to_s(acb); bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1; @@ -975,7 +977,7 @@ static int qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset) /** * Write data to the image file */ -static int qed_aio_write_main(QEDAIOCB *acb) +static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb) { BDRVQEDState *s = acb_to_s(acb); uint64_t offset = acb->cur_cluster + @@ -1018,7 +1020,7 @@ static int qed_aio_write_main(QEDAIOCB *acb) /** * Populate untouched regions of new data cluster */ -static int qed_aio_write_cow(QEDAIOCB *acb) +static int coroutine_fn qed_aio_write_cow(QEDAIOCB *acb) { BDRVQEDState *s = acb_to_s(acb); uint64_t start, len, offset; @@ -1071,7 +1073,7 @@ static bool qed_should_set_need_check(BDRVQEDState *s) * * This path is taken when writing to previously unallocated clusters. 
*/ -static int qed_aio_write_alloc(QEDAIOCB *acb, size_t len) +static int coroutine_fn qed_aio_write_alloc(QEDAIOCB *acb, size_t len) { BDRVQEDState *s = acb_to_s(acb); int ret; @@ -1132,7 +1134,8 @@ static int qed_aio_write_alloc(QEDAIOCB *acb, size_t len) * * This path is taken when writing to already allocated clusters. */ -static int qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) +static int coroutine_fn qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, + size_t len) { /* Allocate buffer for zero writes */ if (acb->flags & QED_AIOCB_ZERO) { @@ -1163,8 +1166,8 @@ static int qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) * @offset: Cluster offset in bytes * @len: Length in bytes */ -static int qed_aio_write_data(void *opaque, int ret, - uint64_t offset, size_t len) +static int coroutine_fn qed_aio_write_data(void *opaque, int ret, + uint64_t offset, size_t len) { QEDAIOCB *acb = opaque; @@ -1194,7 +1197,8 @@ static int qed_aio_write_data(void *opaque, int ret, * @offset: Cluster offset in bytes * @len: Length in bytes */ -static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len) +static int coroutine_fn qed_aio_read_data(void *opaque, int ret, + uint64_t offset, size_t len) { QEDAIOCB *acb = opaque; BDRVQEDState *s = acb_to_s(acb); @@ -1227,7 +1231,7 @@ static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len) /** * Begin next I/O or complete the request */ -static int qed_aio_next_io(QEDAIOCB *acb) +static int coroutine_fn qed_aio_next_io(QEDAIOCB *acb) { BDRVQEDState *s = acb_to_s(acb); uint64_t offset; diff --git a/block/qed.h b/block/qed.h index fb80943c2d..dd3a2d5519 100644 --- a/block/qed.h +++ b/block/qed.h @@ -213,8 +213,9 @@ int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, /** * Cluster functions */ -int qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, - size_t *len, uint64_t *img_offset); +int coroutine_fn 
qed_find_cluster(BDRVQEDState *s, QEDRequest *request, + uint64_t pos, size_t *len, + uint64_t *img_offset); /** * Consistency check From 0f714ec70673e734b4b0d5c99f7851c8d9e6fafe Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 16 Jun 2017 14:43:19 +0200 Subject: [PATCH 53/60] qed: Use bdrv_co_* for coroutine_fns All functions that are marked coroutine_fn can directly call the bdrv_co_* version of functions instead of going through the wrapper. Signed-off-by: Kevin Wolf Reviewed-by: Manos Pitsidianakis Reviewed-by: Stefan Hajnoczi --- block/qed.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/block/qed.c b/block/qed.c index 48f2b0eb52..c073baa340 100644 --- a/block/qed.c +++ b/block/qed.c @@ -116,7 +116,7 @@ static int coroutine_fn qed_write_header(BDRVQEDState *s) }; qemu_iovec_init_external(&qiov, &iov, 1); - ret = bdrv_preadv(s->bs->file, 0, &qiov); + ret = bdrv_co_preadv(s->bs->file, 0, qiov.size, &qiov, 0); if (ret < 0) { goto out; } @@ -124,7 +124,7 @@ static int coroutine_fn qed_write_header(BDRVQEDState *s) /* Update header */ qed_header_cpu_to_le(&s->header, (QEDHeader *) buf); - ret = bdrv_pwritev(s->bs->file, 0, &qiov); + ret = bdrv_co_pwritev(s->bs->file, 0, qiov.size, &qiov, 0); if (ret < 0) { goto out; } @@ -796,7 +796,7 @@ static int coroutine_fn qed_read_backing_file(BDRVQEDState *s, uint64_t pos, qemu_iovec_concat(*backing_qiov, qiov, 0, size); BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); - ret = bdrv_preadv(s->bs->backing, pos, *backing_qiov); + ret = bdrv_co_preadv(s->bs->backing, pos, size, *backing_qiov, 0); if (ret < 0) { return ret; } @@ -844,7 +844,7 @@ static int coroutine_fn qed_copy_from_backing_file(BDRVQEDState *s, } BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE); - ret = bdrv_pwritev(s->bs->file, offset, &qiov); + ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0); if (ret < 0) { goto out; } @@ -987,7 +987,8 @@ static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb) 
trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size); BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); - ret = bdrv_pwritev(s->bs->file, offset, &acb->cur_qiov); + ret = bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size, + &acb->cur_qiov, 0); if (ret < 0) { return ret; } @@ -1004,7 +1005,7 @@ static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb) * region. The solution is to flush after writing a new data * cluster and before updating the L2 table. */ - ret = bdrv_flush(s->bs->file->bs); + ret = bdrv_co_flush(s->bs->file->bs); if (ret < 0) { return ret; } @@ -1221,7 +1222,8 @@ static int coroutine_fn qed_aio_read_data(void *opaque, int ret, } BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - ret = bdrv_preadv(bs->file, offset, &acb->cur_qiov); + ret = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size, + &acb->cur_qiov, 0); if (ret < 0) { return ret; } From c5f1ad429cdf26023cf331075a7d327708e3db6d Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 18 Nov 2016 16:47:54 +0100 Subject: [PATCH 54/60] block: Remove bdrv_aio_readv/writev/flush() These functions are unused now. 
Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/io.c | 171 ------------------------------------------ block/trace-events | 3 - include/block/block.h | 8 -- 3 files changed, 182 deletions(-) diff --git a/block/io.c b/block/io.c index e158ae0174..132bcbbd7a 100644 --- a/block/io.c +++ b/block/io.c @@ -34,14 +34,6 @@ #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ -static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child, - int64_t offset, - QEMUIOVector *qiov, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write); -static void coroutine_fn bdrv_co_do_rw(void *opaque); static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int count, BdrvRequestFlags flags); @@ -2080,28 +2072,6 @@ int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) /**************************************************************/ /* async I/Os */ -BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque); - - assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size); - return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov, - 0, cb, opaque, false); -} - -BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque); - - assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size); - return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov, - 0, cb, opaque, true); -} - void bdrv_aio_cancel(BlockAIOCB *acb) { qemu_aio_ref(acb); @@ -2133,147 +2103,6 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb) } } -/**************************************************************/ -/* async block device emulation */ - -typedef struct BlockRequest { - union { 
- /* Used during read, write, trim */ - struct { - int64_t offset; - int bytes; - int flags; - QEMUIOVector *qiov; - }; - /* Used during ioctl */ - struct { - int req; - void *buf; - }; - }; - BlockCompletionFunc *cb; - void *opaque; - - int error; -} BlockRequest; - -typedef struct BlockAIOCBCoroutine { - BlockAIOCB common; - BdrvChild *child; - BlockRequest req; - bool is_write; - bool need_bh; - bool *done; -} BlockAIOCBCoroutine; - -static const AIOCBInfo bdrv_em_co_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBCoroutine), -}; - -static void bdrv_co_complete(BlockAIOCBCoroutine *acb) -{ - if (!acb->need_bh) { - bdrv_dec_in_flight(acb->common.bs); - acb->common.cb(acb->common.opaque, acb->req.error); - qemu_aio_unref(acb); - } -} - -static void bdrv_co_em_bh(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - - assert(!acb->need_bh); - bdrv_co_complete(acb); -} - -static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) -{ - acb->need_bh = false; - if (acb->req.error != -EINPROGRESS) { - BlockDriverState *bs = acb->common.bs; - - aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - } -} - -/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ -static void coroutine_fn bdrv_co_do_rw(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - - if (!acb->is_write) { - acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset, - acb->req.qiov->size, acb->req.qiov, acb->req.flags); - } else { - acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset, - acb->req.qiov->size, acb->req.qiov, acb->req.flags); - } - - bdrv_co_complete(acb); -} - -static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child, - int64_t offset, - QEMUIOVector *qiov, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write) -{ - Coroutine *co; - BlockAIOCBCoroutine *acb; - - /* Matched by bdrv_co_complete's bdrv_dec_in_flight. 
*/ - bdrv_inc_in_flight(child->bs); - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, child->bs, cb, opaque); - acb->child = child; - acb->need_bh = true; - acb->req.error = -EINPROGRESS; - acb->req.offset = offset; - acb->req.qiov = qiov; - acb->req.flags = flags; - acb->is_write = is_write; - - co = qemu_coroutine_create(bdrv_co_do_rw, acb); - bdrv_coroutine_enter(child->bs, co); - - bdrv_co_maybe_schedule_bh(acb); - return &acb->common; -} - -static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; - - acb->req.error = bdrv_co_flush(bs); - bdrv_co_complete(acb); -} - -BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_flush(bs, opaque); - - Coroutine *co; - BlockAIOCBCoroutine *acb; - - /* Matched by bdrv_co_complete's bdrv_dec_in_flight. */ - bdrv_inc_in_flight(bs); - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->need_bh = true; - acb->req.error = -EINPROGRESS; - - co = qemu_coroutine_create(bdrv_aio_flush_co_entry, acb); - bdrv_coroutine_enter(bs, co); - - bdrv_co_maybe_schedule_bh(acb); - return &acb->common; -} - /**************************************************************/ /* Coroutine block device emulation */ diff --git a/block/trace-events b/block/trace-events index 9a71c7fb04..752de6a054 100644 --- a/block/trace-events +++ b/block/trace-events @@ -9,9 +9,6 @@ blk_co_preadv(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags blk_co_pwritev(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x" # block/io.c -bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p" -bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p" -bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d 
opaque %p" bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags %#x" diff --git a/include/block/block.h b/include/block/block.h index a4f09df95a..623e7fca97 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -353,14 +353,6 @@ BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs, const char *node_name, Error **errp); /* async block I/O */ -BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num, - QEMUIOVector *iov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num, - QEMUIOVector *iov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque); void bdrv_aio_cancel(BlockAIOCB *acb); void bdrv_aio_cancel_async(BlockAIOCB *acb); From f5a5ca796932d04cb2a1cb9382a55f72795b3e06 Mon Sep 17 00:00:00 2001 From: Manos Pitsidianakis Date: Fri, 9 Jun 2017 13:18:08 +0300 Subject: [PATCH 55/60] block: change variable names in BlockDriverState Change the 'int count' parameter in *pwrite_zeros, *pdiscard related functions (and some others) to 'int bytes', as they both refer to bytes. This helps with code legibility. 
Signed-off-by: Manos Pitsidianakis Message-id: 20170609101808.13506-1-el13635@mail.ntua.gr Reviewed-by: Stefan Hajnoczi Signed-off-by: Max Reitz --- block/blkdebug.c | 36 ++++++++++++------------- block/blkreplay.c | 8 +++--- block/block-backend.c | 22 ++++++++-------- block/file-posix.c | 34 ++++++++++++------------ block/io.c | 48 +++++++++++++++++----------------- block/iscsi.c | 20 +++++++------- block/mirror.c | 8 +++--- block/nbd-client.c | 8 +++--- block/nbd-client.h | 4 +-- block/qcow2.c | 28 ++++++++++---------- block/qed.c | 8 +++--- block/raw-format.c | 8 +++--- block/rbd.c | 4 +-- block/sheepdog.c | 6 ++--- include/block/block.h | 8 +++--- include/block/block_int.h | 6 ++--- include/sysemu/block-backend.h | 20 +++++++------- qemu-io-cmds.c | 46 ++++++++++++++++---------------- 18 files changed, 161 insertions(+), 161 deletions(-) diff --git a/block/blkdebug.c b/block/blkdebug.c index 0618fc71c6..6431962d7e 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -575,7 +575,7 @@ static int blkdebug_co_flush(BlockDriverState *bs) } static int coroutine_fn blkdebug_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int count, + int64_t offset, int bytes, BdrvRequestFlags flags) { uint32_t align = MAX(bs->bl.request_alignment, @@ -586,29 +586,29 @@ static int coroutine_fn blkdebug_co_pwrite_zeroes(BlockDriverState *bs, * preferred alignment (so that we test the fallback to writes on * unaligned portions), and check that the block layer never hands * us anything unaligned that crosses an alignment boundary. 
*/ - if (count < align) { + if (bytes < align) { assert(QEMU_IS_ALIGNED(offset, align) || - QEMU_IS_ALIGNED(offset + count, align) || + QEMU_IS_ALIGNED(offset + bytes, align) || DIV_ROUND_UP(offset, align) == - DIV_ROUND_UP(offset + count, align)); + DIV_ROUND_UP(offset + bytes, align)); return -ENOTSUP; } assert(QEMU_IS_ALIGNED(offset, align)); - assert(QEMU_IS_ALIGNED(count, align)); + assert(QEMU_IS_ALIGNED(bytes, align)); if (bs->bl.max_pwrite_zeroes) { - assert(count <= bs->bl.max_pwrite_zeroes); + assert(bytes <= bs->bl.max_pwrite_zeroes); } - err = rule_check(bs, offset, count); + err = rule_check(bs, offset, bytes); if (err) { return err; } - return bdrv_co_pwrite_zeroes(bs->file, offset, count, flags); + return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); } static int coroutine_fn blkdebug_co_pdiscard(BlockDriverState *bs, - int64_t offset, int count) + int64_t offset, int bytes) { uint32_t align = bs->bl.pdiscard_alignment; int err; @@ -616,29 +616,29 @@ static int coroutine_fn blkdebug_co_pdiscard(BlockDriverState *bs, /* Only pass through requests that are larger than requested * minimum alignment, and ensure that unaligned requests do not * cross optimum discard boundaries. 
*/ - if (count < bs->bl.request_alignment) { + if (bytes < bs->bl.request_alignment) { assert(QEMU_IS_ALIGNED(offset, align) || - QEMU_IS_ALIGNED(offset + count, align) || + QEMU_IS_ALIGNED(offset + bytes, align) || DIV_ROUND_UP(offset, align) == - DIV_ROUND_UP(offset + count, align)); + DIV_ROUND_UP(offset + bytes, align)); return -ENOTSUP; } assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)); - assert(QEMU_IS_ALIGNED(count, bs->bl.request_alignment)); - if (align && count >= align) { + assert(QEMU_IS_ALIGNED(bytes, bs->bl.request_alignment)); + if (align && bytes >= align) { assert(QEMU_IS_ALIGNED(offset, align)); - assert(QEMU_IS_ALIGNED(count, align)); + assert(QEMU_IS_ALIGNED(bytes, align)); } if (bs->bl.max_pdiscard) { - assert(count <= bs->bl.max_pdiscard); + assert(bytes <= bs->bl.max_pdiscard); } - err = rule_check(bs, offset, count); + err = rule_check(bs, offset, bytes); if (err) { return err; } - return bdrv_co_pdiscard(bs->file->bs, offset, count); + return bdrv_co_pdiscard(bs->file->bs, offset, bytes); } static void blkdebug_close(BlockDriverState *bs) diff --git a/block/blkreplay.c b/block/blkreplay.c index 6aa5fd4156..61e44a1949 100755 --- a/block/blkreplay.c +++ b/block/blkreplay.c @@ -96,10 +96,10 @@ static int coroutine_fn blkreplay_co_pwritev(BlockDriverState *bs, } static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int count, BdrvRequestFlags flags) + int64_t offset, int bytes, BdrvRequestFlags flags) { uint64_t reqid = blkreplay_next_id(); - int ret = bdrv_co_pwrite_zeroes(bs->file, offset, count, flags); + int ret = bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); @@ -107,10 +107,10 @@ static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs, } static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs, - int64_t offset, int count) + int64_t offset, int bytes) { uint64_t reqid = 
blkreplay_next_id(); - int ret = bdrv_co_pdiscard(bs->file->bs, offset, count); + int ret = bdrv_co_pdiscard(bs->file->bs, offset, bytes); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); diff --git a/block/block-backend.c b/block/block-backend.c index a2bbae90b1..0df3457a09 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1099,9 +1099,9 @@ int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf, } int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, - int count, BdrvRequestFlags flags) + int bytes, BdrvRequestFlags flags) { - return blk_prw(blk, offset, NULL, count, blk_write_entry, + return blk_prw(blk, offset, NULL, bytes, blk_write_entry, flags | BDRV_REQ_ZERO_WRITE); } @@ -1311,10 +1311,10 @@ static void blk_aio_pdiscard_entry(void *opaque) } BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, - int64_t offset, int count, + int64_t offset, int bytes, BlockCompletionFunc *cb, void *opaque) { - return blk_aio_prwv(blk, offset, count, NULL, blk_aio_pdiscard_entry, 0, + return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0, cb, opaque); } @@ -1374,14 +1374,14 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, return blk_aio_prwv(blk, req, 0, &qiov, blk_aio_ioctl_entry, 0, cb, opaque); } -int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int count) +int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes) { - int ret = blk_check_byte_request(blk, offset, count); + int ret = blk_check_byte_request(blk, offset, bytes); if (ret < 0) { return ret; } - return bdrv_co_pdiscard(blk_bs(blk), offset, count); + return bdrv_co_pdiscard(blk_bs(blk), offset, bytes); } int blk_co_flush(BlockBackend *blk) @@ -1760,9 +1760,9 @@ void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk, } int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, - int count, BdrvRequestFlags flags) + int bytes, BdrvRequestFlags flags) { - return 
blk_co_pwritev(blk, offset, count, NULL, + return blk_co_pwritev(blk, offset, bytes, NULL, flags | BDRV_REQ_ZERO_WRITE); } @@ -1789,9 +1789,9 @@ static void blk_pdiscard_entry(void *opaque) rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, rwco->qiov->size); } -int blk_pdiscard(BlockBackend *blk, int64_t offset, int count) +int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes) { - return blk_prw(blk, offset, NULL, count, blk_pdiscard_entry, 0); + return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0); } int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, diff --git a/block/file-posix.c b/block/file-posix.c index de2d3a2e3c..3927fabf06 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1485,7 +1485,7 @@ static int aio_worker(void *arg) static int paio_submit_co(BlockDriverState *bs, int fd, int64_t offset, QEMUIOVector *qiov, - int count, int type) + int bytes, int type) { RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); ThreadPool *pool; @@ -1494,22 +1494,22 @@ static int paio_submit_co(BlockDriverState *bs, int fd, acb->aio_type = type; acb->aio_fildes = fd; - acb->aio_nbytes = count; + acb->aio_nbytes = bytes; acb->aio_offset = offset; if (qiov) { acb->aio_iov = qiov->iov; acb->aio_niov = qiov->niov; - assert(qiov->size == count); + assert(qiov->size == bytes); } - trace_paio_submit_co(offset, count, type); + trace_paio_submit_co(offset, bytes, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_co(pool, aio_worker, acb); } static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, - int64_t offset, QEMUIOVector *qiov, int count, + int64_t offset, QEMUIOVector *qiov, int bytes, BlockCompletionFunc *cb, void *opaque, int type) { RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); @@ -1519,7 +1519,7 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, acb->aio_type = type; acb->aio_fildes = fd; - acb->aio_nbytes = count; + acb->aio_nbytes = bytes; acb->aio_offset = offset; if 
(qiov) { @@ -1528,7 +1528,7 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, assert(qiov->size == acb->aio_nbytes); } - trace_paio_submit(acb, opaque, offset, count, type); + trace_paio_submit(acb, opaque, offset, bytes, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } @@ -2109,26 +2109,26 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, } static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs, - int64_t offset, int count, + int64_t offset, int bytes, BlockCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; - return paio_submit(bs, s->fd, offset, NULL, count, + return paio_submit(bs, s->fd, offset, NULL, bytes, cb, opaque, QEMU_AIO_DISCARD); } static int coroutine_fn raw_co_pwrite_zeroes( BlockDriverState *bs, int64_t offset, - int count, BdrvRequestFlags flags) + int bytes, BdrvRequestFlags flags) { BDRVRawState *s = bs->opaque; if (!(flags & BDRV_REQ_MAY_UNMAP)) { - return paio_submit_co(bs, s->fd, offset, NULL, count, + return paio_submit_co(bs, s->fd, offset, NULL, bytes, QEMU_AIO_WRITE_ZEROES); } else if (s->discard_zeroes) { - return paio_submit_co(bs, s->fd, offset, NULL, count, + return paio_submit_co(bs, s->fd, offset, NULL, bytes, QEMU_AIO_DISCARD); } return -ENOTSUP; @@ -2560,7 +2560,7 @@ static int fd_open(BlockDriverState *bs) } static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs, - int64_t offset, int count, + int64_t offset, int bytes, BlockCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; @@ -2568,12 +2568,12 @@ static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs, if (fd_open(bs) < 0) { return NULL; } - return paio_submit(bs, s->fd, offset, NULL, count, + return paio_submit(bs, s->fd, offset, NULL, bytes, cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int 
count, BdrvRequestFlags flags) + int64_t offset, int bytes, BdrvRequestFlags flags) { BDRVRawState *s = bs->opaque; int rc; @@ -2583,10 +2583,10 @@ static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, return rc; } if (!(flags & BDRV_REQ_MAY_UNMAP)) { - return paio_submit_co(bs, s->fd, offset, NULL, count, + return paio_submit_co(bs, s->fd, offset, NULL, bytes, QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); } else if (s->discard_zeroes) { - return paio_submit_co(bs, s->fd, offset, NULL, count, + return paio_submit_co(bs, s->fd, offset, NULL, bytes, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } return -ENOTSUP; diff --git a/block/io.c b/block/io.c index 132bcbbd7a..9bba730a7e 100644 --- a/block/io.c +++ b/block/io.c @@ -35,7 +35,7 @@ #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int count, BdrvRequestFlags flags); + int64_t offset, int bytes, BdrvRequestFlags flags); void bdrv_parent_drained_begin(BlockDriverState *bs) { @@ -666,12 +666,12 @@ int bdrv_write(BdrvChild *child, int64_t sector_num, } int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, - int count, BdrvRequestFlags flags) + int bytes, BdrvRequestFlags flags) { QEMUIOVector qiov; struct iovec iov = { .iov_base = NULL, - .iov_len = count, + .iov_len = bytes, }; qemu_iovec_init_external(&qiov, &iov, 1); @@ -1212,7 +1212,7 @@ int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num, #define MAX_WRITE_ZEROES_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS) static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int count, BdrvRequestFlags flags) + int64_t offset, int bytes, BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; QEMUIOVector qiov; @@ -1230,12 +1230,12 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, assert(alignment % bs->bl.request_alignment == 0); head = offset % alignment; - tail = (offset + count) % 
alignment; + tail = (offset + bytes) % alignment; max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); assert(max_write_zeroes >= bs->bl.request_alignment); - while (count > 0 && !ret) { - int num = count; + while (bytes > 0 && !ret) { + int num = bytes; /* Align request. Block drivers can expect the "bulk" of the request * to be aligned, and that unaligned requests do not cross cluster @@ -1245,7 +1245,7 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, /* Make a small request up to the first aligned sector. For * convenience, limit this request to max_transfer even if * we don't need to fall back to writes. */ - num = MIN(MIN(count, max_transfer), alignment - head); + num = MIN(MIN(bytes, max_transfer), alignment - head); head = (head + num) % alignment; assert(num < max_write_zeroes); } else if (tail && num > alignment) { @@ -1306,7 +1306,7 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, } offset += num; - count -= num; + bytes -= num; } fail: @@ -1658,15 +1658,15 @@ int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num, } int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, - int count, BdrvRequestFlags flags) + int bytes, BdrvRequestFlags flags) { - trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags); + trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); if (!(child->bs->open_flags & BDRV_O_UNMAP)) { flags &= ~BDRV_REQ_MAY_UNMAP; } - return bdrv_co_pwritev(child, offset, count, NULL, + return bdrv_co_pwritev(child, offset, bytes, NULL, BDRV_REQ_ZERO_WRITE | flags); } @@ -2248,18 +2248,18 @@ int bdrv_flush(BlockDriverState *bs) typedef struct DiscardCo { BlockDriverState *bs; int64_t offset; - int count; + int bytes; int ret; } DiscardCo; static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque) { DiscardCo *rwco = opaque; - rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count); + rwco->ret = bdrv_co_pdiscard(rwco->bs, 
rwco->offset, rwco->bytes); } int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, - int count) + int bytes) { BdrvTrackedRequest req; int max_pdiscard, ret; @@ -2269,7 +2269,7 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, return -ENOMEDIUM; } - ret = bdrv_check_byte_request(bs, offset, count); + ret = bdrv_check_byte_request(bs, offset, bytes); if (ret < 0) { return ret; } else if (bs->read_only) { @@ -2294,10 +2294,10 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); assert(align % bs->bl.request_alignment == 0); head = offset % align; - tail = (offset + count) % align; + tail = (offset + bytes) % align; bdrv_inc_in_flight(bs); - tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD); + tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); if (ret < 0) { @@ -2308,13 +2308,13 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, align); assert(max_pdiscard >= bs->bl.request_alignment); - while (count > 0) { + while (bytes > 0) { int ret; - int num = count; + int num = bytes; if (head) { /* Make small requests to get to alignment boundaries. 
*/ - num = MIN(count, align - head); + num = MIN(bytes, align - head); if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) { num %= bs->bl.request_alignment; } @@ -2358,7 +2358,7 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, } offset += num; - count -= num; + bytes -= num; } ret = 0; out: @@ -2370,13 +2370,13 @@ out: return ret; } -int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count) +int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) { Coroutine *co; DiscardCo rwco = { .bs = bs, .offset = offset, - .count = count, + .bytes = bytes, .ret = NOT_DONE, }; diff --git a/block/iscsi.c b/block/iscsi.c index b5f7a228b9..54067e2620 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -1116,14 +1116,14 @@ iscsi_getlength(BlockDriverState *bs) } static int -coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) +coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) { IscsiLun *iscsilun = bs->opaque; struct IscsiTask iTask; struct unmap_list list; int r = 0; - if (!is_byte_request_lun_aligned(offset, count, iscsilun)) { + if (!is_byte_request_lun_aligned(offset, bytes, iscsilun)) { return -ENOTSUP; } @@ -1133,7 +1133,7 @@ coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) } list.lba = offset / iscsilun->block_size; - list.num = count / iscsilun->block_size; + list.num = bytes / iscsilun->block_size; iscsi_co_init_iscsitask(iscsilun, &iTask); qemu_mutex_lock(&iscsilun->mutex); @@ -1174,7 +1174,7 @@ retry: } iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS, - count >> BDRV_SECTOR_BITS); + bytes >> BDRV_SECTOR_BITS); out_unlock: qemu_mutex_unlock(&iscsilun->mutex); @@ -1183,7 +1183,7 @@ out_unlock: static int coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, - int count, BdrvRequestFlags flags) + int bytes, BdrvRequestFlags flags) { IscsiLun *iscsilun = bs->opaque; struct IscsiTask iTask; @@ -1192,7 +1192,7 
@@ coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, bool use_16_for_ws = iscsilun->use_16_for_rw; int r = 0; - if (!is_byte_request_lun_aligned(offset, count, iscsilun)) { + if (!is_byte_request_lun_aligned(offset, bytes, iscsilun)) { return -ENOTSUP; } @@ -1215,7 +1215,7 @@ coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, } lba = offset / iscsilun->block_size; - nb_blocks = count / iscsilun->block_size; + nb_blocks = bytes / iscsilun->block_size; if (iscsilun->zeroblock == NULL) { iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size); @@ -1273,17 +1273,17 @@ retry: if (iTask.status != SCSI_STATUS_GOOD) { iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS, - count >> BDRV_SECTOR_BITS); + bytes >> BDRV_SECTOR_BITS); r = iTask.err_code; goto out_unlock; } if (flags & BDRV_REQ_MAY_UNMAP) { iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS, - count >> BDRV_SECTOR_BITS); + bytes >> BDRV_SECTOR_BITS); } else { iscsi_allocmap_set_allocated(iscsilun, offset >> BDRV_SECTOR_BITS, - count >> BDRV_SECTOR_BITS); + bytes >> BDRV_SECTOR_BITS); } out_unlock: diff --git a/block/mirror.c b/block/mirror.c index 19afcc6f1a..68744a17e8 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1063,15 +1063,15 @@ static int64_t coroutine_fn bdrv_mirror_top_get_block_status( } static int coroutine_fn bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int count, BdrvRequestFlags flags) + int64_t offset, int bytes, BdrvRequestFlags flags) { - return bdrv_co_pwrite_zeroes(bs->backing, offset, count, flags); + return bdrv_co_pwrite_zeroes(bs->backing, offset, bytes, flags); } static int coroutine_fn bdrv_mirror_top_pdiscard(BlockDriverState *bs, - int64_t offset, int count) + int64_t offset, int bytes) { - return bdrv_co_pdiscard(bs->backing->bs, offset, count); + return bdrv_co_pdiscard(bs->backing->bs, offset, bytes); } static void bdrv_mirror_top_refresh_filename(BlockDriverState *bs, QDict 
*opts) diff --git a/block/nbd-client.c b/block/nbd-client.c index d64e775385..02e928142e 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -259,14 +259,14 @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, } int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, - int count, BdrvRequestFlags flags) + int bytes, BdrvRequestFlags flags) { ssize_t ret; NBDClientSession *client = nbd_get_client_session(bs); NBDRequest request = { .type = NBD_CMD_WRITE_ZEROES, .from = offset, - .len = count, + .len = bytes, }; NBDReply reply; @@ -316,13 +316,13 @@ int nbd_client_co_flush(BlockDriverState *bs) return -reply.error; } -int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) +int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) { NBDClientSession *client = nbd_get_client_session(bs); NBDRequest request = { .type = NBD_CMD_TRIM, .from = offset, - .len = count, + .len = bytes, }; NBDReply reply; ssize_t ret; diff --git a/block/nbd-client.h b/block/nbd-client.h index 891ba44a20..49636bc621 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -42,12 +42,12 @@ int nbd_client_init(BlockDriverState *bs, Error **errp); void nbd_client_close(BlockDriverState *bs); -int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count); +int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes); int nbd_client_co_flush(BlockDriverState *bs); int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags); int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, - int count, BdrvRequestFlags flags); + int bytes, BdrvRequestFlags flags); int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags); diff --git a/block/qcow2.c b/block/qcow2.c index 088ffe1673..2f94f0326e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2508,16 +2508,16 @@ static bool 
is_zero_sectors(BlockDriverState *bs, int64_t start, } static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int count, BdrvRequestFlags flags) + int64_t offset, int bytes, BdrvRequestFlags flags) { int ret; BDRVQcow2State *s = bs->opaque; uint32_t head = offset % s->cluster_size; - uint32_t tail = (offset + count) % s->cluster_size; + uint32_t tail = (offset + bytes) % s->cluster_size; - trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, count); - if (offset + count == bs->total_sectors * BDRV_SECTOR_SIZE) { + trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, bytes); + if (offset + bytes == bs->total_sectors * BDRV_SECTOR_SIZE) { tail = 0; } @@ -2526,12 +2526,12 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs, uint64_t off; unsigned int nr; - assert(head + count <= s->cluster_size); + assert(head + bytes <= s->cluster_size); /* check whether remainder of cluster already reads as zero */ if (!(is_zero_sectors(bs, cl_start, DIV_ROUND_UP(head, BDRV_SECTOR_SIZE)) && - is_zero_sectors(bs, (offset + count) >> BDRV_SECTOR_BITS, + is_zero_sectors(bs, (offset + bytes) >> BDRV_SECTOR_BITS, DIV_ROUND_UP(-tail & (s->cluster_size - 1), BDRV_SECTOR_SIZE)))) { return -ENOTSUP; @@ -2540,7 +2540,7 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs, qemu_co_mutex_lock(&s->lock); /* We can have new write after previous check */ offset = cl_start << BDRV_SECTOR_BITS; - count = s->cluster_size; + bytes = s->cluster_size; nr = s->cluster_size; ret = qcow2_get_cluster_offset(bs, offset, &nr, &off); if (ret != QCOW2_CLUSTER_UNALLOCATED && @@ -2553,33 +2553,33 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs, qemu_co_mutex_lock(&s->lock); } - trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, count); + trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, bytes); /* Whatever is left can use real zero clusters */ - ret = qcow2_cluster_zeroize(bs, 
offset, count, flags); + ret = qcow2_cluster_zeroize(bs, offset, bytes, flags); qemu_co_mutex_unlock(&s->lock); return ret; } static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs, - int64_t offset, int count) + int64_t offset, int bytes) { int ret; BDRVQcow2State *s = bs->opaque; - if (!QEMU_IS_ALIGNED(offset | count, s->cluster_size)) { - assert(count < s->cluster_size); + if (!QEMU_IS_ALIGNED(offset | bytes, s->cluster_size)) { + assert(bytes < s->cluster_size); /* Ignore partial clusters, except for the special case of the * complete partial cluster at the end of an unaligned file */ if (!QEMU_IS_ALIGNED(offset, s->cluster_size) || - offset + count != bs->total_sectors * BDRV_SECTOR_SIZE) { + offset + bytes != bs->total_sectors * BDRV_SECTOR_SIZE) { return -ENOTSUP; } } qemu_co_mutex_lock(&s->lock); - ret = qcow2_cluster_discard(bs, offset, count, QCOW2_DISCARD_REQUEST, + ret = qcow2_cluster_discard(bs, offset, bytes, QCOW2_DISCARD_REQUEST, false); qemu_co_mutex_unlock(&s->lock); return ret; diff --git a/block/qed.c b/block/qed.c index c073baa340..385381a78a 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1317,7 +1317,7 @@ static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs, static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, - int count, + int bytes, BdrvRequestFlags flags) { BDRVQEDState *s = bs->opaque; @@ -1326,7 +1326,7 @@ static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, /* Fall back if the request is not aligned */ if (qed_offset_into_cluster(s, offset) || - qed_offset_into_cluster(s, count)) { + qed_offset_into_cluster(s, bytes)) { return -ENOTSUP; } @@ -1334,11 +1334,11 @@ static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, * then it will be allocated during request processing. 
*/ iov.iov_base = NULL; - iov.iov_len = count; + iov.iov_len = bytes; qemu_iovec_init_external(&qiov, &iov, 1); return qed_co_request(bs, offset >> BDRV_SECTOR_BITS, &qiov, - count >> BDRV_SECTOR_BITS, + bytes >> BDRV_SECTOR_BITS, QED_AIOCB_WRITE | QED_AIOCB_ZERO); } diff --git a/block/raw-format.c b/block/raw-format.c index 36e65036f0..0d185fe41b 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -264,7 +264,7 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, } static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int count, + int64_t offset, int bytes, BdrvRequestFlags flags) { BDRVRawState *s = bs->opaque; @@ -272,18 +272,18 @@ static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs, return -EINVAL; } offset += s->offset; - return bdrv_co_pwrite_zeroes(bs->file, offset, count, flags); + return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); } static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs, - int64_t offset, int count) + int64_t offset, int bytes) { BDRVRawState *s = bs->opaque; if (offset > UINT64_MAX - s->offset) { return -EINVAL; } offset += s->offset; - return bdrv_co_pdiscard(bs->file->bs, offset, count); + return bdrv_co_pdiscard(bs->file->bs, offset, bytes); } static int64_t raw_getlength(BlockDriverState *bs) diff --git a/block/rbd.c b/block/rbd.c index ff44e5f437..9da02cdceb 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -1065,11 +1065,11 @@ static int qemu_rbd_snap_list(BlockDriverState *bs, #ifdef LIBRBD_SUPPORTS_DISCARD static BlockAIOCB *qemu_rbd_aio_pdiscard(BlockDriverState *bs, int64_t offset, - int count, + int bytes, BlockCompletionFunc *cb, void *opaque) { - return rbd_start_aio(bs, offset, NULL, count, cb, opaque, + return rbd_start_aio(bs, offset, NULL, bytes, cb, opaque, RBD_AIO_DISCARD); } #endif diff --git a/block/sheepdog.c b/block/sheepdog.c index c9236679c6..a87ee5facd 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -2935,7 +2935,7 
@@ static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset, - int count) + int bytes) { SheepdogAIOCB acb; BDRVSheepdogState *s = bs->opaque; @@ -2953,11 +2953,11 @@ static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset, iov.iov_len = sizeof(zero); discard_iov.iov = &iov; discard_iov.niov = 1; - if (!QEMU_IS_ALIGNED(offset | count, BDRV_SECTOR_SIZE)) { + if (!QEMU_IS_ALIGNED(offset | bytes, BDRV_SECTOR_SIZE)) { return -ENOTSUP; } sd_aio_setup(&acb, s, &discard_iov, offset >> BDRV_SECTOR_BITS, - count >> BDRV_SECTOR_BITS, AIOCB_DISCARD_OBJ); + bytes >> BDRV_SECTOR_BITS, AIOCB_DISCARD_OBJ); sd_co_rw_vector(&acb); sd_aio_complete(&acb); diff --git a/include/block/block.h b/include/block/block.h index 623e7fca97..85e4be7462 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -276,7 +276,7 @@ int bdrv_read(BdrvChild *child, int64_t sector_num, int bdrv_write(BdrvChild *child, int64_t sector_num, const uint8_t *buf, int nb_sectors); int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, - int count, BdrvRequestFlags flags); + int bytes, BdrvRequestFlags flags); int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags); int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes); int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov); @@ -295,7 +295,7 @@ int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num, * because it may allocate memory for the entire region. 
*/ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, - int count, BdrvRequestFlags flags); + int bytes, BdrvRequestFlags flags); BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, const char *backing_file); int bdrv_get_backing_file_depth(BlockDriverState *bs); @@ -411,8 +411,8 @@ void bdrv_drain_all(void); } \ waited_; }) -int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count); -int bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, int count); +int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes); +int bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes); int bdrv_has_zero_init_1(BlockDriverState *bs); int bdrv_has_zero_init(BlockDriverState *bs); bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs); diff --git a/include/block/block_int.h b/include/block/block_int.h index 748970055e..15fa602150 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -142,7 +142,7 @@ struct BlockDriver { BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque); BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs, - int64_t offset, int count, + int64_t offset, int bytes, BlockCompletionFunc *cb, void *opaque); int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs, @@ -163,9 +163,9 @@ struct BlockDriver { * will be called instead. */ int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs, - int64_t offset, int count, BdrvRequestFlags flags); + int64_t offset, int bytes, BdrvRequestFlags flags); int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs, - int64_t offset, int count); + int64_t offset, int bytes); /* * Building block for bdrv_block_status[_above]. 
The driver should diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 999eb2333a..1e05281fff 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -130,7 +130,7 @@ BlockBackend *blk_by_dev(void *dev); BlockBackend *blk_by_qdev_id(const char *id, Error **errp); void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, void *opaque); int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf, - int count); + int bytes); int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); @@ -138,13 +138,13 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, - int count, BdrvRequestFlags flags); + int bytes, BdrvRequestFlags flags); BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset, - int count, BdrvRequestFlags flags, + int bytes, BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque); int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags); -int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count); -int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count, +int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int bytes); +int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int bytes, BdrvRequestFlags flags); int64_t blk_getlength(BlockBackend *blk); void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr); @@ -157,7 +157,7 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, BlockCompletionFunc *cb, void *opaque); BlockAIOCB *blk_aio_flush(BlockBackend *blk, BlockCompletionFunc *cb, void *opaque); -BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int count, +BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int bytes, BlockCompletionFunc *cb, void 
*opaque); void blk_aio_cancel(BlockAIOCB *acb); void blk_aio_cancel_async(BlockAIOCB *acb); @@ -165,7 +165,7 @@ int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf); int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf); BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, BlockCompletionFunc *cb, void *opaque); -int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int count); +int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes); int blk_co_flush(BlockBackend *blk); int blk_flush(BlockBackend *blk); int blk_commit_all(void); @@ -220,11 +220,11 @@ int blk_get_open_flags_from_root_state(BlockBackend *blk); void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk, BlockCompletionFunc *cb, void *opaque); int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, - int count, BdrvRequestFlags flags); + int bytes, BdrvRequestFlags flags); int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf, - int count); + int bytes); int blk_truncate(BlockBackend *blk, int64_t offset, Error **errp); -int blk_pdiscard(BlockBackend *blk, int64_t offset, int count); +int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes); int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, int64_t pos, int size); int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size); diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index 4b2278f040..b0ea327024 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -451,13 +451,13 @@ fail: } static int do_pread(BlockBackend *blk, char *buf, int64_t offset, - int64_t count, int64_t *total) + int64_t bytes, int64_t *total) { - if (count > INT_MAX) { + if (bytes > INT_MAX) { return -ERANGE; } - *total = blk_pread(blk, offset, (uint8_t *)buf, count); + *total = blk_pread(blk, offset, (uint8_t *)buf, bytes); if (*total < 0) { return *total; } @@ -465,13 +465,13 @@ static int do_pread(BlockBackend *blk, char *buf, int64_t offset, } 
static int do_pwrite(BlockBackend *blk, char *buf, int64_t offset, - int64_t count, int flags, int64_t *total) + int64_t bytes, int flags, int64_t *total) { - if (count > INT_MAX) { + if (bytes > INT_MAX) { return -ERANGE; } - *total = blk_pwrite(blk, offset, (uint8_t *)buf, count, flags); + *total = blk_pwrite(blk, offset, (uint8_t *)buf, bytes, flags); if (*total < 0) { return *total; } @@ -481,7 +481,7 @@ static int do_pwrite(BlockBackend *blk, char *buf, int64_t offset, typedef struct { BlockBackend *blk; int64_t offset; - int64_t count; + int64_t bytes; int64_t *total; int flags; int ret; @@ -492,7 +492,7 @@ static void coroutine_fn co_pwrite_zeroes_entry(void *opaque) { CoWriteZeroes *data = opaque; - data->ret = blk_co_pwrite_zeroes(data->blk, data->offset, data->count, + data->ret = blk_co_pwrite_zeroes(data->blk, data->offset, data->bytes, data->flags); data->done = true; if (data->ret < 0) { @@ -500,23 +500,23 @@ static void coroutine_fn co_pwrite_zeroes_entry(void *opaque) return; } - *data->total = data->count; + *data->total = data->bytes; } static int do_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, - int64_t count, int flags, int64_t *total) + int64_t bytes, int flags, int64_t *total) { Coroutine *co; CoWriteZeroes data = { .blk = blk, .offset = offset, - .count = count, + .bytes = bytes, .total = total, .flags = flags, .done = false, }; - if (count > INT_MAX) { + if (bytes > INT_MAX) { return -ERANGE; } @@ -533,19 +533,19 @@ static int do_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, } static int do_write_compressed(BlockBackend *blk, char *buf, int64_t offset, - int64_t count, int64_t *total) + int64_t bytes, int64_t *total) { int ret; - if (count >> 9 > BDRV_REQUEST_MAX_SECTORS) { + if (bytes >> 9 > BDRV_REQUEST_MAX_SECTORS) { return -ERANGE; } - ret = blk_pwrite_compressed(blk, offset, buf, count); + ret = blk_pwrite_compressed(blk, offset, buf, bytes); if (ret < 0) { return ret; } - *total = count; + *total = bytes; return 1; } @@ 
-1701,7 +1701,7 @@ static int discard_f(BlockBackend *blk, int argc, char **argv) struct timeval t1, t2; bool Cflag = false, qflag = false; int c, ret; - int64_t offset, count; + int64_t offset, bytes; while ((c = getopt(argc, argv, "Cq")) != -1) { switch (c) { @@ -1727,11 +1727,11 @@ static int discard_f(BlockBackend *blk, int argc, char **argv) } optind++; - count = cvtnum(argv[optind]); - if (count < 0) { - print_cvtnum_err(count, argv[optind]); + bytes = cvtnum(argv[optind]); + if (bytes < 0) { + print_cvtnum_err(bytes, argv[optind]); return 0; - } else if (count >> BDRV_SECTOR_BITS > BDRV_REQUEST_MAX_SECTORS) { + } else if (bytes >> BDRV_SECTOR_BITS > BDRV_REQUEST_MAX_SECTORS) { printf("length cannot exceed %"PRIu64", given %s\n", (uint64_t)BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS, argv[optind]); @@ -1739,7 +1739,7 @@ static int discard_f(BlockBackend *blk, int argc, char **argv) } gettimeofday(&t1, NULL); - ret = blk_pdiscard(blk, offset, count); + ret = blk_pdiscard(blk, offset, bytes); gettimeofday(&t2, NULL); if (ret < 0) { @@ -1750,7 +1750,7 @@ static int discard_f(BlockBackend *blk, int argc, char **argv) /* Finally, report back -- -C gives a parsable format */ if (!qflag) { t2 = tsub(t2, t1); - print_report("discard", &t2, offset, count, count, 1, Cflag); + print_report("discard", &t2, offset, bytes, bytes, 1, Cflag); } out: From 4172a00373b2c81374293becc02b16b7f8c76659 Mon Sep 17 00:00:00 2001 From: "sochin.jiang" Date: Thu, 15 Jun 2017 14:47:33 +0800 Subject: [PATCH 56/60] fix: avoid an infinite loop or a dangling pointer problem in img_commit img_commit could fall into an infinite loop calling run_block_job() if its blockjob fails on any I/O error, fix this already known problem. 
Signed-off-by: sochin.jiang Message-id: 1497509253-28941-1-git-send-email-sochin.jiang@huawei.com Signed-off-by: Max Reitz --- blockjob.c | 4 ++-- include/block/blockjob.h | 18 ++++++++++++++++++ qemu-img.c | 20 +++++++++++++------- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/blockjob.c b/blockjob.c index a0d7e29b83..70a78188b7 100644 --- a/blockjob.c +++ b/blockjob.c @@ -139,7 +139,7 @@ static void block_job_resume(BlockJob *job) block_job_enter(job); } -static void block_job_ref(BlockJob *job) +void block_job_ref(BlockJob *job) { ++job->refcnt; } @@ -148,7 +148,7 @@ static void block_job_attached_aio_context(AioContext *new_context, void *opaque); static void block_job_detach_aio_context(void *opaque); -static void block_job_unref(BlockJob *job) +void block_job_unref(BlockJob *job) { if (--job->refcnt == 0) { BlockDriverState *bs = blk_bs(job->blk); diff --git a/include/block/blockjob.h b/include/block/blockjob.h index 09c7c694b5..67c0968fa5 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -320,6 +320,24 @@ void block_job_iostatus_reset(BlockJob *job); */ BlockJobTxn *block_job_txn_new(void); +/** + * block_job_ref: + * + * Add a reference to BlockJob refcnt, it will be decreased with + * block_job_unref, and then be freed if it comes to be the last + * reference. + */ +void block_job_ref(BlockJob *job); + +/** + * block_job_unref: + * + * Release a reference that was previously acquired with block_job_ref + * or block_job_create. If it's the last reference to the object, it will be + * freed. 
+ */ +void block_job_unref(BlockJob *job); + /** * block_job_txn_unref: * diff --git a/qemu-img.c b/qemu-img.c index 0ad698d7f1..e70d5155e4 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -887,22 +887,28 @@ static void common_block_job_cb(void *opaque, int ret) static void run_block_job(BlockJob *job, Error **errp) { AioContext *aio_context = blk_get_aio_context(job->blk); + int ret = 0; - /* FIXME In error cases, the job simply goes away and we access a dangling - * pointer below. */ aio_context_acquire(aio_context); + block_job_ref(job); do { aio_poll(aio_context, true); qemu_progress_print(job->len ? ((float)job->offset / job->len * 100.f) : 0.0f, 0); - } while (!job->ready); + } while (!job->ready && !job->completed); - block_job_complete_sync(job, errp); + if (!job->completed) { + ret = block_job_complete_sync(job, errp); + } else { + ret = job->ret; + } + block_job_unref(job); aio_context_release(aio_context); - /* A block job may finish instantaneously without publishing any progress, - * so just signal completion here */ - qemu_progress_print(100.f, 0); + /* publish completion progress only when success */ + if (!ret) { + qemu_progress_print(100.f, 0); + } } static int img_commit(int argc, char **argv) From de81d72d3d13a19edf4d461be3b0f5a877be0234 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 13 Jun 2017 19:20:05 +0200 Subject: [PATCH 57/60] blkdebug: Catch bs->exact_filename overflow The bs->exact_filename field may not be sufficient to store the full blkdebug node filename. In this case, we should not generate a filename at all instead of an unusable one. 
Cc: qemu-stable@nongnu.org Reported-by: Qu Wenruo Signed-off-by: Max Reitz Message-id: 20170613172006.19685-2-mreitz@redhat.com Reviewed-by: Alberto Garcia Reviewed-by: Stefan Hajnoczi Signed-off-by: Max Reitz --- block/blkdebug.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/block/blkdebug.c b/block/blkdebug.c index 6431962d7e..a1b24b9b0d 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -839,9 +839,13 @@ static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options) } if (!force_json && bs->file->bs->exact_filename[0]) { - snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "blkdebug:%s:%s", s->config_file ?: "", - bs->file->bs->exact_filename); + int ret = snprintf(bs->exact_filename, sizeof(bs->exact_filename), + "blkdebug:%s:%s", s->config_file ?: "", + bs->file->bs->exact_filename); + if (ret >= sizeof(bs->exact_filename)) { + /* An overflow makes the filename unusable, so do not report any */ + bs->exact_filename[0] = 0; + } } opts = qdict_new(); From 05cc758a3dfc79488d0a8eb7f5830a41871e78d0 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 13 Jun 2017 19:20:06 +0200 Subject: [PATCH 58/60] blkverify: Catch bs->exact_filename overflow The bs->exact_filename field may not be sufficient to store the full blkverify node filename. In this case, we should not generate a filename at all instead of an unusable one. 
Cc: qemu-stable@nongnu.org Reported-by: Qu Wenruo Signed-off-by: Max Reitz Message-id: 20170613172006.19685-3-mreitz@redhat.com Reviewed-by: Alberto Garcia Reviewed-by: Stefan Hajnoczi Signed-off-by: Max Reitz --- block/blkverify.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/block/blkverify.c b/block/blkverify.c index 6b0a603cf0..06369f9eac 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -301,10 +301,14 @@ static void blkverify_refresh_filename(BlockDriverState *bs, QDict *options) if (bs->file->bs->exact_filename[0] && s->test_file->bs->exact_filename[0]) { - snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "blkverify:%s:%s", - bs->file->bs->exact_filename, - s->test_file->bs->exact_filename); + int ret = snprintf(bs->exact_filename, sizeof(bs->exact_filename), + "blkverify:%s:%s", + bs->file->bs->exact_filename, + s->test_file->bs->exact_filename); + if (ret >= sizeof(bs->exact_filename)) { + /* An overflow makes the filename unusable, so do not report any */ + bs->exact_filename[0] = 0; + } } } From f69165a8feca055cf4a37d13ab0fc5beec3cb372 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 13 Jun 2017 22:57:26 +0200 Subject: [PATCH 59/60] block: Do not strcmp() with NULL uri->scheme uri_parse(...)->scheme may be NULL. In fact, probably every field may be NULL, and the callers do test this for all of the other fields but not for scheme (except for block/gluster.c; block/vxhs.c does not access that field at all). We can easily fix this by using g_strcmp0() instead of strcmp(). 
Cc: qemu-stable@nongnu.org Signed-off-by: Max Reitz Message-id: 20170613205726.13544-1-mreitz@redhat.com Reviewed-by: Stefan Hajnoczi Signed-off-by: Max Reitz --- block/nbd.c | 6 +++--- block/nfs.c | 2 +- block/sheepdog.c | 6 +++--- block/ssh.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/block/nbd.c b/block/nbd.c index e946ea944d..d529305330 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -64,11 +64,11 @@ static int nbd_parse_uri(const char *filename, QDict *options) } /* transport */ - if (!strcmp(uri->scheme, "nbd")) { + if (!g_strcmp0(uri->scheme, "nbd")) { is_unix = false; - } else if (!strcmp(uri->scheme, "nbd+tcp")) { + } else if (!g_strcmp0(uri->scheme, "nbd+tcp")) { is_unix = false; - } else if (!strcmp(uri->scheme, "nbd+unix")) { + } else if (!g_strcmp0(uri->scheme, "nbd+unix")) { is_unix = true; } else { ret = -EINVAL; diff --git a/block/nfs.c b/block/nfs.c index 6b8b5b653d..c3c5de0113 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -82,7 +82,7 @@ static int nfs_parse_uri(const char *filename, QDict *options, Error **errp) error_setg(errp, "Invalid URI specified"); goto out; } - if (strcmp(uri->scheme, "nfs") != 0) { + if (g_strcmp0(uri->scheme, "nfs") != 0) { error_setg(errp, "URI scheme must be 'nfs'"); goto out; } diff --git a/block/sheepdog.c b/block/sheepdog.c index a87ee5facd..08d7b11e9d 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -1046,11 +1046,11 @@ static void sd_parse_uri(SheepdogConfig *cfg, const char *filename, } /* transport */ - if (!strcmp(uri->scheme, "sheepdog")) { + if (!g_strcmp0(uri->scheme, "sheepdog")) { is_unix = false; - } else if (!strcmp(uri->scheme, "sheepdog+tcp")) { + } else if (!g_strcmp0(uri->scheme, "sheepdog+tcp")) { is_unix = false; - } else if (!strcmp(uri->scheme, "sheepdog+unix")) { + } else if (!g_strcmp0(uri->scheme, "sheepdog+unix")) { is_unix = true; } else { error_setg(&err, "URI scheme must be 'sheepdog', 'sheepdog+tcp'," diff --git a/block/ssh.c b/block/ssh.c index 
bac3453c3e..52964416da 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -204,7 +204,7 @@ static int parse_uri(const char *filename, QDict *options, Error **errp) return -EINVAL; } - if (strcmp(uri->scheme, "ssh") != 0) { + if (g_strcmp0(uri->scheme, "ssh") != 0) { error_setg(errp, "URI scheme must be 'ssh'"); goto err; } From 2a245709099e98bca638694c182f1e5627567df7 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 19 Jun 2017 16:00:02 +0100 Subject: [PATCH 60/60] qemu-img: don't shadow opts variable in img_dd() It's confusing when two different variables have the same name in one function. Cc: Reda Sallahi Signed-off-by: Stefan Hajnoczi Message-id: 20170619150002.3033-1-stefanha@redhat.com Signed-off-by: Max Reitz --- qemu-img.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index e70d5155e4..91ad6bebbf 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -4255,15 +4255,12 @@ static int img_dd(int argc, char **argv) case 'U': force_share = true; break; - case OPTION_OBJECT: { - QemuOpts *opts; - opts = qemu_opts_parse_noisily(&qemu_object_opts, - optarg, true); - if (!opts) { + case OPTION_OBJECT: + if (!qemu_opts_parse_noisily(&qemu_object_opts, optarg, true)) { ret = -1; goto out; } - } break; + break; case OPTION_IMAGE_OPTS: image_opts = true; break;