From 653df36bbe58e20258610bc74c5940c456b31084 Mon Sep 17 00:00:00 2001
From: Aurelien Jarno <aurelien@aurel32.net>
Date: Sat, 1 Jan 2011 21:50:34 +0100
Subject: [PATCH 01/23] qcow2: fix unaligned access

cpu_to_be64w() is called with an obviously non-aligned pointer. Use
cpu_to_be64wu() instead. It fixes unaligned accesses errors on IA64
hosts.

Cc: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cluster.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 6928c6341d..c3ef550f7e 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -81,7 +81,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
     /* set new table */
     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
     cpu_to_be32w((uint32_t*)data, new_l1_size);
-    cpu_to_be64w((uint64_t*)(data + 4), new_l1_table_offset);
+    cpu_to_be64wu((uint64_t*)(data + 4), new_l1_table_offset);
     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data));
     if (ret < 0) {
         goto fail;

From 710da702beb0dc4aeeab1b9e712cd9473b083a89 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 10 Jan 2011 12:33:02 +0100
Subject: [PATCH 02/23] qemu-img snapshot: Use writeback caching

None of the other qemu-img subcommands uses writethrough, and there's no reason
why snapshot should be special.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 qemu-img.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qemu-img.c b/qemu-img.c
index afd9ed2e0e..1e65ea82d6 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1068,7 +1068,7 @@ static int img_snapshot(int argc, char **argv)
     int action = 0;
     qemu_timeval tv;
 
-    bdrv_oflags = BDRV_O_RDWR;
+    bdrv_oflags = BDRV_O_FLAGS | BDRV_O_RDWR;
     /* Parse commandline parameters */
     for(;;) {
         c = getopt(argc, argv, "la:c:d:h");

From c90f1b3297943f2f142d8114bef1092f9ac9acef Mon Sep 17 00:00:00 2001
From: Jes Sorensen <Jes.Sorensen@redhat.com>
Date: Thu, 6 Jan 2011 17:02:23 +0100
Subject: [PATCH 03/23] do_snapshot_blkdev() error on missing snapshot_file
 argument

Current code does not support snapshot internally to the running
image. Error in case no snapshot_file is specified.

Signed-off-by: Jes Sorensen <Jes.Sorensen@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/blockdev.c b/blockdev.c
index d7add36929..662f7a9d86 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -526,6 +526,12 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data)
     int ret = 0;
     int flags;
 
+    if (!filename) {
+        qerror_report(QERR_MISSING_PARAMETER, "snapshot_file");
+        ret = -1;
+        goto out;
+    }
+
     bs = bdrv_find(device);
     if (!bs) {
         qerror_report(QERR_DEVICE_NOT_FOUND, device);

From 70b4f4bb05ff5e6812c6593eeefbd19bd61b517d Mon Sep 17 00:00:00 2001
From: Jes Sorensen <Jes.Sorensen@redhat.com>
Date: Wed, 5 Jan 2011 11:41:02 +0100
Subject: [PATCH 04/23] Make strtosz() return int64_t instead of ssize_t

strtosz() needs to return a 64 bit type even on 32 bit
architectures. Otherwise qemu-img will fail to create disk
images >= 2GB

Signed-off-by: Jes Sorensen <Jes.Sorensen@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 cutils.c      | 8 ++++----
 monitor.c     | 2 +-
 qemu-common.h | 4 ++--
 qemu-img.c    | 2 +-
 vl.c          | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/cutils.c b/cutils.c
index 7984bc1ca4..4d2e27c13a 100644
--- a/cutils.c
+++ b/cutils.c
@@ -291,9 +291,9 @@ int fcntl_setfl(int fd, int flag)
  * value must be terminated by whitespace, ',' or '\0'. Return -1 on
  * error.
  */
-ssize_t strtosz_suffix(const char *nptr, char **end, const char default_suffix)
+int64_t strtosz_suffix(const char *nptr, char **end, const char default_suffix)
 {
-    ssize_t retval = -1;
+    int64_t retval = -1;
     char *endptr, c, d;
     int mul_required = 0;
     double val, mul, integral, fraction;
@@ -365,7 +365,7 @@ ssize_t strtosz_suffix(const char *nptr, char **end, const char default_suffix)
             goto fail;
         }
     }
-    if ((val * mul >= ~(size_t)0) || val < 0) {
+    if ((val * mul >= INT64_MAX) || val < 0) {
         goto fail;
     }
     retval = val * mul;
@@ -378,7 +378,7 @@ fail:
     return retval;
 }
 
-ssize_t strtosz(const char *nptr, char **end)
+int64_t strtosz(const char *nptr, char **end)
 {
     return strtosz_suffix(nptr, end, STRTOSZ_DEFSUFFIX_MB);
 }
diff --git a/monitor.c b/monitor.c
index d291158c2f..0cda3dad2c 100644
--- a/monitor.c
+++ b/monitor.c
@@ -4162,7 +4162,7 @@ static const mon_cmd_t *monitor_parse_command(Monitor *mon,
             break;
         case 'o':
             {
-                ssize_t val;
+                int64_t val;
                 char *end;
 
                 while (qemu_isspace(*p)) {
diff --git a/qemu-common.h b/qemu-common.h
index c766b990eb..c351131623 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -158,8 +158,8 @@ int fcntl_setfl(int fd, int flag);
 #define STRTOSZ_DEFSUFFIX_MB	'M'
 #define STRTOSZ_DEFSUFFIX_KB	'K'
 #define STRTOSZ_DEFSUFFIX_B	'B'
-ssize_t strtosz(const char *nptr, char **end);
-ssize_t strtosz_suffix(const char *nptr, char **end, const char default_suffix);
+int64_t strtosz(const char *nptr, char **end);
+int64_t strtosz_suffix(const char *nptr, char **end, const char default_suffix);
 
 /* path.c */
 void init_paths(const char *prefix);
diff --git a/qemu-img.c b/qemu-img.c
index 1e65ea82d6..4a3735811c 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -320,7 +320,7 @@ static int img_create(int argc, char **argv)
 
     /* Get image size, if specified */
     if (optind < argc) {
-        ssize_t sval;
+        int64_t sval;
         sval = strtosz_suffix(argv[optind++], NULL, STRTOSZ_DEFSUFFIX_B);
         if (sval < 0) {
             error_report("Invalid image size specified! You may use k, M, G or "
diff --git a/vl.c b/vl.c
index 0292184273..14255c4948 100644
--- a/vl.c
+++ b/vl.c
@@ -804,7 +804,7 @@ static void numa_add(const char *optarg)
         if (get_param_value(option, 128, "mem", optarg) == 0) {
             node_mem[nodenr] = 0;
         } else {
-            ssize_t sval;
+            int64_t sval;
             sval = strtosz(option, NULL);
             if (sval < 0) {
                 fprintf(stderr, "qemu: invalid numa mem size: %s\n", optarg);
@@ -2245,7 +2245,7 @@ int main(int argc, char **argv, char **envp)
                 exit(0);
                 break;
             case QEMU_OPTION_m: {
-                ssize_t value;
+                int64_t value;
 
                 value = strtosz(optarg, NULL);
                 if (value < 0) {

From 8b6b2afcf85dd5ff33075e93a2e30fbea34c5a55 Mon Sep 17 00:00:00 2001
From: Pierre Riteau <Pierre.Riteau@irisa.fr>
Date: Wed, 12 Jan 2011 14:41:00 +0100
Subject: [PATCH 05/23] Avoid divide by zero when there is no block device to
 migrate

When block migration is requested and no read-write block device is
present, a divide by zero exception is triggered because
total_sector_sum equals zero.

Signed-off-by: Pierre Riteau <Pierre.Riteau@irisa.fr>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block-migration.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/block-migration.c b/block-migration.c
index 14753254d6..60b9fc0ef0 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -350,7 +350,12 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f)
         }
     }
 
-    progress = completed_sector_sum * 100 / block_mig_state.total_sector_sum;
+    if (block_mig_state.total_sector_sum != 0) {
+        progress = completed_sector_sum * 100 /
+                   block_mig_state.total_sector_sum;
+    } else {
+        progress = 100;
+    }
     if (progress != block_mig_state.prev_progress) {
         block_mig_state.prev_progress = progress;
         qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)

From cd369c4634e58a99fb82f076e6117bfdf0012b8e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Dec 2010 13:45:48 +0100
Subject: [PATCH 06/23] ide: factor dma handling helpers

Factor the DMA I/O path that is duplicated between read and write
commands, into common helpers using the s->is_read flag added for
the macio ATA controller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/ide/core.c     | 103 +++++++++++++++-------------------------------
 hw/ide/internal.h |   4 +-
 hw/ide/pci.c      |   9 +---
 3 files changed, 36 insertions(+), 80 deletions(-)

diff --git a/hw/ide/core.c b/hw/ide/core.c
index 9496e990b9..e93dd4616c 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -487,16 +487,18 @@ static int ide_handle_rw_error(IDEState *s, int error, int op)
     return 1;
 }
 
-void ide_read_dma_cb(void *opaque, int ret)
+void ide_dma_cb(void *opaque, int ret)
 {
     IDEState *s = opaque;
     int n;
     int64_t sector_num;
 
     if (ret < 0) {
-        if (ide_handle_rw_error(s, -ret,
-            BM_STATUS_DMA_RETRY | BM_STATUS_RETRY_READ))
-        {
+        int op = BM_STATUS_DMA_RETRY;
+
+        if (s->is_read)
+            op |= BM_STATUS_RETRY_READ;
+        if (ide_handle_rw_error(s, -ret, op)) {
             return;
         }
     }
@@ -504,7 +506,7 @@ void ide_read_dma_cb(void *opaque, int ret)
     n = s->io_buffer_size >> 9;
     sector_num = ide_get_sector(s);
     if (n > 0) {
-        dma_buf_commit(s, 1);
+        dma_buf_commit(s, s->is_read);
         sector_num += n;
         ide_set_sector(s, sector_num);
         s->nsector -= n;
@@ -514,32 +516,44 @@ void ide_read_dma_cb(void *opaque, int ret)
     if (s->nsector == 0) {
         s->status = READY_STAT | SEEK_STAT;
         ide_set_irq(s->bus);
-    eot:
-        s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT);
-        ide_set_inactive(s);
-        return;
+        goto eot;
     }
 
     /* launch next transfer */
     n = s->nsector;
-    s->io_buffer_index = 0;
+    if (s->is_read)
+        s->io_buffer_index = 0;
     s->io_buffer_size = n * 512;
-    if (s->bus->dma->ops->prepare_buf(s->bus->dma, 1) == 0)
+    if (s->bus->dma->ops->prepare_buf(s->bus->dma, s->is_read) == 0)
         goto eot;
+
 #ifdef DEBUG_AIO
-    printf("aio_read: sector_num=%" PRId64 " n=%d\n", sector_num, n);
+    printf("ide_dma_cb: sector_num=%" PRId64 " n=%d, is_read=%d\n",
+           sector_num, n, s->is_read);
 #endif
-    s->bus->dma->aiocb = dma_bdrv_read(s->bs, &s->sg, sector_num, ide_read_dma_cb, s);
-    ide_dma_submit_check(s, ide_read_dma_cb);
+
+    if (s->is_read) {
+        s->bus->dma->aiocb = dma_bdrv_read(s->bs, &s->sg, sector_num,
+                                           ide_dma_cb, s);
+    } else {
+        s->bus->dma->aiocb = dma_bdrv_write(s->bs, &s->sg, sector_num,
+                                            ide_dma_cb, s);
+    }
+    ide_dma_submit_check(s, ide_dma_cb);
+    return;
+
+eot:
+   s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT);
+   ide_set_inactive(s);
 }
 
-static void ide_sector_read_dma(IDEState *s)
+static void ide_sector_start_dma(IDEState *s, int is_read)
 {
     s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
     s->io_buffer_index = 0;
     s->io_buffer_size = 0;
-    s->is_read = 1;
-    s->bus->dma->ops->start_dma(s->bus->dma, s, ide_read_dma_cb);
+    s->is_read = is_read;
+    s->bus->dma->ops->start_dma(s->bus->dma, s, ide_dma_cb);
 }
 
 static void ide_sector_write_timer_cb(void *opaque)
@@ -594,57 +608,6 @@ void ide_sector_write(IDEState *s)
     }
 }
 
-void ide_write_dma_cb(void *opaque, int ret)
-{
-    IDEState *s = opaque;
-    int n;
-    int64_t sector_num;
-
-    if (ret < 0) {
-        if (ide_handle_rw_error(s, -ret,  BM_STATUS_DMA_RETRY))
-            return;
-    }
-
-    n = s->io_buffer_size >> 9;
-    sector_num = ide_get_sector(s);
-    if (n > 0) {
-        dma_buf_commit(s, 0);
-        sector_num += n;
-        ide_set_sector(s, sector_num);
-        s->nsector -= n;
-    }
-
-    /* end of transfer ? */
-    if (s->nsector == 0) {
-        s->status = READY_STAT | SEEK_STAT;
-        ide_set_irq(s->bus);
-    eot:
-        s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT);
-        ide_set_inactive(s);
-        return;
-    }
-
-    n = s->nsector;
-    s->io_buffer_size = n * 512;
-    /* launch next transfer */
-    if (s->bus->dma->ops->prepare_buf(s->bus->dma, 0) == 0)
-        goto eot;
-#ifdef DEBUG_AIO
-    printf("aio_write: sector_num=%" PRId64 " n=%d\n", sector_num, n);
-#endif
-    s->bus->dma->aiocb = dma_bdrv_write(s->bs, &s->sg, sector_num, ide_write_dma_cb, s);
-    ide_dma_submit_check(s, ide_write_dma_cb);
-}
-
-static void ide_sector_write_dma(IDEState *s)
-{
-    s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
-    s->io_buffer_index = 0;
-    s->io_buffer_size = 0;
-    s->is_read = 0;
-    s->bus->dma->ops->start_dma(s->bus->dma, s, ide_write_dma_cb);
-}
-
 void ide_atapi_cmd_ok(IDEState *s)
 {
     s->error = 0;
@@ -1858,7 +1821,7 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
         if (!s->bs)
             goto abort_cmd;
 	ide_cmd_lba48_transform(s, lba48);
-        ide_sector_read_dma(s);
+        ide_sector_start_dma(s, 1);
         break;
 	case WIN_WRITEDMA_EXT:
 	lba48 = 1;
@@ -1867,7 +1830,7 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
         if (!s->bs)
             goto abort_cmd;
 	ide_cmd_lba48_transform(s, lba48);
-        ide_sector_write_dma(s);
+        ide_sector_start_dma(s, 0);
         s->media_changed = 1;
         break;
     case WIN_READ_NATIVE_MAX_EXT:
diff --git a/hw/ide/internal.h b/hw/ide/internal.h
index 697c3b4dc1..d533fb63b3 100644
--- a/hw/ide/internal.h
+++ b/hw/ide/internal.h
@@ -439,7 +439,6 @@ struct IDEState {
     uint32_t mdata_size;
     uint8_t *mdata_storage;
     int media_changed;
-    /* for pmac */
     int is_read;
     /* SMART */
     uint8_t smart_enabled;
@@ -560,8 +559,7 @@ void ide_init2_with_non_qdev_drives(IDEBus *bus, DriveInfo *hd0,
 void ide_init_ioport(IDEBus *bus, int iobase, int iobase2);
 
 void ide_exec_cmd(IDEBus *bus, uint32_t val);
-void ide_read_dma_cb(void *opaque, int ret);
-void ide_write_dma_cb(void *opaque, int ret);
+void ide_dma_cb(void *opaque, int ret);
 void ide_sector_write(IDEState *s);
 void ide_sector_read(IDEState *s);
 void ide_flush_cache(IDEState *s);
diff --git a/hw/ide/pci.c b/hw/ide/pci.c
index 510b2de597..987caffa3a 100644
--- a/hw/ide/pci.c
+++ b/hw/ide/pci.c
@@ -178,14 +178,9 @@ static void bmdma_restart_dma(BMDMAState *bm, int is_read)
     s->io_buffer_index = 0;
     s->io_buffer_size = 0;
     s->nsector = bm->nsector;
+    s->is_read = is_read;
     bm->cur_addr = bm->addr;
-
-    if (is_read) {
-        bm->dma_cb = ide_read_dma_cb;
-    } else {
-        bm->dma_cb = ide_write_dma_cb;
-    }
-
+    bm->dma_cb = ide_dma_cb;
     bmdma_start_dma(&bm->dma, s, bm->dma_cb);
 }
 

From 596bb44dead047249c11df24b0e1ffaa514f4909 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Dec 2010 13:45:58 +0100
Subject: [PATCH 07/23] ide: also reset io_buffer_index for writes

Currenly the code only resets the io_buffer_index field for reads,
but the code seems to expect this for all types of I/O.  I guess
we simply don't hit large enough transfers that would require this
often enough.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/ide/core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/ide/core.c b/hw/ide/core.c
index e93dd4616c..12b9c53f73 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -521,8 +521,7 @@ void ide_dma_cb(void *opaque, int ret)
 
     /* launch next transfer */
     n = s->nsector;
-    if (s->is_read)
-        s->io_buffer_index = 0;
+    s->io_buffer_index = 0;
     s->io_buffer_size = n * 512;
     if (s->bus->dma->ops->prepare_buf(s->bus->dma, s->is_read) == 0)
         goto eot;

From c641483fbe0aa08cd7c0580d019dc2d5a7e71138 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Dec 2010 13:46:09 +0100
Subject: [PATCH 08/23] ide: kill ide_dma_submit_check

Merge ide_dma_submit_check into it's only caller.  Also use tail recursion
using a goto instead of a real recursion - this avoid overflowing the
stack in the pathological situation of an recurring error that is ignored.
We'll still be busy looping in ide_dma_cb, but at least won't eat up
all stack space after this.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/ide/core.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/hw/ide/core.c b/hw/ide/core.c
index 12b9c53f73..e698c13ac5 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -321,14 +321,6 @@ static inline void ide_abort_command(IDEState *s)
     s->error = ABRT_ERR;
 }
 
-static inline void ide_dma_submit_check(IDEState *s,
-          BlockDriverCompletionFunc *dma_cb)
-{
-    if (s->bus->dma->aiocb)
-	return;
-    dma_cb(s, -1);
-}
-
 /* prepare data transfer and tell what to do after */
 static void ide_transfer_start(IDEState *s, uint8_t *buf, int size,
                                EndTransferFunc *end_transfer_func)
@@ -493,6 +485,7 @@ void ide_dma_cb(void *opaque, int ret)
     int n;
     int64_t sector_num;
 
+handle_rw_error:
     if (ret < 0) {
         int op = BM_STATUS_DMA_RETRY;
 
@@ -538,7 +531,11 @@ void ide_dma_cb(void *opaque, int ret)
         s->bus->dma->aiocb = dma_bdrv_write(s->bs, &s->sg, sector_num,
                                             ide_dma_cb, s);
     }
-    ide_dma_submit_check(s, ide_dma_cb);
+
+    if (!s->bus->dma->aiocb) {
+        ret = -1;
+        goto handle_rw_error;
+    }
     return;
 
 eot:

From 493810940bfaad0fd5dd9bfb79cdc89519f89588 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 10 Jan 2011 17:15:10 +0100
Subject: [PATCH 09/23] qcow2: Add QcowCache

This adds some new cache functions to qcow2 which can be used for caching
refcount blocks and L2 tables. When used with cache=writethrough they work
like the old caching code which is spread all over qcow2, so for this case we
have merely a cleanup.

The interesting case is with writeback caching (this includes cache=none) where
data isn't written to disk immediately but only kept in cache initially. This
leads to some form of metadata write batching which avoids the current "write
to refcount block, flush, write to L2 table" pattern for each single request
when a lot of cluster allocations happen. Instead, cache entries are only
written out if its required to maintain the right order. In the pure cluster
allocation case this means that all metadata updates for requests are done in
memory initially and on sync, first the refcount blocks are written to disk,
then fsync, then L2 tables.

This improves performance of scenarios with lots of cluster allocations
noticably (e.g. installation or after taking a snapshot).

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 Makefile.objs       |   2 +-
 block/qcow2-cache.c | 290 ++++++++++++++++++++++++++++++++++++++++++++
 block/qcow2.h       |  19 +++
 3 files changed, 310 insertions(+), 1 deletion(-)
 create mode 100644 block/qcow2-cache.c

diff --git a/Makefile.objs b/Makefile.objs
index fda366d11c..93406ff392 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -19,7 +19,7 @@ block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
-block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
+block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
new file mode 100644
index 0000000000..f7c4e2af0d
--- /dev/null
+++ b/block/qcow2-cache.c
@@ -0,0 +1,290 @@
+/*
+ * L2/refcount table cache for the QCOW2 format
+ *
+ * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block_int.h"
+#include "qemu-common.h"
+#include "qcow2.h"
+
+typedef struct Qcow2CachedTable {
+    void*   table;
+    int64_t offset;
+    bool    dirty;
+    int     cache_hits;
+    int     ref;
+} Qcow2CachedTable;
+
+struct Qcow2Cache {
+    int                     size;
+    Qcow2CachedTable*       entries;
+    struct Qcow2Cache*      depends;
+    bool                    writethrough;
+};
+
+Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
+    bool writethrough)
+{
+    BDRVQcowState *s = bs->opaque;
+    Qcow2Cache *c;
+    int i;
+
+    c = qemu_mallocz(sizeof(*c));
+    c->size = num_tables;
+    c->entries = qemu_mallocz(sizeof(*c->entries) * num_tables);
+    c->writethrough = writethrough;
+
+    for (i = 0; i < c->size; i++) {
+        c->entries[i].table = qemu_blockalign(bs, s->cluster_size);
+    }
+
+    return c;
+}
+
+int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c)
+{
+    int i;
+
+    for (i = 0; i < c->size; i++) {
+        assert(c->entries[i].ref == 0);
+        qemu_vfree(c->entries[i].table);
+    }
+
+    qemu_free(c->entries);
+    qemu_free(c);
+
+    return 0;
+}
+
+static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c)
+{
+    int ret;
+
+    ret = qcow2_cache_flush(bs, c->depends);
+    if (ret < 0) {
+        return ret;
+    }
+
+    c->depends = NULL;
+    return 0;
+}
+
+static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret;
+
+    if (!c->entries[i].dirty || !c->entries[i].offset) {
+        return 0;
+    }
+
+    if (c->depends) {
+        ret = qcow2_cache_flush_dependency(bs, c);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table,
+        s->cluster_size);
+    if (ret < 0) {
+        return ret;
+    }
+
+    c->entries[i].dirty = false;
+
+    return 0;
+}
+
+int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c)
+{
+    int result = 0;
+    int ret;
+    int i;
+
+    for (i = 0; i < c->size; i++) {
+        ret = qcow2_cache_entry_flush(bs, c, i);
+        if (ret < 0 && result != -ENOSPC) {
+            result = ret;
+        }
+    }
+
+    if (result == 0) {
+        ret = bdrv_flush(bs->file);
+        if (ret < 0) {
+            result = ret;
+        }
+    }
+
+    return result;
+}
+
+int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
+    Qcow2Cache *dependency)
+{
+    int ret;
+
+    if (dependency->depends) {
+        ret = qcow2_cache_flush_dependency(bs, dependency);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    if (c->depends && (c->depends != dependency)) {
+        ret = qcow2_cache_flush_dependency(bs, c);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    c->depends = dependency;
+    return 0;
+}
+
+static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c)
+{
+    int i;
+    int min_count = INT_MAX;
+    int min_index = -1;
+
+
+    for (i = 0; i < c->size; i++) {
+        if (c->entries[i].ref) {
+            continue;
+        }
+
+        if (c->entries[i].cache_hits < min_count) {
+            min_index = i;
+            min_count = c->entries[i].cache_hits;
+        }
+
+        /* Give newer hits priority */
+        /* TODO Check how to optimize the replacement strategy */
+        c->entries[i].cache_hits /= 2;
+    }
+
+    if (min_index == -1) {
+        /* This can't happen in current synchronous code, but leave the check
+         * here as a reminder for whoever starts using AIO with the cache */
+        abort();
+    }
+    return min_index;
+}
+
+static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
+    uint64_t offset, void **table, bool read_from_disk)
+{
+    BDRVQcowState *s = bs->opaque;
+    int i;
+    int ret;
+
+    /* Check if the table is already cached */
+    for (i = 0; i < c->size; i++) {
+        if (c->entries[i].offset == offset) {
+            goto found;
+        }
+    }
+
+    /* If not, write a table back and replace it */
+    i = qcow2_cache_find_entry_to_replace(c);
+    if (i < 0) {
+        return i;
+    }
+
+    ret = qcow2_cache_entry_flush(bs, c, i);
+    if (ret < 0) {
+        return ret;
+    }
+
+    c->entries[i].offset = 0;
+    if (read_from_disk) {
+        ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    /* Give the table some hits for the start so that it won't be replaced
+     * immediately. The number 32 is completely arbitrary. */
+    c->entries[i].cache_hits = 32;
+    c->entries[i].offset = offset;
+
+    /* And return the right table */
+found:
+    c->entries[i].cache_hits++;
+    c->entries[i].ref++;
+    *table = c->entries[i].table;
+    return 0;
+}
+
+int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+    void **table)
+{
+    return qcow2_cache_do_get(bs, c, offset, table, true);
+}
+
+int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+    void **table)
+{
+    return qcow2_cache_do_get(bs, c, offset, table, false);
+}
+
+int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
+{
+    int i;
+
+    for (i = 0; i < c->size; i++) {
+        if (c->entries[i].table == *table) {
+            goto found;
+        }
+    }
+    return -ENOENT;
+
+found:
+    c->entries[i].ref--;
+    *table = NULL;
+
+    assert(c->entries[i].ref >= 0);
+
+    if (c->writethrough) {
+        return qcow2_cache_entry_flush(bs, c, i);
+    } else {
+        return 0;
+    }
+}
+
+void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table)
+{
+    int i;
+
+    for (i = 0; i < c->size; i++) {
+        if (c->entries[i].table == table) {
+            goto found;
+        }
+    }
+    abort();
+
+found:
+    c->entries[i].dirty = true;
+}
+
diff --git a/block/qcow2.h b/block/qcow2.h
index 5217bea8a2..e5473e1efe 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -78,6 +78,9 @@ typedef struct QCowSnapshot {
     uint64_t vm_clock_nsec;
 } QCowSnapshot;
 
+struct Qcow2Cache;
+typedef struct Qcow2Cache Qcow2Cache;
+
 typedef struct BDRVQcowState {
     int cluster_bits;
     int cluster_size;
@@ -215,4 +218,20 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name);
 void qcow2_free_snapshots(BlockDriverState *bs);
 int qcow2_read_snapshots(BlockDriverState *bs);
 
+/* qcow2-cache.c functions */
+Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
+    bool writethrough);
+int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c);
+
+void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table);
+int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c);
+int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
+    Qcow2Cache *dependency);
+
+int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+    void **table);
+int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+    void **table);
+int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
+
 #endif

From 29c1a7301af752de6721e031d31faa48887204bd Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 10 Jan 2011 17:17:28 +0100
Subject: [PATCH 10/23] qcow2: Use QcowCache

Use the new functions of qcow2-cache.c for everything that works on refcount
block and L2 tables.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cache.c    |  10 ++
 block/qcow2-cluster.c  | 208 +++++++++++---------------------
 block/qcow2-refcount.c | 266 +++++++++++++++++------------------------
 block/qcow2.c          |  48 +++++++-
 block/qcow2.h          |  12 +-
 5 files changed, 243 insertions(+), 301 deletions(-)

diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index f7c4e2af0d..5f1740b9aa 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -104,6 +104,12 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
         }
     }
 
+    if (c == s->refcount_block_cache) {
+        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
+    } else if (c == s->l2_table_cache) {
+        BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
+    }
+
     ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table,
         s->cluster_size);
     if (ret < 0) {
@@ -218,6 +224,10 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
 
     c->entries[i].offset = 0;
     if (read_from_disk) {
+        if (c == s->l2_table_cache) {
+            BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD);
+        }
+
         ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size);
         if (ret < 0) {
             return ret;
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index c3ef550f7e..76e7e07bdb 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -67,7 +67,11 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
         qemu_free(new_l1_table);
         return new_l1_table_offset;
     }
-    bdrv_flush(bs->file);
+
+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+    if (ret < 0) {
+        return ret;
+    }
 
     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
     for(i = 0; i < s->l1_size; i++)
@@ -98,63 +102,6 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
     return ret;
 }
 
-void qcow2_l2_cache_reset(BlockDriverState *bs)
-{
-    BDRVQcowState *s = bs->opaque;
-
-    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
-    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
-    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
-}
-
-static inline int l2_cache_new_entry(BlockDriverState *bs)
-{
-    BDRVQcowState *s = bs->opaque;
-    uint32_t min_count;
-    int min_index, i;
-
-    /* find a new entry in the least used one */
-    min_index = 0;
-    min_count = 0xffffffff;
-    for(i = 0; i < L2_CACHE_SIZE; i++) {
-        if (s->l2_cache_counts[i] < min_count) {
-            min_count = s->l2_cache_counts[i];
-            min_index = i;
-        }
-    }
-    return min_index;
-}
-
-/*
- * seek_l2_table
- *
- * seek l2_offset in the l2_cache table
- * if not found, return NULL,
- * if found,
- *   increments the l2 cache hit count of the entry,
- *   if counter overflow, divide by two all counters
- *   return the pointer to the l2 cache entry
- *
- */
-
-static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
-{
-    int i, j;
-
-    for(i = 0; i < L2_CACHE_SIZE; i++) {
-        if (l2_offset == s->l2_cache_offsets[i]) {
-            /* increment the hit count */
-            if (++s->l2_cache_counts[i] == 0xffffffff) {
-                for(j = 0; j < L2_CACHE_SIZE; j++) {
-                    s->l2_cache_counts[j] >>= 1;
-                }
-            }
-            return s->l2_cache + (i << s->l2_bits);
-        }
-    }
-    return NULL;
-}
-
 /*
  * l2_load
  *
@@ -169,33 +116,11 @@ static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
     uint64_t **l2_table)
 {
     BDRVQcowState *s = bs->opaque;
-    int min_index;
     int ret;
 
-    /* seek if the table for the given offset is in the cache */
+    ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table);
 
-    *l2_table = seek_l2_table(s, l2_offset);
-    if (*l2_table != NULL) {
-        return 0;
-    }
-
-    /* not found: load a new entry in the least used one */
-
-    min_index = l2_cache_new_entry(bs);
-    *l2_table = s->l2_cache + (min_index << s->l2_bits);
-
-    BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD);
-    ret = bdrv_pread(bs->file, l2_offset, *l2_table,
-        s->l2_size * sizeof(uint64_t));
-    if (ret < 0) {
-        qcow2_l2_cache_reset(bs);
-        return ret;
-    }
-
-    s->l2_cache_offsets[min_index] = l2_offset;
-    s->l2_cache_counts[min_index] = 1;
-
-    return 0;
+    return ret;
 }
 
 /*
@@ -238,7 +163,6 @@ static int write_l1_entry(BlockDriverState *bs, int l1_index)
 static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
 {
     BDRVQcowState *s = bs->opaque;
-    int min_index;
     uint64_t old_l2_offset;
     uint64_t *l2_table;
     int64_t l2_offset;
@@ -252,29 +176,48 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
     if (l2_offset < 0) {
         return l2_offset;
     }
-    bdrv_flush(bs->file);
+
+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+    if (ret < 0) {
+        goto fail;
+    }
 
     /* allocate a new entry in the l2 cache */
 
-    min_index = l2_cache_new_entry(bs);
-    l2_table = s->l2_cache + (min_index << s->l2_bits);
+    ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
+    if (ret < 0) {
+        return ret;
+    }
+
+    l2_table = *table;
 
     if (old_l2_offset == 0) {
         /* if there was no old l2 table, clear the new table */
         memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
     } else {
+        uint64_t* old_table;
+
         /* if there was an old l2 table, read it from the disk */
         BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
-        ret = bdrv_pread(bs->file, old_l2_offset, l2_table,
-            s->l2_size * sizeof(uint64_t));
+        ret = qcow2_cache_get(bs, s->l2_table_cache, old_l2_offset,
+            (void**) &old_table);
+        if (ret < 0) {
+            goto fail;
+        }
+
+        memcpy(l2_table, old_table, s->cluster_size);
+
+        ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table);
         if (ret < 0) {
             goto fail;
         }
     }
+
     /* write the l2 table to the file */
     BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
-    ret = bdrv_pwrite_sync(bs->file, l2_offset, l2_table,
-        s->l2_size * sizeof(uint64_t));
+
+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+    ret = qcow2_cache_flush(bs, s->l2_table_cache);
     if (ret < 0) {
         goto fail;
     }
@@ -286,17 +229,12 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
         goto fail;
     }
 
-    /* update the l2 cache entry */
-
-    s->l2_cache_offsets[min_index] = l2_offset;
-    s->l2_cache_counts[min_index] = 1;
-
     *table = l2_table;
     return 0;
 
 fail:
+    qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
     s->l1_table[l1_index] = old_l2_offset;
-    qcow2_l2_cache_reset(bs);
     return ret;
 }
 
@@ -521,6 +459,8 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
                 &l2_table[l2_index], 0, QCOW_OFLAG_COPIED);
     }
 
+    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+
    nb_available = (c * s->cluster_sectors);
 out:
     if (nb_available > nb_needed)
@@ -575,6 +515,7 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
             return ret;
         }
     } else {
+        /* FIXME Order */
         if (l2_offset)
             qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
         ret = l2_allocate(bs, l1_index, &l2_table);
@@ -632,6 +573,7 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
 
     cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
     if (cluster_offset < 0) {
+        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
         return 0;
     }
 
@@ -646,38 +588,14 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
     /* compressed clusters never have the copied flag */
 
     BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
     l2_table[l2_index] = cpu_to_be64(cluster_offset);
-    if (bdrv_pwrite_sync(bs->file,
-                    l2_offset + l2_index * sizeof(uint64_t),
-                    l2_table + l2_index,
-                    sizeof(uint64_t)) < 0)
-        return 0;
-
-    return cluster_offset;
-}
-
-/*
- * Write L2 table updates to disk, writing whole sectors to avoid a
- * read-modify-write in bdrv_pwrite
- */
-#define L2_ENTRIES_PER_SECTOR (512 / 8)
-static int write_l2_entries(BlockDriverState *bs, uint64_t *l2_table,
-    uint64_t l2_offset, int l2_index, int num)
-{
-    int l2_start_index = l2_index & ~(L1_ENTRIES_PER_SECTOR - 1);
-    int start_offset = (8 * l2_index) & ~511;
-    int end_offset = (8 * (l2_index + num) + 511) & ~511;
-    size_t len = end_offset - start_offset;
-    int ret;
-
-    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
-    ret = bdrv_pwrite(bs->file, l2_offset + start_offset,
-        &l2_table[l2_start_index], len);
+    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
     if (ret < 0) {
-        return ret;
+        return 0;
     }
 
-    return 0;
+    return cluster_offset;
 }
 
 int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
@@ -686,6 +604,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     int i, j = 0, l2_index, ret;
     uint64_t *old_cluster, start_sect, l2_offset, *l2_table;
     uint64_t cluster_offset = m->cluster_offset;
+    bool cow = false;
 
     if (m->nb_clusters == 0)
         return 0;
@@ -695,6 +614,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     /* copy content of unmodified sectors */
     start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
     if (m->n_start) {
+        cow = true;
         ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
         if (ret < 0)
             goto err;
@@ -702,17 +622,30 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
 
     if (m->nb_available & (s->cluster_sectors - 1)) {
         uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
+        cow = true;
         ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
                 m->nb_available - end, s->cluster_sectors);
         if (ret < 0)
             goto err;
     }
 
-    /* update L2 table */
+    /*
+     * Update L2 table.
+     *
+     * Before we update the L2 table to actually point to the new cluster, we
+     * need to be sure that the refcounts have been increased and COW was
+     * handled.
+     */
+    if (cow) {
+        bdrv_flush(bs->file);
+    }
+
+    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
     ret = get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index);
     if (ret < 0) {
         goto err;
     }
+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
 
     for (i = 0; i < m->nb_clusters; i++) {
         /* if two concurrent writes happen to the same unallocated cluster
@@ -728,16 +661,9 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
                     (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
      }
 
-    /*
-     * Before we update the L2 table to actually point to the new cluster, we
-     * need to be sure that the refcounts have been increased and COW was
-     * handled.
-     */
-    bdrv_flush(bs->file);
 
-    ret = write_l2_entries(bs, l2_table, l2_offset, l2_index, m->nb_clusters);
+    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
     if (ret < 0) {
-        qcow2_l2_cache_reset(bs);
         goto err;
     }
 
@@ -746,7 +672,6 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
      * Also flush bs->file to get the right order for L2 and refcount update.
      */
     if (j != 0) {
-        bdrv_flush(bs->file);
         for (i = 0; i < j; i++) {
             qcow2_free_any_clusters(bs,
                 be64_to_cpu(old_cluster[i]) & ~QCOW_OFLAG_COPIED, 1);
@@ -868,7 +793,8 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
                 m->depends_on = old_alloc;
                 m->nb_clusters = 0;
                 *num = 0;
-                return 0;
+                ret = 0;
+                goto fail;
             }
         }
     }
@@ -884,7 +810,8 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
     cluster_offset = qcow2_alloc_clusters(bs, nb_clusters * s->cluster_size);
     if (cluster_offset < 0) {
         QLIST_REMOVE(m, next_in_flight);
-        return cluster_offset;
+        ret = cluster_offset;
+        goto fail;
     }
 
     /* save info needed for meta data update */
@@ -893,12 +820,21 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
     m->nb_clusters = nb_clusters;
 
 out:
+    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+    if (ret < 0) {
+        return ret;
+    }
+
     m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
     m->cluster_offset = cluster_offset;
 
     *num = m->nb_available - n_start;
 
     return 0;
+
+fail:
+    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+    return ret;
 }
 
 static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index a10453c875..e37e2268af 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -32,27 +32,6 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                             int addend);
 
 
-static int cache_refcount_updates = 0;
-
-static int write_refcount_block(BlockDriverState *bs)
-{
-    BDRVQcowState *s = bs->opaque;
-    size_t size = s->cluster_size;
-
-    if (s->refcount_block_cache_offset == 0) {
-        return 0;
-    }
-
-    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE);
-    if (bdrv_pwrite_sync(bs->file, s->refcount_block_cache_offset,
-            s->refcount_block_cache, size) < 0)
-    {
-        return -EIO;
-    }
-
-    return 0;
-}
-
 /*********************************************************/
 /* refcount handling */
 
@@ -61,7 +40,6 @@ int qcow2_refcount_init(BlockDriverState *bs)
     BDRVQcowState *s = bs->opaque;
     int ret, refcount_table_size2, i;
 
-    s->refcount_block_cache = qemu_malloc(s->cluster_size);
     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
     s->refcount_table = qemu_malloc(refcount_table_size2);
     if (s->refcount_table_size > 0) {
@@ -81,34 +59,22 @@ int qcow2_refcount_init(BlockDriverState *bs)
 void qcow2_refcount_close(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
-    qemu_free(s->refcount_block_cache);
     qemu_free(s->refcount_table);
 }
 
 
 static int load_refcount_block(BlockDriverState *bs,
-                               int64_t refcount_block_offset)
+                               int64_t refcount_block_offset,
+                               void **refcount_block)
 {
     BDRVQcowState *s = bs->opaque;
     int ret;
 
-    if (cache_refcount_updates) {
-        ret = write_refcount_block(bs);
-        if (ret < 0) {
-            return ret;
-        }
-    }
-
     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
-    ret = bdrv_pread(bs->file, refcount_block_offset, s->refcount_block_cache,
-                     s->cluster_size);
-    if (ret < 0) {
-        s->refcount_block_cache_offset = 0;
-        return ret;
-    }
+    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
+        refcount_block);
 
-    s->refcount_block_cache_offset = refcount_block_offset;
-    return 0;
+    return ret;
 }
 
 /*
@@ -122,6 +88,8 @@ static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
     int refcount_table_index, block_index;
     int64_t refcount_block_offset;
     int ret;
+    uint16_t *refcount_block;
+    uint16_t refcount;
 
     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
     if (refcount_table_index >= s->refcount_table_size)
@@ -129,16 +97,24 @@ static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
     refcount_block_offset = s->refcount_table[refcount_table_index];
     if (!refcount_block_offset)
         return 0;
-    if (refcount_block_offset != s->refcount_block_cache_offset) {
-        /* better than nothing: return allocated if read error */
-        ret = load_refcount_block(bs, refcount_block_offset);
-        if (ret < 0) {
-            return ret;
-        }
+
+    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
+        (void**) &refcount_block);
+    if (ret < 0) {
+        return ret;
     }
+
     block_index = cluster_index &
         ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
-    return be16_to_cpu(s->refcount_block_cache[block_index]);
+    refcount = be16_to_cpu(refcount_block[block_index]);
+
+    ret = qcow2_cache_put(bs, s->refcount_block_cache,
+        (void**) &refcount_block);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return refcount;
 }
 
 /*
@@ -174,9 +150,10 @@ static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
  * Loads a refcount block. If it doesn't exist yet, it is allocated first
  * (including growing the refcount table if needed).
  *
- * Returns the offset of the refcount block on success or -errno in error case
+ * Returns 0 on success or -errno in error case
  */
-static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
+static int alloc_refcount_block(BlockDriverState *bs,
+    int64_t cluster_index, uint16_t **refcount_block)
 {
     BDRVQcowState *s = bs->opaque;
     unsigned int refcount_table_index;
@@ -194,13 +171,8 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
 
         /* If it's already there, we're done */
         if (refcount_block_offset) {
-            if (refcount_block_offset != s->refcount_block_cache_offset) {
-                ret = load_refcount_block(bs, refcount_block_offset);
-                if (ret < 0) {
-                    return ret;
-                }
-            }
-            return refcount_block_offset;
+             return load_refcount_block(bs, refcount_block_offset,
+                 (void**) refcount_block);
         }
     }
 
@@ -226,12 +198,10 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
      *   refcount block into the cache
      */
 
-    if (cache_refcount_updates) {
-        ret = write_refcount_block(bs);
-        if (ret < 0) {
-            return ret;
-        }
-    }
+    *refcount_block = NULL;
+
+    /* We write to the refcount table, so we might depend on L2 tables */
+    qcow2_cache_flush(bs, s->l2_table_cache);
 
     /* Allocate the refcount block itself and mark it as used */
     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
@@ -247,13 +217,18 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
 
     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
         /* Zero the new refcount block before updating it */
-        memset(s->refcount_block_cache, 0, s->cluster_size);
-        s->refcount_block_cache_offset = new_block;
+        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
+            (void**) refcount_block);
+        if (ret < 0) {
+            goto fail_block;
+        }
+
+        memset(*refcount_block, 0, s->cluster_size);
 
         /* The block describes itself, need to update the cache */
         int block_index = (new_block >> s->cluster_bits) &
             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
-        s->refcount_block_cache[block_index] = cpu_to_be16(1);
+        (*refcount_block)[block_index] = cpu_to_be16(1);
     } else {
         /* Described somewhere else. This can recurse at most twice before we
          * arrive at a block that describes itself. */
@@ -266,14 +241,19 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
 
         /* Initialize the new refcount block only after updating its refcount,
          * update_refcount uses the refcount cache itself */
-        memset(s->refcount_block_cache, 0, s->cluster_size);
-        s->refcount_block_cache_offset = new_block;
+        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
+            (void**) refcount_block);
+        if (ret < 0) {
+            goto fail_block;
+        }
+
+        memset(*refcount_block, 0, s->cluster_size);
     }
 
     /* Now the new refcount block needs to be written to disk */
     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
-    ret = bdrv_pwrite_sync(bs->file, new_block, s->refcount_block_cache,
-        s->cluster_size);
+    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
     if (ret < 0) {
         goto fail_block;
     }
@@ -290,7 +270,12 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
         }
 
         s->refcount_table[refcount_table_index] = new_block;
-        return new_block;
+        return 0;
+    }
+
+    ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
+    if (ret < 0) {
+        goto fail_block;
     }
 
     /*
@@ -410,9 +395,9 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
     qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
     s->free_cluster_index = old_free_cluster_index;
 
-    ret = load_refcount_block(bs, new_block);
+    ret = load_refcount_block(bs, new_block, (void**) refcount_block);
     if (ret < 0) {
-        goto fail_block;
+        return ret;
     }
 
     return new_block;
@@ -420,52 +405,20 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
 fail_table:
     qemu_free(new_table);
 fail_block:
-    s->refcount_block_cache_offset = 0;
+    if (*refcount_block != NULL) {
+        qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
+    }
     return ret;
 }
 
-#define REFCOUNTS_PER_SECTOR (512 >> REFCOUNT_SHIFT)
-static int write_refcount_block_entries(BlockDriverState *bs,
-    int64_t refcount_block_offset, int first_index, int last_index)
-{
-    BDRVQcowState *s = bs->opaque;
-    size_t size;
-    int ret;
-
-    if (cache_refcount_updates) {
-        return 0;
-    }
-
-    if (first_index < 0) {
-        return 0;
-    }
-
-    first_index &= ~(REFCOUNTS_PER_SECTOR - 1);
-    last_index = (last_index + REFCOUNTS_PER_SECTOR)
-        & ~(REFCOUNTS_PER_SECTOR - 1);
-
-    size = (last_index - first_index) << REFCOUNT_SHIFT;
-
-    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
-    ret = bdrv_pwrite(bs->file,
-        refcount_block_offset + (first_index << REFCOUNT_SHIFT),
-        &s->refcount_block_cache[first_index], size);
-    if (ret < 0) {
-        return ret;
-    }
-
-    return 0;
-}
-
 /* XXX: cache several refcount block clusters ? */
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
     int64_t offset, int64_t length, int addend)
 {
     BDRVQcowState *s = bs->opaque;
     int64_t start, last, cluster_offset;
-    int64_t refcount_block_offset = 0;
-    int64_t table_index = -1, old_table_index;
-    int first_index = -1, last_index = -1;
+    uint16_t *refcount_block = NULL;
+    int64_t old_table_index = -1;
     int ret;
 
 #ifdef DEBUG_ALLOC2
@@ -478,6 +431,11 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         return 0;
     }
 
+    if (addend < 0) {
+        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
+            s->l2_table_cache);
+    }
+
     start = offset & ~(s->cluster_size - 1);
     last = (offset + length - 1) & ~(s->cluster_size - 1);
     for(cluster_offset = start; cluster_offset <= last;
@@ -485,42 +443,33 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
     {
         int block_index, refcount;
         int64_t cluster_index = cluster_offset >> s->cluster_bits;
-        int64_t new_block;
-
-        /* Only write refcount block to disk when we are done with it */
-        old_table_index = table_index;
-        table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
-        if ((old_table_index >= 0) && (table_index != old_table_index)) {
-
-            ret = write_refcount_block_entries(bs, refcount_block_offset,
-                first_index, last_index);
-            if (ret < 0) {
-                return ret;
-            }
-
-            first_index = -1;
-            last_index = -1;
-        }
+        int64_t table_index =
+            cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
 
         /* Load the refcount block and allocate it if needed */
-        new_block = alloc_refcount_block(bs, cluster_index);
-        if (new_block < 0) {
-            ret = new_block;
-            goto fail;
+        if (table_index != old_table_index) {
+            if (refcount_block) {
+                ret = qcow2_cache_put(bs, s->refcount_block_cache,
+                    (void**) &refcount_block);
+                if (ret < 0) {
+                    goto fail;
+                }
+            }
+
+            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
+            if (ret < 0) {
+                goto fail;
+            }
         }
-        refcount_block_offset = new_block;
+        old_table_index = table_index;
+
+        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
 
         /* we can update the count and save it */
         block_index = cluster_index &
             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
-        if (first_index == -1 || block_index < first_index) {
-            first_index = block_index;
-        }
-        if (block_index > last_index) {
-            last_index = block_index;
-        }
 
-        refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
+        refcount = be16_to_cpu(refcount_block[block_index]);
         refcount += addend;
         if (refcount < 0 || refcount > 0xffff) {
             ret = -EINVAL;
@@ -529,17 +478,16 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         if (refcount == 0 && cluster_index < s->free_cluster_index) {
             s->free_cluster_index = cluster_index;
         }
-        s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
+        refcount_block[block_index] = cpu_to_be16(refcount);
     }
 
     ret = 0;
 fail:
-
     /* Write last changed block to disk */
-    if (refcount_block_offset != 0) {
+    if (refcount_block) {
         int wret;
-        wret = write_refcount_block_entries(bs, refcount_block_offset,
-            first_index, last_index);
+        wret = qcow2_cache_put(bs, s->refcount_block_cache,
+            (void**) &refcount_block);
         if (wret < 0) {
             return ret < 0 ? ret : wret;
         }
@@ -758,9 +706,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
     int64_t old_offset, old_l2_offset;
     int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;
-
-    qcow2_l2_cache_reset(bs);
-    cache_refcount_updates = 1;
+    int ret;
 
     l2_table = NULL;
     l1_table = NULL;
@@ -784,7 +730,6 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     }
 
     l2_size = s->l2_size * sizeof(uint64_t);
-    l2_table = qemu_malloc(l2_size);
     l1_modified = 0;
     for(i = 0; i < l1_size; i++) {
         l2_offset = l1_table[i];
@@ -792,8 +737,13 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
             old_l2_offset = l2_offset;
             l2_offset &= ~QCOW_OFLAG_COPIED;
             l2_modified = 0;
-            if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
+
+            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
+                (void**) &l2_table);
+            if (ret < 0) {
                 goto fail;
+            }
+
             for(j = 0; j < s->l2_size; j++) {
                 offset = be64_to_cpu(l2_table[j]);
                 if (offset != 0) {
@@ -833,17 +783,23 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                         offset |= QCOW_OFLAG_COPIED;
                     }
                     if (offset != old_offset) {
+                        if (addend > 0) {
+                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
+                                s->refcount_block_cache);
+                        }
                         l2_table[j] = cpu_to_be64(offset);
                         l2_modified = 1;
+                        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
                     }
                 }
             }
-            if (l2_modified) {
-                if (bdrv_pwrite_sync(bs->file,
-                                l2_offset, l2_table, l2_size) < 0)
-                    goto fail;
+
+            ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+            if (ret < 0) {
+                goto fail;
             }
 
+
             if (addend != 0) {
                 refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
             } else {
@@ -871,16 +827,14 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     }
     if (l1_allocated)
         qemu_free(l1_table);
-    qemu_free(l2_table);
-    cache_refcount_updates = 0;
-    write_refcount_block(bs);
     return 0;
  fail:
+    if (l2_table) {
+        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+    }
+
     if (l1_allocated)
         qemu_free(l1_table);
-    qemu_free(l2_table);
-    cache_refcount_updates = 0;
-    write_refcount_block(bs);
     return -EIO;
 }
 
diff --git a/block/qcow2.c b/block/qcow2.c
index b6b094c797..49bf7b9d3d 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -143,6 +143,7 @@ static int qcow2_open(BlockDriverState *bs, int flags)
     int len, i, ret = 0;
     QCowHeader header;
     uint64_t ext_end;
+    bool writethrough;
 
     ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
     if (ret < 0) {
@@ -217,8 +218,13 @@ static int qcow2_open(BlockDriverState *bs, int flags)
             be64_to_cpus(&s->l1_table[i]);
         }
     }
-    /* alloc L2 cache */
-    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+
+    /* alloc L2 table/refcount block cache */
+    writethrough = ((flags & BDRV_O_CACHE_MASK) == 0);
+    s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE, writethrough);
+    s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE,
+        writethrough);
+
     s->cluster_cache = qemu_malloc(s->cluster_size);
     /* one more sector for decompressed data alignment */
     s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
@@ -270,7 +276,9 @@ static int qcow2_open(BlockDriverState *bs, int flags)
     qcow2_free_snapshots(bs);
     qcow2_refcount_close(bs);
     qemu_free(s->l1_table);
-    qemu_free(s->l2_cache);
+    if (s->l2_table_cache) {
+        qcow2_cache_destroy(bs, s->l2_table_cache);
+    }
     qemu_free(s->cluster_cache);
     qemu_free(s->cluster_data);
     return ret;
@@ -719,7 +727,13 @@ static void qcow2_close(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
     qemu_free(s->l1_table);
-    qemu_free(s->l2_cache);
+
+    qcow2_cache_flush(bs, s->l2_table_cache);
+    qcow2_cache_flush(bs, s->refcount_block_cache);
+
+    qcow2_cache_destroy(bs, s->l2_table_cache);
+    qcow2_cache_destroy(bs, s->refcount_block_cache);
+
     qemu_free(s->cluster_cache);
     qemu_free(s->cluster_data);
     qcow2_refcount_close(bs);
@@ -1179,6 +1193,19 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
 
 static int qcow2_flush(BlockDriverState *bs)
 {
+    BDRVQcowState *s = bs->opaque;
+    int ret;
+
+    ret = qcow2_cache_flush(bs, s->l2_table_cache);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+    if (ret < 0) {
+        return ret;
+    }
+
     return bdrv_flush(bs->file);
 }
 
@@ -1186,6 +1213,19 @@ static BlockDriverAIOCB *qcow2_aio_flush(BlockDriverState *bs,
                                          BlockDriverCompletionFunc *cb,
                                          void *opaque)
 {
+    BDRVQcowState *s = bs->opaque;
+    int ret;
+
+    ret = qcow2_cache_flush(bs, s->l2_table_cache);
+    if (ret < 0) {
+        return NULL;
+    }
+
+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+    if (ret < 0) {
+        return NULL;
+    }
+
     return bdrv_aio_flush(bs->file, cb, opaque);
 }
 
diff --git a/block/qcow2.h b/block/qcow2.h
index e5473e1efe..11cbce337d 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -51,6 +51,9 @@
 
 #define L2_CACHE_SIZE 16
 
+/* Must be at least 4 to cover all cases of refcount table growth */
+#define REFCOUNT_CACHE_SIZE 4
+
 typedef struct QCowHeader {
     uint32_t magic;
     uint32_t version;
@@ -94,9 +97,10 @@ typedef struct BDRVQcowState {
     uint64_t cluster_offset_mask;
     uint64_t l1_table_offset;
     uint64_t *l1_table;
-    uint64_t *l2_cache;
-    uint64_t l2_cache_offsets[L2_CACHE_SIZE];
-    uint32_t l2_cache_counts[L2_CACHE_SIZE];
+
+    Qcow2Cache* l2_table_cache;
+    Qcow2Cache* refcount_block_cache;
+
     uint8_t *cluster_cache;
     uint8_t *cluster_data;
     uint64_t cluster_cache_offset;
@@ -105,8 +109,6 @@ typedef struct BDRVQcowState {
     uint64_t *refcount_table;
     uint64_t refcount_table_offset;
     uint32_t refcount_table_size;
-    uint64_t refcount_block_cache_offset;
-    uint16_t *refcount_block_cache;
     int64_t free_cluster_index;
     int64_t free_byte_offset;
 

From 3de0a2944bdb3047dce275560631834bcb4afe22 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 14 Jan 2011 15:55:38 +0100
Subject: [PATCH 11/23] qcow2: Batch flushes for COW

qcow2 calls bdrv_flush() after performing COW in order to ensure that the
L2 table change is never written before the copy is safe on disk. Now that the
L2 table is cached, we can wait with flushing until we write out the next L2
table.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cache.c   | 20 +++++++++++++++++---
 block/qcow2-cluster.c |  2 +-
 block/qcow2.h         |  1 +
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index 5f1740b9aa..8f2955b372 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -38,6 +38,7 @@ struct Qcow2Cache {
     int                     size;
     Qcow2CachedTable*       entries;
     struct Qcow2Cache*      depends;
+    bool                    depends_on_flush;
     bool                    writethrough;
 };
 
@@ -85,13 +86,15 @@ static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c)
     }
 
     c->depends = NULL;
+    c->depends_on_flush = false;
+
     return 0;
 }
 
 static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
 {
     BDRVQcowState *s = bs->opaque;
-    int ret;
+    int ret = 0;
 
     if (!c->entries[i].dirty || !c->entries[i].offset) {
         return 0;
@@ -99,11 +102,17 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
 
     if (c->depends) {
         ret = qcow2_cache_flush_dependency(bs, c);
-        if (ret < 0) {
-            return ret;
+    } else if (c->depends_on_flush) {
+        ret = bdrv_flush(bs->file);
+        if (ret >= 0) {
+            c->depends_on_flush = false;
         }
     }
 
+    if (ret < 0) {
+        return ret;
+    }
+
     if (c == s->refcount_block_cache) {
         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
     } else if (c == s->l2_table_cache) {
@@ -167,6 +176,11 @@ int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
     return 0;
 }
 
+void qcow2_cache_depends_on_flush(Qcow2Cache *c)
+{
+    c->depends_on_flush = true;
+}
+
 static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c)
 {
     int i;
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 76e7e07bdb..1c2003a8a4 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -637,7 +637,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
      * handled.
      */
     if (cow) {
-        bdrv_flush(bs->file);
+        qcow2_cache_depends_on_flush(s->l2_table_cache);
     }
 
     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
diff --git a/block/qcow2.h b/block/qcow2.h
index 11cbce337d..6d801206e2 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -229,6 +229,7 @@ void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table);
 int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c);
 int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
     Qcow2Cache *dependency);
+void qcow2_cache_depends_on_flush(Qcow2Cache *c);
 
 int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
     void **table);

From 1635eecc413ed680013cf77e6994901cafe15590 Mon Sep 17 00:00:00 2001
From: Stefan Weil <weil@mail.berlios.de>
Date: Sat, 15 Jan 2011 19:01:03 +0100
Subject: [PATCH 12/23] ide: Remove unneeded null pointer check

With bm == NULL, other code in the same function would crash.

This bug was reported by cppcheck:
hw/ide/pci.c:280: error: Possible null pointer dereference: bm

Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Stefan Weil <weil@mail.berlios.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/ide/pci.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hw/ide/pci.c b/hw/ide/pci.c
index 987caffa3a..35168cb469 100644
--- a/hw/ide/pci.c
+++ b/hw/ide/pci.c
@@ -267,9 +267,7 @@ static void bmdma_irq(void *opaque, int n, int level)
         return;
     }
 
-    if (bm) {
-        bm->status |= BM_STATUS_INT;
-    }
+    bm->status |= BM_STATUS_INT;
 
     /* trigger the real irq */
     qemu_set_irq(bm->irq, level);

From e61846908efd5ea0916d157a82adabf0674a01b0 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 17 Jan 2011 15:35:28 +0100
Subject: [PATCH 13/23] Documentation: Add qemu-img check/rebase

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-img.texi | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/qemu-img.texi b/qemu-img.texi
index 1b90ddbcfc..ced64a40ed 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -59,6 +59,13 @@ lists all snapshots in the given image
 Command description:
 
 @table @option
+@item check [-f @var{fmt}] @var{filename}
+
+Perform a consistency check on the disk image @var{filename}.
+
+Only the formats @code{qcow2}, @code{qed} and @code{vdi} support
+consistency checks.
+
 @item create [-f @var{fmt}] [-o @var{options}] @var{filename} [@var{size}]
 
 Create the new disk image @var{filename} of size @var{size} and format
@@ -107,6 +114,40 @@ they are displayed too.
 
 List, apply, create or delete snapshots in image @var{filename}.
 
+@item rebase [-f @var{fmt}] [-u] -b @var{backing_file} [-F @var{backing_fmt}] @var{filename}
+
+Changes the backing file of an image. Only the formats @code{qcow2} and
+@code{qed} support changing the backing file.
+
+The backing file is changed to @var{backing_file} and (if the image format of
+@var{filename} supports this) the backing file format is changed to
+@var{backing_fmt}.
+
+There are two different modes in which @code{rebase} can operate:
+@table @option
+@item Safe mode
+This is the default mode and performs a real rebase operation. The new backing
+file may differ from the old one and qemu-img rebase will take care of keeping
+the guest-visible content of @var{filename} unchanged.
+
+In order to achieve this, any clusters that differ between @var{backing_file}
+and the old backing file of @var{filename} are merged into @var{filename}
+before actually changing the backing file.
+
+Note that the safe mode is an expensive operation, comparable to converting
+an image. It only works if the old backing file still exists.
+
+@item Unsafe mode
+qemu-img uses the unsafe mode if @code{-u} is specified. In this mode, only the
+backing file name and format of @var{filename} is changed without any checks
+on the file contents. The user must take care of specifying the correct new
+backing file, or the guest-visible content of the image will be corrupted.
+
+This mode is useful for renaming or moving the backing file to somewhere else.
+It can be used without an accessible old backing file, i.e. you can use it to
+fix an image whose backing file has already been moved/renamed.
+@end table
+
 @item resize @var{filename} [+ | -]@var{size}
 
 Change the disk image as if it had been created with @var{size}.

From c743849bee7333c7ef256b7e12e34ed6f907064f Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Fri, 14 Jan 2011 22:44:33 +0000
Subject: [PATCH 14/23] qed: Refuse to create images on block devices

QED relies on the underlying filesystem to extend the file and maintain
its size.  Check that images are not created on a block device.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qed.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/block/qed.c b/block/qed.c
index 085c4f2210..a46f9ef419 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -469,6 +469,12 @@ static int qed_create(const char *filename, uint32_t cluster_size,
         return ret;
     }
 
+    /* File must start empty and grow, check truncate is supported */
+    ret = bdrv_truncate(bs, 0);
+    if (ret < 0) {
+        goto out;
+    }
+
     if (backing_file) {
         header.features |= QED_F_BACKING_FILE;
         header.backing_filename_offset = sizeof(le_header);

From 77358b59f6f3ef571fb2262f5f6216e179d07ecb Mon Sep 17 00:00:00 2001
From: Pierre Riteau <Pierre.Riteau@irisa.fr>
Date: Fri, 21 Jan 2011 12:42:30 +0100
Subject: [PATCH 15/23] Fix block migration when the device size is not a
 multiple of 1 MB

b02bea3a85cc939f09aa674a3f1e4f36d418c007 added a check on the return
value of bdrv_write and aborts migration when it fails. However, if the
size of the block device to migrate is not a multiple of BLOCK_SIZE
(currently 1 MB), the last bdrv_write will fail with -EIO.

Fixed by calling bdrv_write with the correct size of the last block.

Signed-off-by: Pierre Riteau <Pierre.Riteau@irisa.fr>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block-migration.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/block-migration.c b/block-migration.c
index 60b9fc0ef0..c9d3e81dbf 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -638,8 +638,10 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
     int len, flags;
     char device_name[256];
     int64_t addr;
-    BlockDriverState *bs;
+    BlockDriverState *bs, *bs_prev = NULL;
     uint8_t *buf;
+    int64_t total_sectors = 0;
+    int nr_sectors;
 
     do {
         addr = qemu_get_be64(f);
@@ -661,10 +663,26 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
                 return -EINVAL;
             }
 
+            if (bs != bs_prev) {
+                bs_prev = bs;
+                total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+                if (total_sectors <= 0) {
+                    error_report("Error getting length of block device %s\n",
+                                 device_name);
+                    return -EINVAL;
+                }
+            }
+
+            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
+                nr_sectors = total_sectors - addr;
+            } else {
+                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
+            }
+
             buf = qemu_malloc(BLOCK_SIZE);
 
             qemu_get_buffer(f, buf, BLOCK_SIZE);
-            ret = bdrv_write(bs, addr, buf, BDRV_SECTORS_PER_DIRTY_CHUNK);
+            ret = bdrv_write(bs, addr, buf, nr_sectors);
 
             qemu_free(buf);
             if (ret < 0) {

From 483848540557aef6af08adbe3ef8201b961220d5 Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Mon, 17 Jan 2011 19:31:26 +0100
Subject: [PATCH 16/23] blockdev: Fix error message for invalid -drive CHS

When cyls, heads or secs are out of range, the error message prints
buf, which points to the value of option "if".  Bogus, may even be
null.  Drop that.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 662f7a9d86..28c051ba89 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -224,15 +224,15 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi, int *fatal_error)
 
     if (cyls || heads || secs) {
         if (cyls < 1 || (type == IF_IDE && cyls > 16383)) {
-            fprintf(stderr, "qemu: '%s' invalid physical cyls number\n", buf);
+            fprintf(stderr, "qemu: invalid physical cyls number\n");
 	    return NULL;
 	}
         if (heads < 1 || (type == IF_IDE && heads > 16)) {
-            fprintf(stderr, "qemu: '%s' invalid physical heads number\n", buf);
+            fprintf(stderr, "qemu: invalid physical heads number\n");
 	    return NULL;
 	}
         if (secs < 1 || (type == IF_IDE && secs > 63)) {
-            fprintf(stderr, "qemu: '%s' invalid physical secs number\n", buf);
+            fprintf(stderr, "qemu: invalid physical secs number\n");
 	    return NULL;
 	}
     }

From 807105a775323be9b45ef1e965c5aad85a5ec281 Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Mon, 17 Jan 2011 19:31:27 +0100
Subject: [PATCH 17/23] blockdev: Make drive_init() use error_report()

This makes the errors point to the error location, and fixes drive_add
to report errors in the monitor instead of stderr.

While there, tweak a few error messages for consistency.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c | 59 ++++++++++++++++++++++++++----------------------------
 1 file changed, 28 insertions(+), 31 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 28c051ba89..0621390ea6 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -107,7 +107,7 @@ DriveInfo *drive_get_by_blockdev(BlockDriverState *bs)
 
 static void bdrv_format_print(void *opaque, const char *name)
 {
-    fprintf(stderr, " %s", name);
+    error_printf(" %s", name);
 }
 
 void drive_uninit(DriveInfo *dinfo)
@@ -129,8 +129,8 @@ static int parse_block_error_action(const char *buf, int is_read)
     } else if (!strcmp(buf, "report")) {
         return BLOCK_ERR_REPORT;
     } else {
-        fprintf(stderr, "qemu: '%s' invalid %s error action\n",
-            buf, is_read ? "read" : "write");
+        error_report("'%s' invalid %s error action",
+                     buf, is_read ? "read" : "write");
         return -1;
     }
 }
@@ -217,31 +217,30 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi, int *fatal_error)
 	    type = IF_NONE;
             max_devs = 0;
 	} else {
-            fprintf(stderr, "qemu: unsupported bus type '%s'\n", buf);
+            error_report("unsupported bus type '%s'", buf);
             return NULL;
 	}
     }
 
     if (cyls || heads || secs) {
         if (cyls < 1 || (type == IF_IDE && cyls > 16383)) {
-            fprintf(stderr, "qemu: invalid physical cyls number\n");
+            error_report("invalid physical cyls number");
 	    return NULL;
 	}
         if (heads < 1 || (type == IF_IDE && heads > 16)) {
-            fprintf(stderr, "qemu: invalid physical heads number\n");
+            error_report("invalid physical heads number");
 	    return NULL;
 	}
         if (secs < 1 || (type == IF_IDE && secs > 63)) {
-            fprintf(stderr, "qemu: invalid physical secs number\n");
+            error_report("invalid physical secs number");
 	    return NULL;
 	}
     }
 
     if ((buf = qemu_opt_get(opts, "trans")) != NULL) {
         if (!cyls) {
-            fprintf(stderr,
-                    "qemu: '%s' trans must be used with cyls,heads and secs\n",
-                    buf);
+            error_report("'%s' trans must be used with cyls,heads and secs",
+                         buf);
             return NULL;
         }
         if (!strcmp(buf, "none"))
@@ -251,7 +250,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi, int *fatal_error)
         else if (!strcmp(buf, "auto"))
             translation = BIOS_ATA_TRANSLATION_AUTO;
 	else {
-            fprintf(stderr, "qemu: '%s' invalid translation type\n", buf);
+            error_report("'%s' invalid translation type", buf);
 	    return NULL;
 	}
     }
@@ -261,13 +260,12 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi, int *fatal_error)
 	    media = MEDIA_DISK;
 	} else if (!strcmp(buf, "cdrom")) {
             if (cyls || secs || heads) {
-                fprintf(stderr,
-                        "qemu: '%s' invalid physical CHS format\n", buf);
+                error_report("'%s' invalid physical CHS format", buf);
 	        return NULL;
             }
 	    media = MEDIA_CDROM;
 	} else {
-	    fprintf(stderr, "qemu: '%s' invalid media\n", buf);
+	    error_report("'%s' invalid media", buf);
 	    return NULL;
 	}
     }
@@ -283,7 +281,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi, int *fatal_error)
         } else if (!strcmp(buf, "writethrough")) {
             /* this is the default */
         } else {
-           fprintf(stderr, "qemu: invalid cache option\n");
+           error_report("invalid cache option");
            return NULL;
         }
     }
@@ -295,7 +293,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi, int *fatal_error)
         } else if (!strcmp(buf, "threads")) {
             /* this is the default */
         } else {
-           fprintf(stderr, "qemu: invalid aio option\n");
+           error_report("invalid aio option");
            return NULL;
         }
     }
@@ -303,14 +301,14 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi, int *fatal_error)
 
     if ((buf = qemu_opt_get(opts, "format")) != NULL) {
        if (strcmp(buf, "?") == 0) {
-            fprintf(stderr, "qemu: Supported formats:");
-            bdrv_iterate_format(bdrv_format_print, NULL);
-            fprintf(stderr, "\n");
-	    return NULL;
+           error_printf("Supported formats:");
+           bdrv_iterate_format(bdrv_format_print, NULL);
+           error_printf("\n");
+           return NULL;
         }
         drv = bdrv_find_whitelisted_format(buf);
         if (!drv) {
-            fprintf(stderr, "qemu: '%s' invalid format\n", buf);
+            error_report("'%s' invalid format", buf);
             return NULL;
         }
     }
@@ -318,7 +316,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi, int *fatal_error)
     on_write_error = BLOCK_ERR_STOP_ENOSPC;
     if ((buf = qemu_opt_get(opts, "werror")) != NULL) {
         if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && type != IF_NONE) {
-            fprintf(stderr, "werror is not supported by this format\n");
+            error_report("werror is not supported by this bus type");
             return NULL;
         }
 
@@ -331,7 +329,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi, int *fatal_error)
     on_read_error = BLOCK_ERR_REPORT;
     if ((buf = qemu_opt_get(opts, "rerror")) != NULL) {
         if (type != IF_IDE && type != IF_VIRTIO && type != IF_SCSI && type != IF_NONE) {
-            fprintf(stderr, "rerror is not supported by this format\n");
+            error_report("rerror is not supported by this bus type");
             return NULL;
         }
 
@@ -343,7 +341,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi, int *fatal_error)
 
     if ((devaddr = qemu_opt_get(opts, "addr")) != NULL) {
         if (type != IF_VIRTIO) {
-            fprintf(stderr, "addr is not supported\n");
+            error_report("addr is not supported by this bus type");
             return NULL;
         }
     }
@@ -352,8 +350,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi, int *fatal_error)
 
     if (index != -1) {
         if (bus_id != 0 || unit_id != -1) {
-            fprintf(stderr,
-                    "qemu: index cannot be used with bus and unit\n");
+            error_report("index cannot be used with bus and unit");
             return NULL;
         }
         if (max_devs == 0)
@@ -384,8 +381,8 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi, int *fatal_error)
     /* check unit id */
 
     if (max_devs && unit_id >= max_devs) {
-        fprintf(stderr, "qemu: unit %d too big (max is %d)\n",
-                unit_id, max_devs - 1);
+        error_report("unit %d too big (max is %d)",
+                     unit_id, max_devs - 1);
         return NULL;
     }
 
@@ -479,7 +476,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi, int *fatal_error)
         ro = 1;
     } else if (ro == 1) {
         if (type != IF_SCSI && type != IF_VIRTIO && type != IF_FLOPPY && type != IF_NONE) {
-            fprintf(stderr, "qemu: readonly flag not supported for drive with this interface\n");
+            error_report("readonly not supported by this bus type");
             return NULL;
         }
     }
@@ -488,8 +485,8 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi, int *fatal_error)
 
     ret = bdrv_open(dinfo->bdrv, file, bdrv_flags, drv);
     if (ret < 0) {
-        fprintf(stderr, "qemu: could not open disk image %s: %s\n",
-                        file, strerror(-ret));
+        error_report("could not open disk image %s: %s",
+                     file, strerror(-ret));
         return NULL;
     }
 

From 850ec1133bf0f78ff19402cfd5d77eea376599a9 Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Mon, 17 Jan 2011 19:31:29 +0100
Subject: [PATCH 18/23] blockdev: Fix drive_del not to crash when drive is not
 in use

Watch this:

    (qemu) drive_add 0 if=none,file=tmp.img
    OK
    (qemu) info block
    none0: type=hd removable=0 file=tmp.img ro=0 drv=raw encrypted=0
    (qemu) drive_del none0
    Segmentation fault (core dumped)

do_drive_del()'s code to clean up the pointer from a qdev using the
drive back to the drive needs to check whether such a device exists.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 0621390ea6..f7f591fe78 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -687,13 +687,15 @@ int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
 
     /* clean up guest state from pointing to host resource by
      * finding and removing DeviceState "drive" property */
-    for (prop = bs->peer->info->props; prop && prop->name; prop++) {
-        if (prop->info->type == PROP_TYPE_DRIVE) {
-            ptr = qdev_get_prop_ptr(bs->peer, prop);
-            if ((*ptr) == bs) {
-                bdrv_detach(bs, bs->peer);
-                *ptr = NULL;
-                break;
+    if (bs->peer) {
+        for (prop = bs->peer->info->props; prop && prop->name; prop++) {
+            if (prop->info->type == PROP_TYPE_DRIVE) {
+                ptr = qdev_get_prop_ptr(bs->peer, prop);
+                if (*ptr == bs) {
+                    bdrv_detach(bs, bs->peer);
+                    *ptr = NULL;
+                    break;
+                }
             }
         }
     }

From 96df67d1c3928704cd76d0b2e76372ef18658e85 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Mon, 24 Jan 2011 09:32:20 +0000
Subject: [PATCH 19/23] block: Use backing format driver during image creation

The backing format should be honored during image creation.  For some
reason we currently use the image format to open the backing file.  This
fails when the backing file has a different format than the image being
created.  Keep the image and backing format drivers completely separate.

Also print the backing filename if there is an error opening the backing
file instead of the image filename.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Acked-by: Jes Sorensen <Jes.Sorensen@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index ff2795b1e9..7ad3ddfea5 100644
--- a/block.c
+++ b/block.c
@@ -2778,6 +2778,7 @@ int bdrv_img_create(const char *filename, const char *fmt,
     QEMUOptionParameter *backing_fmt, *backing_file;
     BlockDriverState *bs = NULL;
     BlockDriver *drv, *proto_drv;
+    BlockDriver *backing_drv = NULL;
     int ret = 0;
 
     /* Find driver and parse its options */
@@ -2846,7 +2847,8 @@ int bdrv_img_create(const char *filename, const char *fmt,
 
     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
     if (backing_fmt && backing_fmt->value.s) {
-        if (!bdrv_find_format(backing_fmt->value.s)) {
+        backing_drv = bdrv_find_format(backing_fmt->value.s);
+        if (!backing_drv) {
             error_report("Unknown backing file format '%s'",
                          backing_fmt->value.s);
             ret = -EINVAL;
@@ -2863,9 +2865,9 @@ int bdrv_img_create(const char *filename, const char *fmt,
 
             bs = bdrv_new("");
 
-            ret = bdrv_open(bs, backing_file->value.s, flags, drv);
+            ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
             if (ret < 0) {
-                error_report("Could not open '%s'", filename);
+                error_report("Could not open '%s'", backing_file->value.s);
                 goto out;
             }
             bdrv_get_geometry(bs, &size);

From 419e691f8ef16635e73d814ef3ab825272208ba8 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Mon, 24 Jan 2011 15:34:58 +0000
Subject: [PATCH 20/23] scsi-disk: Allow overriding SCSI INQUIRY removable bit

Provide the "removable" qdev property bit to override the SCSI INQUIRY
removable (RMB) bit for non-CDROM devices.  This will be used by USB
Mass Storage Devices, which sometimes have this guest-visible bit set
and sometimes do not.  They therefore requires a means for user
configuration.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/scsi-disk.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 6cb317c8f9..488eedd2cd 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -72,6 +72,7 @@ struct SCSIDiskState
     /* The qemu block layer uses a fixed 512 byte sector size.
        This is the number of 512 byte blocks in a single scsi sector.  */
     int cluster_size;
+    uint32_t removable;
     uint64_t max_lba;
     QEMUBH *bh;
     char *version;
@@ -552,6 +553,7 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
         memcpy(&outbuf[16], "QEMU CD-ROM     ", 16);
     } else {
         outbuf[0] = 0;
+        outbuf[1] = s->removable ? 0x80 : 0;
         memcpy(&outbuf[16], "QEMU HARDDISK   ", 16);
     }
     memcpy(&outbuf[8], "QEMU    ", 8);
@@ -1295,6 +1297,7 @@ static SCSIDeviceInfo scsi_disk_info = {
         DEFINE_BLOCK_PROPERTIES(SCSIDiskState, qdev.conf),
         DEFINE_PROP_STRING("ver",  SCSIDiskState, version),
         DEFINE_PROP_STRING("serial",  SCSIDiskState, serial),
+        DEFINE_PROP_BIT("removable", SCSIDiskState, removable, 0, false),
         DEFINE_PROP_END_OF_LIST(),
     },
 };

From 2d1fd2613769d99e5fad1f57ab8466434e2079fd Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Mon, 24 Jan 2011 15:34:59 +0000
Subject: [PATCH 21/23] scsi: Allow scsi_bus_legacy_add_drive() to set
 removable bit

scsi-disk devices may wish to override the removable bit.  Add support
for a qdev property on SCSI devices.  This is will be used by usb-msd.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/pci-hotplug.c | 2 +-
 hw/scsi-bus.c    | 8 ++++++--
 hw/scsi.h        | 3 ++-
 hw/usb-msd.c     | 2 +-
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/hw/pci-hotplug.c b/hw/pci-hotplug.c
index 716133c376..270a9821e5 100644
--- a/hw/pci-hotplug.c
+++ b/hw/pci-hotplug.c
@@ -90,7 +90,7 @@ static int scsi_hot_add(Monitor *mon, DeviceState *adapter,
      * specified).
      */
     dinfo->unit = qemu_opt_get_number(dinfo->opts, "unit", -1);
-    scsidev = scsi_bus_legacy_add_drive(scsibus, dinfo->bdrv, dinfo->unit);
+    scsidev = scsi_bus_legacy_add_drive(scsibus, dinfo->bdrv, dinfo->unit, false);
     if (!scsidev) {
         return -1;
     }
diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c
index 7febb86e77..ceeb4ecb91 100644
--- a/hw/scsi-bus.c
+++ b/hw/scsi-bus.c
@@ -87,7 +87,8 @@ void scsi_qdev_register(SCSIDeviceInfo *info)
 }
 
 /* handle legacy '-drive if=scsi,...' cmd line args */
-SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockDriverState *bdrv, int unit)
+SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockDriverState *bdrv,
+                                      int unit, bool removable)
 {
     const char *driver;
     DeviceState *dev;
@@ -95,6 +96,9 @@ SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockDriverState *bdrv, int
     driver = bdrv_is_sg(bdrv) ? "scsi-generic" : "scsi-disk";
     dev = qdev_create(&bus->qbus, driver);
     qdev_prop_set_uint32(dev, "scsi-id", unit);
+    if (qdev_prop_exists(dev, "removable")) {
+        qdev_prop_set_bit(dev, "removable", removable);
+    }
     if (qdev_prop_set_drive(dev, "drive", bdrv) < 0) {
         qdev_free(dev);
         return NULL;
@@ -117,7 +121,7 @@ int scsi_bus_legacy_handle_cmdline(SCSIBus *bus)
             continue;
         }
         qemu_opts_loc_restore(dinfo->opts);
-        if (!scsi_bus_legacy_add_drive(bus, dinfo->bdrv, unit)) {
+        if (!scsi_bus_legacy_add_drive(bus, dinfo->bdrv, unit, false)) {
             res = -1;
             break;
         }
diff --git a/hw/scsi.h b/hw/scsi.h
index bf02adfbe2..846fbba893 100644
--- a/hw/scsi.h
+++ b/hw/scsi.h
@@ -94,7 +94,8 @@ static inline SCSIBus *scsi_bus_from_device(SCSIDevice *d)
     return DO_UPCAST(SCSIBus, qbus, d->qdev.parent_bus);
 }
 
-SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockDriverState *bdrv, int unit);
+SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockDriverState *bdrv,
+                                      int unit, bool removable);
 int scsi_bus_legacy_handle_cmdline(SCSIBus *bus);
 
 SCSIRequest *scsi_req_alloc(size_t size, SCSIDevice *d, uint32_t tag, uint32_t lun);
diff --git a/hw/usb-msd.c b/hw/usb-msd.c
index 729d96ccc3..ccdf8ffc4b 100644
--- a/hw/usb-msd.c
+++ b/hw/usb-msd.c
@@ -515,7 +515,7 @@ static int usb_msd_initfn(USBDevice *dev)
 
     usb_desc_init(dev);
     scsi_bus_new(&s->bus, &s->dev.qdev, 0, 1, usb_msd_command_complete);
-    s->scsi_dev = scsi_bus_legacy_add_drive(&s->bus, bs, 0);
+    s->scsi_dev = scsi_bus_legacy_add_drive(&s->bus, bs, 0, false);
     if (!s->scsi_dev) {
         return -1;
     }

From 6bb7b86722c6cc772f66cdc7923dec56e8458c29 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Mon, 24 Jan 2011 15:35:00 +0000
Subject: [PATCH 22/23] usb-msd: Propagate removable bit to SCSI device

USB Mass Storage Devices sometimes have the RMB (removable) bit set in
the SCSI INQUIRY response.  Thumbdrives tend to have the bit set whereas
hard disks do not.

Operating systems differentiate between removable devices and fixed
devices.  Under Linux, the anaconda installer looks for removable
devices.  Under Windows, only fixed devices may have more than one
partition and AutoRun is also affected by the removable bit.

For these reasons, allow USB Mass Storage Devices to override the
removable bit:

qemu -usb
     -drive if=none,file=test.img,cache=none,id=disk0
     -device usb-storage,drive=disk0,removable=on

The default is off.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/usb-msd.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/usb-msd.c b/hw/usb-msd.c
index ccdf8ffc4b..11722c7486 100644
--- a/hw/usb-msd.c
+++ b/hw/usb-msd.c
@@ -51,6 +51,7 @@ typedef struct {
     SCSIBus bus;
     BlockConf conf;
     SCSIDevice *scsi_dev;
+    uint32_t removable;
     int result;
     /* For async completion.  */
     USBPacket *packet;
@@ -515,7 +516,7 @@ static int usb_msd_initfn(USBDevice *dev)
 
     usb_desc_init(dev);
     scsi_bus_new(&s->bus, &s->dev.qdev, 0, 1, usb_msd_command_complete);
-    s->scsi_dev = scsi_bus_legacy_add_drive(&s->bus, bs, 0, false);
+    s->scsi_dev = scsi_bus_legacy_add_drive(&s->bus, bs, 0, !!s->removable);
     if (!s->scsi_dev) {
         return -1;
     }
@@ -607,6 +608,7 @@ static struct USBDeviceInfo msd_info = {
     .usbdevice_init = usb_msd_init,
     .qdev.props     = (Property[]) {
         DEFINE_BLOCK_PROPERTIES(MSDState, conf),
+        DEFINE_PROP_BIT("removable", MSDState, removable, 0, false),
         DEFINE_PROP_END_OF_LIST(),
     },
 };

From a5c062edd272a222179c2bbf54c539c992aefc93 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Mon, 24 Jan 2011 15:35:01 +0000
Subject: [PATCH 23/23] docs: Document scsi-disk and usb-storage removable
 parameter

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 docs/qdev-device-use.txt | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/docs/qdev-device-use.txt b/docs/qdev-device-use.txt
index f2f9b757a5..4bb2be8850 100644
--- a/docs/qdev-device-use.txt
+++ b/docs/qdev-device-use.txt
@@ -80,7 +80,11 @@ The -device argument differs in detail for each kind of drive:
   This SCSI controller a single SCSI bus, named ID.0.  Put a disk on
   it:
 
-  -device scsi-disk,drive=DRIVE-ID,bus=ID.0,scsi-id=SCSI-ID
+  -device scsi-disk,drive=DRIVE-ID,bus=ID.0,scsi-id=SCSI-ID,removable=RMB
+
+  The (optional) removable parameter lets you override the SCSI INQUIRY
+  removable (RMB) bit for non CD-ROM devices.  It is ignored for CD-ROM devices
+  which are always removable.  RMB is "on" or "off".
 
 * if=floppy
 
@@ -116,7 +120,12 @@ For USB devices, the old way is actually different:
 Provides much less control than -drive's HOST-OPTS...  The new way
 fixes that:
 
-    -device usb-storage,drive=DRIVE-ID
+    -device usb-storage,drive=DRIVE-ID,removable=RMB
+
+The removable parameter gives control over the SCSI INQUIRY removable (RMB)
+bit.  USB thumbdrives usually set removable=on, while USB hard disks set
+removable=off.  See the if=scsi description above for details on the removable
+parameter, which applies only to scsi-disk devices and not to scsi-generic.
 
 === Character Devices ===