From c77a6c8dd7692e5249d5734df0256faad4c8aca2 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Fri, 3 Feb 2017 16:43:36 +0530 Subject: [PATCH 01/14] migration: remove myself as maintainer I'm switching jobs, and I'm not sure I can continue maintaining migration. Signed-off-by: Amit Shah Message-Id: <1486120416-11566-1-git-send-email-amit.shah@redhat.com> Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Signed-off-by: Dr. David Alan Gilbert --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7afbadaa15..d8ea161c7c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1431,7 +1431,6 @@ F: scripts/checkpatch.pl Migration M: Juan Quintela -M: Amit Shah M: Dr. David Alan Gilbert S: Maintained F: include/migration/ From cee887d96921d291fa72e90c79b5f03113dae86b Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Fri, 3 Feb 2017 16:43:53 +0530 Subject: [PATCH 02/14] MAINTAINERS: update my email address I'm leaving my job at Red Hat, and this email address will stop working next week. Update it to one that I will have access to later. Signed-off-by: Amit Shah Message-Id: <1486120433-11628-1-git-send-email-amit.shah@redhat.com> Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Signed-off-by: Dr. David Alan Gilbert --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index d8ea161c7c..fb57d8eb45 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1034,7 +1034,7 @@ F: hw/input/virtio-input*.c F: include/hw/virtio/virtio-input.h virtio-serial -M: Amit Shah +M: Amit Shah S: Supported F: hw/char/virtio-serial-bus.c F: hw/char/virtio-console.c @@ -1043,7 +1043,7 @@ F: tests/virtio-console-test.c F: tests/virtio-serial-test.c virtio-rng -M: Amit Shah +M: Amit Shah S: Supported F: hw/virtio/virtio-rng.c F: include/hw/virtio/virtio-rng.h From 9eb14766102b7af5c3ab3d64cc85985bdaa544e2 Mon Sep 17 00:00:00 2001 From: Pavel Butsykin Date: Fri, 3 Feb 2017 18:23:19 +0300 Subject: [PATCH 03/14] migration: add MigrationState arg for ram_save_/compressed_/page() Cosmetic patch. The use of the ms variable instead of migrate_get_current() looks nicer, especially when it is reused several times. Signed-off-by: Pavel Butsykin Message-Id: <20170203152321.19739-2-pbutsykin@virtuozzo.com> Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Dr. David Alan Gilbert --- migration/ram.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index ef8fadfe69..91443b3961 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -713,13 +713,14 @@ static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset, * >=0 - Number of pages written - this might legally be 0 * if xbzrle noticed the page was the same. * + * @ms: The current migration state. 
* @f: QEMUFile where to send the data * @block: block that contains the page we want to send * @offset: offset inside the block for the page * @last_stage: if we are at the completion stage * @bytes_transferred: increase it with the number of transferred bytes */ -static int ram_save_page(QEMUFile *f, PageSearchStatus *pss, +static int ram_save_page(MigrationState *ms, QEMUFile *f, PageSearchStatus *pss, bool last_stage, uint64_t *bytes_transferred) { int pages = -1; @@ -765,8 +766,7 @@ static int ram_save_page(QEMUFile *f, PageSearchStatus *pss, */ xbzrle_cache_zero_page(current_addr); } else if (!ram_bulk_stage && - !migration_in_postcopy(migrate_get_current()) && - migrate_use_xbzrle()) { + !migration_in_postcopy(ms) && migrate_use_xbzrle()) { pages = save_xbzrle_page(f, &p, current_addr, block, offset, last_stage, bytes_transferred); if (!last_stage) { @@ -893,14 +893,15 @@ static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block, * * Returns: Number of pages written. * + * @ms: The current migration state. * @f: QEMUFile where to send the data * @block: block that contains the page we want to send * @offset: offset inside the block for the page * @last_stage: if we are at the completion stage * @bytes_transferred: increase it with the number of transferred bytes */ -static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, - bool last_stage, +static int ram_save_compressed_page(MigrationState *ms, QEMUFile *f, + PageSearchStatus *pss, bool last_stage, uint64_t *bytes_transferred) { int pages = -1; @@ -1231,11 +1232,11 @@ static int ram_save_target_page(MigrationState *ms, QEMUFile *f, if (migration_bitmap_clear_dirty(dirty_ram_abs)) { unsigned long *unsentmap; if (compression_switch && migrate_use_compression()) { - res = ram_save_compressed_page(f, pss, + res = ram_save_compressed_page(ms, f, pss, last_stage, bytes_transferred); } else { - res = ram_save_page(f, pss, last_stage, + res = ram_save_page(ms, f, pss, last_stage, bytes_transferred); } From 53f09a1076f5efbba7d751a8005e2fcf008606db Mon Sep 17 00:00:00 2001 From: Pavel Butsykin Date: Fri, 3 Feb 2017 18:23:20 +0300 Subject: [PATCH 04/14] add 'release-ram' migrate capability This feature frees the migrated memory on the source during postcopy-ram migration. In the second step of postcopy-ram migration when the source vm is put on pause we can free unnecessary memory. It will allow, in particular, to start relaxing the memory stress on the source host in a load-balancing scenario. Signed-off-by: Pavel Butsykin Message-Id: <20170203152321.19739-3-pbutsykin@virtuozzo.com> Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Dr. David Alan Gilbert Manually merged in Pavel's 'migration: madvise error_report fixup!' 
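As background for how this capability behaves on the source side: once the guest is paused for the postcopy phase and a page has been sent, its contents no longer matter on the source, so the range can be handed back to the kernel with an madvise(DONTNEED). The short standalone sketch below is only an illustration (it is not QEMU code; the buffer name and sizes are arbitrary) of that Linux behaviour: after the call the kernel may reclaim the backing pages, the mapping stays valid, and reads of the range return zero-filled pages.

    /* Standalone illustration (not part of this patch): anonymous memory
     * released with madvise(MADV_DONTNEED) is handed back to the kernel;
     * the mapping stays usable and reads back as zeroes afterwards.
     */
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        const size_t len = 16 * 4096;   /* arbitrary buffer size */
        unsigned char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        memset(buf, 0xab, len);         /* fault the pages in; RSS grows */

        /* Release the range; the patch does the equivalent with
         * qemu_madvise(QEMU_MADV_DONTNEED) on already-sent pages.
         */
        if (madvise(buf, len, MADV_DONTNEED) < 0) {
            perror("madvise");
            return 1;
        }

        printf("first byte after DONTNEED: %#x\n", (unsigned)buf[0]); /* 0 */
        munmap(buf, len);
        return 0;
    }

The capability is off by default and is expected to be enabled together with postcopy-ram before the migration starts, for example with 'migrate_set_capability release-ram on' in the HMP monitor.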
--- include/migration/migration.h | 1 + include/migration/qemu-file.h | 3 +- migration/migration.c | 9 ++++++ migration/qemu-file.c | 59 ++++++++++++++++++++++++++++++----- migration/ram.c | 22 ++++++++++++- qapi-schema.json | 5 ++- 6 files changed, 89 insertions(+), 10 deletions(-) diff --git a/include/migration/migration.h b/include/migration/migration.h index 7528cc2fbc..b9b706a7e3 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -304,6 +304,7 @@ int migrate_add_blocker(Error *reason, Error **errp); */ void migrate_del_blocker(Error *reason); +bool migrate_release_ram(void); bool migrate_postcopy_ram(void); bool migrate_zero_blocks(void); diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index abedd466c9..0cd648a733 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -132,7 +132,8 @@ void qemu_put_byte(QEMUFile *f, int v); * put_buffer without copying the buffer. * The buffer should be available till it is sent asynchronously. */ -void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size); +void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size, + bool may_free); bool qemu_file_mode_is_not_valid(const char *mode); bool qemu_file_is_writable(QEMUFile *f); diff --git a/migration/migration.c b/migration/migration.c index 2b179c69fa..68afc07016 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1297,6 +1297,15 @@ void qmp_migrate_set_downtime(double value, Error **errp) qmp_migrate_set_parameters(&p, errp); } +bool migrate_release_ram(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_RELEASE_RAM]; +} + bool migrate_postcopy_ram(void) { MigrationState *s; diff --git a/migration/qemu-file.c b/migration/qemu-file.c index e9fae31158..195fa94fcf 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -49,6 +49,7 @@ struct QEMUFile { int buf_size; /* 0 when writing */ uint8_t buf[IO_BUF_SIZE]; + DECLARE_BITMAP(may_free, MAX_IOV_SIZE); struct iovec iov[MAX_IOV_SIZE]; unsigned int iovcnt; @@ -132,6 +133,41 @@ bool qemu_file_is_writable(QEMUFile *f) return f->ops->writev_buffer; } +static void qemu_iovec_release_ram(QEMUFile *f) +{ + struct iovec iov; + unsigned long idx; + + /* Find and release all the contiguous memory ranges marked as may_free. */ + idx = find_next_bit(f->may_free, f->iovcnt, 0); + if (idx >= f->iovcnt) { + return; + } + iov = f->iov[idx]; + + /* The madvise() in the loop is called for iov within a continuous range and + * then reinitialize the iov. And in the end, madvise() is called for the + * last iov. 
+ */ + while ((idx = find_next_bit(f->may_free, f->iovcnt, idx + 1)) < f->iovcnt) { + /* check for adjacent buffer and coalesce them */ + if (iov.iov_base + iov.iov_len == f->iov[idx].iov_base) { + iov.iov_len += f->iov[idx].iov_len; + continue; + } + if (qemu_madvise(iov.iov_base, iov.iov_len, QEMU_MADV_DONTNEED) < 0) { + error_report("migrate: madvise DONTNEED failed %p %zd: %s", + iov.iov_base, iov.iov_len, strerror(errno)); + } + iov = f->iov[idx]; + } + if (qemu_madvise(iov.iov_base, iov.iov_len, QEMU_MADV_DONTNEED) < 0) { + error_report("migrate: madvise DONTNEED failed %p %zd: %s", + iov.iov_base, iov.iov_len, strerror(errno)); + } + memset(f->may_free, 0, sizeof(f->may_free)); +} + /** * Flushes QEMUFile buffer * @@ -151,6 +187,8 @@ void qemu_fflush(QEMUFile *f) if (f->iovcnt > 0) { expect = iov_size(f->iov, f->iovcnt); ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos); + + qemu_iovec_release_ram(f); } if (ret >= 0) { @@ -304,13 +342,19 @@ int qemu_fclose(QEMUFile *f) return ret; } -static void add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size) +static void add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size, + bool may_free) { /* check for adjacent buffer and coalesce them */ if (f->iovcnt > 0 && buf == f->iov[f->iovcnt - 1].iov_base + - f->iov[f->iovcnt - 1].iov_len) { + f->iov[f->iovcnt - 1].iov_len && + may_free == test_bit(f->iovcnt - 1, f->may_free)) + { f->iov[f->iovcnt - 1].iov_len += size; } else { + if (may_free) { + set_bit(f->iovcnt, f->may_free); + } f->iov[f->iovcnt].iov_base = (uint8_t *)buf; f->iov[f->iovcnt++].iov_len = size; } @@ -320,14 +364,15 @@ static void add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size) } } -void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size) +void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size, + bool may_free) { if (f->last_error) { return; } f->bytes_xfer += size; - add_to_iovec(f, buf, size); + add_to_iovec(f, buf, size, may_free); } void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size) @@ -345,7 +390,7 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size) } memcpy(f->buf + f->buf_index, buf, l); f->bytes_xfer += l; - add_to_iovec(f, f->buf + f->buf_index, l); + add_to_iovec(f, f->buf + f->buf_index, l, false); f->buf_index += l; if (f->buf_index == IO_BUF_SIZE) { qemu_fflush(f); @@ -366,7 +411,7 @@ void qemu_put_byte(QEMUFile *f, int v) f->buf[f->buf_index] = v; f->bytes_xfer++; - add_to_iovec(f, f->buf + f->buf_index, 1); + add_to_iovec(f, f->buf + f->buf_index, 1, false); f->buf_index++; if (f->buf_index == IO_BUF_SIZE) { qemu_fflush(f); @@ -647,7 +692,7 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, } qemu_put_be32(f, blen); if (f->ops->writev_buffer) { - add_to_iovec(f, f->buf + f->buf_index, blen); + add_to_iovec(f, f->buf + f->buf_index, blen, false); } f->buf_index += blen; if (f->buf_index == IO_BUF_SIZE) { diff --git a/migration/ram.c b/migration/ram.c index 91443b3961..c22209db30 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -705,6 +705,16 @@ static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset, return pages; } +static void ram_release_pages(MigrationState *ms, const char *block_name, + uint64_t offset, int pages) +{ + if (!migrate_release_ram() || !migration_in_postcopy(ms)) { + return; + } + + ram_discard_range(NULL, block_name, offset, pages << TARGET_PAGE_BITS); +} + /** * ram_save_page: Send the given page to the stream * @@ -765,6 +775,7 @@ static int 
ram_save_page(MigrationState *ms, QEMUFile *f, PageSearchStatus *pss, * page would be stale */ xbzrle_cache_zero_page(current_addr); + ram_release_pages(ms, block->idstr, pss->offset, pages); } else if (!ram_bulk_stage && !migration_in_postcopy(ms) && migrate_use_xbzrle()) { pages = save_xbzrle_page(f, &p, current_addr, block, @@ -783,7 +794,9 @@ static int ram_save_page(MigrationState *ms, QEMUFile *f, PageSearchStatus *pss, *bytes_transferred += save_page_header(f, block, offset | RAM_SAVE_FLAG_PAGE); if (send_async) { - qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE); + qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE, + migrate_release_ram() & + migration_in_postcopy(ms)); } else { qemu_put_buffer(f, p, TARGET_PAGE_SIZE); } @@ -813,6 +826,8 @@ static int do_compress_ram_page(QEMUFile *f, RAMBlock *block, error_report("compressed data failed!"); } else { bytes_sent += blen; + ram_release_pages(migrate_get_current(), block->idstr, + offset & TARGET_PAGE_MASK, 1); } return bytes_sent; @@ -952,12 +967,17 @@ static int ram_save_compressed_page(MigrationState *ms, QEMUFile *f, error_report("compressed data failed!"); } } + if (pages > 0) { + ram_release_pages(ms, block->idstr, pss->offset, pages); + } } else { offset |= RAM_SAVE_FLAG_CONTINUE; pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { pages = compress_page_with_multi_thread(f, block, offset, bytes_transferred); + } else { + ram_release_pages(ms, block->idstr, pss->offset, pages); + } } } diff --git a/qapi-schema.json b/qapi-schema.json index 61151f34d0..93305412dd 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -865,11 +865,14 @@ # side, this process is called COarse-Grain LOck Stepping (COLO) for # Non-stop Service. (since 2.8) # +# @release-ram: if enabled, qemu will free the migrated ram pages on the source +# during postcopy-ram migration. (since 2.9) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks', - 'compress', 'events', 'postcopy-ram', 'x-colo'] } + 'compress', 'events', 'postcopy-ram', 'x-colo', 'release-ram'] } ## # @MigrationCapabilityStatus: From ced1c6166ef901a91578d54ff39f46b3e7ae870f Mon Sep 17 00:00:00 2001 From: Pavel Butsykin Date: Fri, 3 Feb 2017 18:23:21 +0300 Subject: [PATCH 05/14] migration: discard non-dirty ram pages after the start of postcopy After the start of postcopy migration there are some non-dirty pages which have already been migrated. These pages are no longer needed on the source VM, so we can free them, and doing so doesn't hurt the completion of the migration. Signed-off-by: Pavel Butsykin Message-Id: <20170203152321.19739-4-pbutsykin@virtuozzo.com> Signed-off-by: Dr. 
David Alan Gilbert --- include/migration/migration.h | 1 + migration/migration.c | 4 ++++ migration/ram.c | 19 +++++++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/include/migration/migration.h b/include/migration/migration.h index b9b706a7e3..71ce19062c 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -285,6 +285,7 @@ int ram_postcopy_send_discard_bitmap(MigrationState *ms); int ram_discard_range(MigrationIncomingState *mis, const char *block_name, uint64_t start, size_t length); int ram_postcopy_incoming_init(MigrationIncomingState *mis); +void ram_postcopy_migrated_memory_release(MigrationState *ms); /** * @migrate_add_blocker - prevent migration from proceeding diff --git a/migration/migration.c b/migration/migration.c index 68afc07016..2a26a20aaf 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1722,6 +1722,10 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running) */ qemu_savevm_send_ping(ms->to_dst_file, 4); + if (migrate_release_ram()) { + ram_postcopy_migrated_memory_release(ms); + } + ret = qemu_file_get_error(ms->to_dst_file); if (ret) { error_report("postcopy_start: Migration stream errored"); diff --git a/migration/ram.c b/migration/ram.c index c22209db30..67f2efbc59 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1537,6 +1537,25 @@ void ram_debug_dump_bitmap(unsigned long *todump, bool expected) /* **** functions for postcopy ***** */ +void ram_postcopy_migrated_memory_release(MigrationState *ms) +{ + struct RAMBlock *block; + unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; + + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + unsigned long first = block->offset >> TARGET_PAGE_BITS; + unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS); + unsigned long run_start = find_next_zero_bit(bitmap, range, first); + + while (run_start < range) { + unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); + ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS, + (run_end - run_start) << TARGET_PAGE_BITS); + run_start = find_next_zero_bit(bitmap, range, run_end + 1); + } + } +} + /* * Callback from postcopy_each_ram_send_discard for each RAMBlock * Note: At this point the 'unsentmap' is the processed bitmap combined From 0827b9e97d443781a17a21c64695940675aa1f8a Mon Sep 17 00:00:00 2001 From: Ashijeet Acharya Date: Wed, 8 Feb 2017 19:58:45 +0530 Subject: [PATCH 06/14] migrate: Introduce zero RAM checks to skip RAM migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migration of a "none" machine with no RAM crashes abruptly as bitmap_new() fails and thus aborts. Instead place zero RAM checks at appropriate places to skip migration of RAM in this case and complete migration successfully for devices only. Signed-off-by: Ashijeet Acharya Message-Id: <1486564125-31366-1-git-send-email-ashijeetacharya@gmail.com> Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Dr. 
David Alan Gilbert --- migration/ram.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 67f2efbc59..f289fcddd5 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1346,6 +1346,11 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage, ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in ram_addr_t space */ + /* No dirty page as there is zero RAM */ + if (!ram_bytes_total()) { + return pages; + } + pss.block = last_seen_block; pss.offset = last_offset; pss.complete_round = false; @@ -1952,14 +1957,17 @@ static int ram_save_init_globals(void) bytes_transferred = 0; reset_ram_globals(); - ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS; migration_bitmap_rcu = g_new0(struct BitmapRcu, 1); - migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages); - bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages); + /* Skip setting bitmap if there is no RAM */ + if (ram_bytes_total()) { + ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS; + migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages); + bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages); - if (migrate_postcopy_ram()) { - migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages); - bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages); + if (migrate_postcopy_ram()) { + migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages); + bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages); + } } /* From 59046ec29ad4c24391bb9fe1fbdced33557aaa70 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Fri, 3 Feb 2017 18:52:17 +0100 Subject: [PATCH 07/14] migration: consolidate VMStateField.start The member VMStateField.start is used for two things, partial data migration for VBUFFER data (basically provide migration for a sub-buffer) and for locating next in QTAILQ. The implementation of the VBUFFER feature is broken when VMSTATE_ALLOC is used. This however goes unnoticed because actually partial migration for VBUFFER is not used at all. Let's consolidate the usage of VMStateField.start by removing support for partial migration for VBUFFER. Signed-off-by: Halil Pasic Message-Id: <20170203175217.45562-1-pasic@linux.vnet.ibm.com> Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Dr. 
David Alan Gilbert --- hw/char/exynos4210_uart.c | 2 +- hw/display/g364fb.c | 2 +- hw/dma/pl330.c | 8 ++++---- hw/intc/exynos4210_gic.c | 2 +- hw/ipmi/isa_ipmi_bt.c | 6 ++---- hw/net/vmxnet3.c | 2 +- hw/nvram/mac_nvram.c | 2 +- hw/nvram/spapr_nvram.c | 2 +- hw/sd/sdhci.c | 2 +- hw/timer/m48t59.c | 2 +- include/migration/vmstate.h | 21 ++++++++------------- migration/savevm.c | 2 +- migration/vmstate.c | 4 ++-- target/s390x/machine.c | 2 +- util/fifo8.c | 2 +- 15 files changed, 27 insertions(+), 34 deletions(-) diff --git a/hw/char/exynos4210_uart.c b/hw/char/exynos4210_uart.c index 7c16e894e2..b75f28d473 100644 --- a/hw/char/exynos4210_uart.c +++ b/hw/char/exynos4210_uart.c @@ -561,7 +561,7 @@ static const VMStateDescription vmstate_exynos4210_uart_fifo = { .fields = (VMStateField[]) { VMSTATE_UINT32(sp, Exynos4210UartFIFO), VMSTATE_UINT32(rp, Exynos4210UartFIFO), - VMSTATE_VBUFFER_UINT32(data, Exynos4210UartFIFO, 1, NULL, 0, size), + VMSTATE_VBUFFER_UINT32(data, Exynos4210UartFIFO, 1, NULL, size), VMSTATE_END_OF_LIST() } }; diff --git a/hw/display/g364fb.c b/hw/display/g364fb.c index 70ef2c7453..8cdc205dd9 100644 --- a/hw/display/g364fb.c +++ b/hw/display/g364fb.c @@ -464,7 +464,7 @@ static const VMStateDescription vmstate_g364fb = { .minimum_version_id = 1, .post_load = g364fb_post_load, .fields = (VMStateField[]) { - VMSTATE_VBUFFER_UINT32(vram, G364State, 1, NULL, 0, vram_size), + VMSTATE_VBUFFER_UINT32(vram, G364State, 1, NULL, vram_size), VMSTATE_BUFFER_UNSAFE(color_palette, G364State, 0, 256 * 3), VMSTATE_BUFFER_UNSAFE(cursor_palette, G364State, 0, 9), VMSTATE_UINT16_ARRAY(cursor, G364State, 512), diff --git a/hw/dma/pl330.c b/hw/dma/pl330.c index c0bd9fec30..32cf8399b8 100644 --- a/hw/dma/pl330.c +++ b/hw/dma/pl330.c @@ -173,8 +173,8 @@ static const VMStateDescription vmstate_pl330_fifo = { .version_id = 1, .minimum_version_id = 1, .fields = (VMStateField[]) { - VMSTATE_VBUFFER_UINT32(buf, PL330Fifo, 1, NULL, 0, buf_size), - VMSTATE_VBUFFER_UINT32(tag, PL330Fifo, 1, NULL, 0, buf_size), + VMSTATE_VBUFFER_UINT32(buf, PL330Fifo, 1, NULL, buf_size), + VMSTATE_VBUFFER_UINT32(tag, PL330Fifo, 1, NULL, buf_size), VMSTATE_UINT32(head, PL330Fifo), VMSTATE_UINT32(num, PL330Fifo), VMSTATE_UINT32(buf_size, PL330Fifo), @@ -282,8 +282,8 @@ static const VMStateDescription vmstate_pl330 = { VMSTATE_STRUCT(manager, PL330State, 0, vmstate_pl330_chan, PL330Chan), VMSTATE_STRUCT_VARRAY_UINT32(chan, PL330State, num_chnls, 0, vmstate_pl330_chan, PL330Chan), - VMSTATE_VBUFFER_UINT32(lo_seqn, PL330State, 1, NULL, 0, num_chnls), - VMSTATE_VBUFFER_UINT32(hi_seqn, PL330State, 1, NULL, 0, num_chnls), + VMSTATE_VBUFFER_UINT32(lo_seqn, PL330State, 1, NULL, num_chnls), + VMSTATE_VBUFFER_UINT32(hi_seqn, PL330State, 1, NULL, num_chnls), VMSTATE_STRUCT(fifo, PL330State, 0, vmstate_pl330_fifo, PL330Fifo), VMSTATE_STRUCT(read_queue, PL330State, 0, vmstate_pl330_queue, PL330Queue), diff --git a/hw/intc/exynos4210_gic.c b/hw/intc/exynos4210_gic.c index fd7a8f3058..2a55817b76 100644 --- a/hw/intc/exynos4210_gic.c +++ b/hw/intc/exynos4210_gic.c @@ -393,7 +393,7 @@ static const VMStateDescription vmstate_exynos4210_irq_gate = { .version_id = 2, .minimum_version_id = 2, .fields = (VMStateField[]) { - VMSTATE_VBUFFER_UINT32(level, Exynos4210IRQGateState, 1, NULL, 0, n_in), + VMSTATE_VBUFFER_UINT32(level, Exynos4210IRQGateState, 1, NULL, n_in), VMSTATE_END_OF_LIST() } }; diff --git a/hw/ipmi/isa_ipmi_bt.c b/hw/ipmi/isa_ipmi_bt.c index f03661715c..1c69cb33f8 100644 --- a/hw/ipmi/isa_ipmi_bt.c +++ b/hw/ipmi/isa_ipmi_bt.c @@ 
-471,10 +471,8 @@ static const VMStateDescription vmstate_ISAIPMIBTDevice = { VMSTATE_BOOL(bt.use_irq, ISAIPMIBTDevice), VMSTATE_BOOL(bt.irqs_enabled, ISAIPMIBTDevice), VMSTATE_UINT32(bt.outpos, ISAIPMIBTDevice), - VMSTATE_VBUFFER_UINT32(bt.outmsg, ISAIPMIBTDevice, 1, NULL, 0, - bt.outlen), - VMSTATE_VBUFFER_UINT32(bt.inmsg, ISAIPMIBTDevice, 1, NULL, 0, - bt.inlen), + VMSTATE_VBUFFER_UINT32(bt.outmsg, ISAIPMIBTDevice, 1, NULL, bt.outlen), + VMSTATE_VBUFFER_UINT32(bt.inmsg, ISAIPMIBTDevice, 1, NULL, bt.inlen), VMSTATE_UINT8(bt.control_reg, ISAIPMIBTDevice), VMSTATE_UINT8(bt.mask_reg, ISAIPMIBTDevice), VMSTATE_UINT8(bt.waiting_rsp, ISAIPMIBTDevice), diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c index 7dd456551c..e13a798b3b 100644 --- a/hw/net/vmxnet3.c +++ b/hw/net/vmxnet3.c @@ -2397,7 +2397,7 @@ static const VMStateDescription vmxstate_vmxnet3_mcast_list = { .pre_load = vmxnet3_mcast_list_pre_load, .needed = vmxnet3_mc_list_needed, .fields = (VMStateField[]) { - VMSTATE_VBUFFER_UINT32(mcast_list, VMXNET3State, 0, NULL, 0, + VMSTATE_VBUFFER_UINT32(mcast_list, VMXNET3State, 0, NULL, mcast_list_buff_size), VMSTATE_END_OF_LIST() } diff --git a/hw/nvram/mac_nvram.c b/hw/nvram/mac_nvram.c index 63f9ed1d82..aef80e64df 100644 --- a/hw/nvram/mac_nvram.c +++ b/hw/nvram/mac_nvram.c @@ -82,7 +82,7 @@ static const VMStateDescription vmstate_macio_nvram = { .version_id = 1, .minimum_version_id = 1, .fields = (VMStateField[]) { - VMSTATE_VBUFFER_UINT32(data, MacIONVRAMState, 0, NULL, 0, size), + VMSTATE_VBUFFER_UINT32(data, MacIONVRAMState, 0, NULL, size), VMSTATE_END_OF_LIST() } }; diff --git a/hw/nvram/spapr_nvram.c b/hw/nvram/spapr_nvram.c index eb42ea323f..65ba188555 100644 --- a/hw/nvram/spapr_nvram.c +++ b/hw/nvram/spapr_nvram.c @@ -224,7 +224,7 @@ static const VMStateDescription vmstate_spapr_nvram = { .post_load = spapr_nvram_post_load, .fields = (VMStateField[]) { VMSTATE_UINT32(size, sPAPRNVRAM), - VMSTATE_VBUFFER_ALLOC_UINT32(buf, sPAPRNVRAM, 1, NULL, 0, size), + VMSTATE_VBUFFER_ALLOC_UINT32(buf, sPAPRNVRAM, 1, NULL, size), VMSTATE_END_OF_LIST() }, }; diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c index 5bd5ab6319..da32b5f709 100644 --- a/hw/sd/sdhci.c +++ b/hw/sd/sdhci.c @@ -1253,7 +1253,7 @@ const VMStateDescription sdhci_vmstate = { VMSTATE_UINT16(data_count, SDHCIState), VMSTATE_UINT64(admasysaddr, SDHCIState), VMSTATE_UINT8(stopped_state, SDHCIState), - VMSTATE_VBUFFER_UINT32(fifo_buffer, SDHCIState, 1, NULL, 0, buf_maxsz), + VMSTATE_VBUFFER_UINT32(fifo_buffer, SDHCIState, 1, NULL, buf_maxsz), VMSTATE_TIMER_PTR(insert_timer, SDHCIState), VMSTATE_TIMER_PTR(transfer_timer, SDHCIState), VMSTATE_END_OF_LIST() diff --git a/hw/timer/m48t59.c b/hw/timer/m48t59.c index 015797732f..474981a6ac 100644 --- a/hw/timer/m48t59.c +++ b/hw/timer/m48t59.c @@ -563,7 +563,7 @@ static const VMStateDescription vmstate_m48t59 = { .fields = (VMStateField[]) { VMSTATE_UINT8(lock, M48t59State), VMSTATE_UINT16(addr, M48t59State), - VMSTATE_VBUFFER_UINT32(buffer, M48t59State, 0, NULL, 0, size), + VMSTATE_VBUFFER_UINT32(buffer, M48t59State, 0, NULL, size), VMSTATE_END_OF_LIST() } }; diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 6233fe2e5b..39db47eb3f 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -587,7 +587,8 @@ extern const VMStateInfo vmstate_info_qtailq; .offset = vmstate_offset_buffer(_state, _field) + _start, \ } -#define VMSTATE_VBUFFER_MULTIPLY(_field, _state, _version, _test, _start, _field_size, _multiply) { \ +#define 
VMSTATE_VBUFFER_MULTIPLY(_field, _state, _version, _test, \ + _field_size, _multiply) { \ .name = (stringify(_field)), \ .version_id = (_version), \ .field_exists = (_test), \ @@ -596,10 +597,9 @@ extern const VMStateInfo vmstate_info_qtailq; .info = &vmstate_info_buffer, \ .flags = VMS_VBUFFER|VMS_POINTER|VMS_MULTIPLY, \ .offset = offsetof(_state, _field), \ - .start = (_start), \ } -#define VMSTATE_VBUFFER(_field, _state, _version, _test, _start, _field_size) { \ +#define VMSTATE_VBUFFER(_field, _state, _version, _test, _field_size) { \ .name = (stringify(_field)), \ .version_id = (_version), \ .field_exists = (_test), \ @@ -607,10 +607,9 @@ extern const VMStateInfo vmstate_info_qtailq; .info = &vmstate_info_buffer, \ .flags = VMS_VBUFFER|VMS_POINTER, \ .offset = offsetof(_state, _field), \ - .start = (_start), \ } -#define VMSTATE_VBUFFER_UINT32(_field, _state, _version, _test, _start, _field_size) { \ +#define VMSTATE_VBUFFER_UINT32(_field, _state, _version, _test, _field_size) { \ .name = (stringify(_field)), \ .version_id = (_version), \ .field_exists = (_test), \ @@ -618,10 +617,10 @@ extern const VMStateInfo vmstate_info_qtailq; .info = &vmstate_info_buffer, \ .flags = VMS_VBUFFER|VMS_POINTER, \ .offset = offsetof(_state, _field), \ - .start = (_start), \ } -#define VMSTATE_VBUFFER_ALLOC_UINT32(_field, _state, _version, _test, _start, _field_size) { \ +#define VMSTATE_VBUFFER_ALLOC_UINT32(_field, _state, _version, \ + _test, _field_size) { \ .name = (stringify(_field)), \ .version_id = (_version), \ .field_exists = (_test), \ @@ -629,7 +628,6 @@ extern const VMStateInfo vmstate_info_qtailq; .info = &vmstate_info_buffer, \ .flags = VMS_VBUFFER|VMS_POINTER|VMS_ALLOC, \ .offset = offsetof(_state, _field), \ - .start = (_start), \ } #define VMSTATE_BUFFER_UNSAFE_INFO_TEST(_field, _state, _test, _version, _info, _size) { \ @@ -948,13 +946,10 @@ extern const VMStateInfo vmstate_info_qtailq; VMSTATE_BUFFER_START_MIDDLE_V(_f, _s, _start, 0) #define VMSTATE_PARTIAL_VBUFFER(_f, _s, _size) \ - VMSTATE_VBUFFER(_f, _s, 0, NULL, 0, _size) + VMSTATE_VBUFFER(_f, _s, 0, NULL, _size) #define VMSTATE_PARTIAL_VBUFFER_UINT32(_f, _s, _size) \ - VMSTATE_VBUFFER_UINT32(_f, _s, 0, NULL, 0, _size) - -#define VMSTATE_SUB_VBUFFER(_f, _s, _start, _size) \ - VMSTATE_VBUFFER(_f, _s, 0, NULL, _start, _size) + VMSTATE_VBUFFER_UINT32(_f, _s, 0, NULL, _size) #define VMSTATE_BUFFER_TEST(_f, _s, _test) \ VMSTATE_STATIC_BUFFER(_f, _s, 0, _test, 0, sizeof(typeof_field(_s, _f))) diff --git a/migration/savevm.c b/migration/savevm.c index 01997687c4..5ecd264134 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -356,7 +356,7 @@ static const VMStateDescription vmstate_configuration = { .pre_save = configuration_pre_save, .fields = (VMStateField[]) { VMSTATE_UINT32(len, SaveState), - VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, 0, len), + VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len), VMSTATE_END_OF_LIST() }, .subsections = (const VMStateDescription*[]) { diff --git a/migration/vmstate.c b/migration/vmstate.c index 2b2b3a58e6..520341a2de 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -68,10 +68,10 @@ static void *vmstate_base_addr(void *opaque, VMStateField *field, bool alloc) } } if (size) { - *((void **)base_addr + field->start) = g_malloc(size); + *(void **)base_addr = g_malloc(size); } } - base_addr = *(void **)base_addr + field->start; + base_addr = *(void **)base_addr; } return base_addr; diff --git a/target/s390x/machine.c b/target/s390x/machine.c index 
edc3a4717b..8503fa1c8d 100644 --- a/target/s390x/machine.c +++ b/target/s390x/machine.c @@ -180,7 +180,7 @@ const VMStateDescription vmstate_s390_cpu = { VMSTATE_UINT8(env.cpu_state, S390CPU), VMSTATE_UINT8(env.sigp_order, S390CPU), VMSTATE_UINT32_V(irqstate_saved_size, S390CPU, 4), - VMSTATE_VBUFFER_UINT32(irqstate, S390CPU, 4, NULL, 0, + VMSTATE_VBUFFER_UINT32(irqstate, S390CPU, 4, NULL, irqstate_saved_size), VMSTATE_END_OF_LIST() }, diff --git a/util/fifo8.c b/util/fifo8.c index 5c64101b33..d38b3bdaa5 100644 --- a/util/fifo8.c +++ b/util/fifo8.c @@ -118,7 +118,7 @@ const VMStateDescription vmstate_fifo8 = { .version_id = 1, .minimum_version_id = 1, .fields = (VMStateField[]) { - VMSTATE_VBUFFER_UINT32(data, Fifo8, 1, NULL, 0, capacity), + VMSTATE_VBUFFER_UINT32(data, Fifo8, 1, NULL, capacity), VMSTATE_UINT32(head, Fifo8), VMSTATE_UINT32(num, Fifo8), VMSTATE_END_OF_LIST() From 479125d53eb8509d69a0548f131028a65fcbd65a Mon Sep 17 00:00:00 2001 From: zhanghailiang Date: Tue, 17 Jan 2017 20:57:42 +0800 Subject: [PATCH 08/14] COLO: fix setting checkpoint-delay not working properly If we set checkpoint-delay through the 'migrate-set-parameters' command, it will not take effect until the current checkpoint-delay sleep finishes. That is especially annoying when we want to change its value from an extremely big one to a proper value. Fix it by using a timer to implement checkpoint-delay. Signed-off-by: zhanghailiang Message-Id: <1484657864-21708-2-git-send-email-zhang.zhanghailiang@huawei.com> Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Dr. David Alan Gilbert --- include/migration/colo.h | 2 ++ include/migration/migration.h | 5 +++++ migration/colo.c | 33 +++++++++++++++++++++++---------- migration/migration.c | 3 +++ 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/include/migration/colo.h b/include/migration/colo.h index e32eef4763..2bbff9e6c2 100644 --- a/include/migration/colo.h +++ b/include/migration/colo.h @@ -35,4 +35,6 @@ COLOMode get_colo_mode(void); /* failover */ void colo_do_failover(MigrationState *s); + +void colo_checkpoint_notify(void *opaque); #endif diff --git a/include/migration/migration.h b/include/migration/migration.h index 71ce19062c..cb83f1688e 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -188,6 +188,11 @@ struct MigrationState /* The RAMBlock used in the last src_page_request */ RAMBlock *last_req_rb; + /* The semaphore is used to notify COLO thread to do checkpoint */ + QemuSemaphore colo_checkpoint_sem; + int64_t colo_checkpoint_time; + QEMUTimer *colo_delay_timer; + /* The last error that occurred */ Error *error; }; diff --git a/migration/colo.c b/migration/colo.c index 93c85c538b..08b2e46dac 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -302,7 +302,7 @@ static void colo_process_checkpoint(MigrationState *s) { QIOChannelBuffer *bioc; QEMUFile *fb = NULL; - int64_t current_time, checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); + int64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); Error *local_err = NULL; int ret; @@ -332,26 +332,21 @@ static void colo_process_checkpoint(MigrationState *s) qemu_mutex_unlock_iothread(); trace_colo_vm_state_change("stop", "run"); + timer_mod(s->colo_delay_timer, + current_time + s->parameters.x_checkpoint_delay); + while (s->state == MIGRATION_STATUS_COLO) { if (failover_get_state() != FAILOVER_STATUS_NONE) { error_report("failover request"); goto out; } - current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); - if (current_time - checkpoint_time < s->parameters.x_checkpoint_delay) { - int64_t delay_ms; + qemu_sem_wait(&s->colo_checkpoint_sem); - delay_ms = s->parameters.x_checkpoint_delay - - (current_time - checkpoint_time); - g_usleep(delay_ms * 1000); - } ret = colo_do_checkpoint_transaction(s, bioc, fb); if (ret < 0) { goto out; } - checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); } out: @@ -364,14 +359,32 @@ out: qemu_fclose(fb); } + timer_del(s->colo_delay_timer); + if (s->rp_state.from_dst_file) { qemu_fclose(s->rp_state.from_dst_file); } } +void colo_checkpoint_notify(void *opaque) +{ + MigrationState *s = opaque; + int64_t next_notify_time; + + qemu_sem_post(&s->colo_checkpoint_sem); + s->colo_checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); + next_notify_time = s->colo_checkpoint_time + + s->parameters.x_checkpoint_delay; + timer_mod(s->colo_delay_timer, next_notify_time); +} + void migrate_start_colo_process(MigrationState *s) { qemu_mutex_unlock_iothread(); + qemu_sem_init(&s->colo_checkpoint_sem, 0); + s->colo_delay_timer = timer_new_ms(QEMU_CLOCK_HOST, + colo_checkpoint_notify, s); + migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COLO); colo_process_checkpoint(s); diff --git a/migration/migration.c b/migration/migration.c index 2a26a20aaf..c6ae69d371 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -891,6 +891,9 @@ void qmp_migrate_set_parameters(MigrationParameters *params, Error **errp) if (params->has_x_checkpoint_delay) { s->parameters.x_checkpoint_delay = params->x_checkpoint_delay; + if (migration_in_colo_state()) { + colo_checkpoint_notify(s); + } } } From c937b9a6db2d564b96aae35a6757bb4144ea5184 Mon Sep 17 00:00:00 2001 From: zhanghailiang Date: Tue, 17 Jan 2017 20:57:43 +0800 Subject: [PATCH 09/14] COLO: Shut down related socket fds while doing failover If the network connection between the primary host and the secondary host breaks while the COLO/COLO incoming threads are doing read() or write(), they will block until the connection times out, and the failover process will be blocked because of it. So it is necessary to shut down all the socket fds used by COLO to avoid this situation. Besides, we should close the corresponding file descriptors after the failover BH has shut them down, or there will be an error. Signed-off-by: zhanghailiang Signed-off-by: Li Zhijian Reviewed-by: Dr. David Alan Gilbert Cc: Dr. David Alan Gilbert Message-Id: <1484657864-21708-3-git-send-email-zhang.zhanghailiang@huawei.com> Signed-off-by: Dr. 
David Alan Gilbert --- include/migration/migration.h | 3 +++ migration/colo.c | 43 +++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/include/migration/migration.h b/include/migration/migration.h index cb83f1688e..1735d66512 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -116,6 +116,7 @@ struct MigrationIncomingState { QemuThread colo_incoming_thread; /* The coroutine we should enter (back) after failover */ Coroutine *migration_incoming_co; + QemuSemaphore colo_incoming_sem; /* See savevm.c */ LoadStateEntry_Head loadvm_handlers; @@ -187,6 +188,8 @@ struct MigrationState QSIMPLEQ_HEAD(src_page_requests, MigrationSrcPageRequest) src_page_requests; /* The RAMBlock used in the last src_page_request */ RAMBlock *last_req_rb; + /* The semaphore is used to notify COLO thread that failover is finished */ + QemuSemaphore colo_exit_sem; /* The semaphore is used to notify COLO thread to do checkpoint */ QemuSemaphore colo_checkpoint_sem; diff --git a/migration/colo.c b/migration/colo.c index 08b2e46dac..3222812d96 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -59,6 +59,18 @@ static void secondary_vm_do_failover(void) /* recover runstate to normal migration finish state */ autostart = true; } + /* + * Make sure COLO incoming thread not block in recv or send, + * If mis->from_src_file and mis->to_src_file use the same fd, + * The second shutdown() will return -1, we ignore this value, + * It is harmless. + */ + if (mis->from_src_file) { + qemu_file_shutdown(mis->from_src_file); + } + if (mis->to_src_file) { + qemu_file_shutdown(mis->to_src_file); + } old_state = failover_set_state(FAILOVER_STATUS_ACTIVE, FAILOVER_STATUS_COMPLETED); @@ -67,6 +79,8 @@ static void secondary_vm_do_failover(void) "secondary VM", FailoverStatus_lookup[old_state]); return; } + /* Notify COLO incoming thread that failover work is finished */ + qemu_sem_post(&mis->colo_incoming_sem); /* For Secondary VM, jump to incoming co */ if (mis->migration_incoming_co) { qemu_coroutine_enter(mis->migration_incoming_co); @@ -81,6 +95,18 @@ static void primary_vm_do_failover(void) migrate_set_state(&s->state, MIGRATION_STATUS_COLO, MIGRATION_STATUS_COMPLETED); + /* + * Wake up COLO thread which may blocked in recv() or send(), + * The s->rp_state.from_dst_file and s->to_dst_file may use the + * same fd, but we still shutdown the fd for twice, it is harmless. + */ + if (s->to_dst_file) { + qemu_file_shutdown(s->to_dst_file); + } + if (s->rp_state.from_dst_file) { + qemu_file_shutdown(s->rp_state.from_dst_file); + } + old_state = failover_set_state(FAILOVER_STATUS_ACTIVE, FAILOVER_STATUS_COMPLETED); if (old_state != FAILOVER_STATUS_ACTIVE) { @@ -88,6 +114,8 @@ static void primary_vm_do_failover(void) FailoverStatus_lookup[old_state]); return; } + /* Notify COLO thread that failover work is finished */ + qemu_sem_post(&s->colo_exit_sem); } void colo_do_failover(MigrationState *s) @@ -361,6 +389,14 @@ out: timer_del(s->colo_delay_timer); + /* Hope this not to be too long to wait here */ + qemu_sem_wait(&s->colo_exit_sem); + qemu_sem_destroy(&s->colo_exit_sem); + /* + * Must be called after failover BH is completed, + * Or the failover BH may shutdown the wrong fd that + * re-used by other threads after we release here. 
+ */ if (s->rp_state.from_dst_file) { qemu_fclose(s->rp_state.from_dst_file); } @@ -385,6 +421,7 @@ void migrate_start_colo_process(MigrationState *s) s->colo_delay_timer = timer_new_ms(QEMU_CLOCK_HOST, colo_checkpoint_notify, s); + qemu_sem_init(&s->colo_exit_sem, 0); migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COLO); colo_process_checkpoint(s); @@ -423,6 +460,8 @@ void *colo_process_incoming_thread(void *opaque) uint64_t value; Error *local_err = NULL; + qemu_sem_init(&mis->colo_incoming_sem, 0); + migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COLO); @@ -533,6 +572,10 @@ out: qemu_fclose(fb); } + /* Hope this not to be too long to loop here */ + qemu_sem_wait(&mis->colo_incoming_sem); + qemu_sem_destroy(&mis->colo_incoming_sem); + /* Must be called after failover BH is completed */ if (mis->to_src_file) { qemu_fclose(mis->to_src_file); } From a8664ba5101446f4f2c24b24ed9e10335bbbd46b Mon Sep 17 00:00:00 2001 From: zhanghailiang Date: Tue, 17 Jan 2017 20:57:44 +0800 Subject: [PATCH 10/14] COLO: Don't process failover request while loading VM's state We should not do failover work while the main thread is loading the VM's state. Otherwise the consistency of the VM's memory and device state will be broken. We will restart the loading process after jumping over this stage. The new failover status 'RELAUNCH' will help to record whether we need to restart the process. Cc: Eric Blake Signed-off-by: zhanghailiang Signed-off-by: Li Zhijian Reviewed-by: Dr. David Alan Gilbert Message-Id: <1484657864-21708-4-git-send-email-zhang.zhanghailiang@huawei.com> Signed-off-by: Dr. David Alan Gilbert Added a missing '(Since 2.9)' --- migration/colo.c | 26 ++++++++++++++++++++++++++ qapi-schema.json | 4 +++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/migration/colo.c b/migration/colo.c index 3222812d96..712308ed5e 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -20,6 +20,8 @@ #include "qapi/error.h" #include "migration/failover.h" +static bool vmstate_loading; + #define COLO_BUFFER_BASE_SIZE (4 * 1024 * 1024) bool colo_supported(void) @@ -51,6 +53,19 @@ static void secondary_vm_do_failover(void) int old_state; MigrationIncomingState *mis = migration_incoming_get_current(); + /* Can not do failover during the process of VM's loading VMstate, Or + * it will break the secondary VM. 
+ */ + if (vmstate_loading) { + old_state = failover_set_state(FAILOVER_STATUS_ACTIVE, + FAILOVER_STATUS_RELAUNCH); + if (old_state != FAILOVER_STATUS_ACTIVE) { + error_report("Unknown error while do failover for secondary VM," + "old_state: %s", FailoverStatus_lookup[old_state]); + } + return; + } + migrate_set_state(&mis->state, MIGRATION_STATUS_COLO, MIGRATION_STATUS_COMPLETED); @@ -548,13 +563,23 @@ void *colo_process_incoming_thread(void *opaque) qemu_mutex_lock_iothread(); qemu_system_reset(VMRESET_SILENT); + vmstate_loading = true; if (qemu_loadvm_state(fb) < 0) { error_report("COLO: loadvm failed"); qemu_mutex_unlock_iothread(); goto out; } + + vmstate_loading = false; qemu_mutex_unlock_iothread(); + if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) { + failover_set_state(FAILOVER_STATUS_RELAUNCH, + FAILOVER_STATUS_NONE); + failover_request_active(NULL); + goto out; + } + colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_LOADED, &local_err); if (local_err) { @@ -563,6 +588,7 @@ void *colo_process_incoming_thread(void *opaque) } out: + vmstate_loading = false; /* Throw the unreported error message after exited from loop */ if (local_err) { error_report_err(local_err); diff --git a/qapi-schema.json b/qapi-schema.json index 93305412dd..5edb08d621 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -1193,10 +1193,12 @@ # # @completed: finish the process of failover # +# @relaunch: restart the failover process, from 'none' -> 'completed' (Since 2.9) +# # Since: 2.8 ## { 'enum': 'FailoverStatus', - 'data': [ 'none', 'require', 'active', 'completed'] } + 'data': [ 'none', 'require', 'active', 'completed', 'relaunch' ] } ## # @x-colo-lost-heartbeat: From b5b5c569570c414bf0aa80a5ae9480debe07ed58 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 3 Feb 2017 16:06:48 +0000 Subject: [PATCH 11/14] migration: Add VMSTATE_UNUSED_VARRAY_UINT32 VMSTATE_UNUSED_VARRAY_UINT32 is used to skip a chunk of the stream that's an n-element array; note the array size and the dynamic value read never get multiplied so there's no overflow risk. Signed-off-by: Dr. David Alan Gilbert Message-Id: <20170203160651.19917-2-dgilbert@redhat.com> Reviewed-by: Juan Quintela Signed-off-by: Dr. David Alan Gilbert --- include/migration/vmstate.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 39db47eb3f..7339594337 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -658,6 +658,17 @@ extern const VMStateInfo vmstate_info_qtailq; .flags = VMS_BUFFER, \ } +/* Discard size * field_num bytes, where field_num is a uint32 member */ +#define VMSTATE_UNUSED_VARRAY_UINT32(_state, _test, _version, _field_num, _size) {\ + .name = "unused", \ + .field_exists = (_test), \ + .num_offset = vmstate_offset_value(_state, _field_num, uint32_t),\ + .version_id = (_version), \ + .size = (_size), \ + .info = &vmstate_info_unused_buffer, \ + .flags = VMS_VARRAY_UINT32 | VMS_BUFFER, \ +} + /* _field_size should be a int32_t field in the _state struct giving the * size of the bitmap _field in bits. */ From bcf45131293664118355df03d2ce5458156deaad Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 3 Feb 2017 16:06:49 +0000 Subject: [PATCH 12/14] migration: Add VMSTATE_WITH_TMP VMSTATE_WITH_TMP is for handling structures where some calculation or rearrangement of the data needs to be performed before the data hits the wire. 
For example, where the value on the wire is an offset from a non-migrated base, but the data in the structure is the actual pointer. To use it, a temporary type is created and a vmsd used on that type. The first element of the type must be 'parent' a pointer back to the type of the main structure. VMSTATE_WITH_TMP takes care of allocating and freeing the temporary before running the child vmsd. The post_load/pre_save on the child vmsd can copy things from the parent to the temporary using the parent pointer and do any other calculations needed; it can then use normal VMSD entries to do the actual data storage without having to fiddle around with qemu_get_*/qemu_put_* Signed-off-by: Dr. David Alan Gilbert Reviewed-by: David Gibson Reviewed-by: Juan Quintela Message-Id: <20170203160651.19917-3-dgilbert@redhat.com> Signed-off-by: Dr. David Alan Gilbert --- include/migration/vmstate.h | 19 ++++++++++++++++++ migration/vmstate.c | 40 +++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 7339594337..63e7b02e05 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -259,6 +259,7 @@ extern const VMStateInfo vmstate_info_cpudouble; extern const VMStateInfo vmstate_info_timer; extern const VMStateInfo vmstate_info_buffer; extern const VMStateInfo vmstate_info_unused_buffer; +extern const VMStateInfo vmstate_info_tmp; extern const VMStateInfo vmstate_info_bitmap; extern const VMStateInfo vmstate_info_qtailq; @@ -649,6 +650,24 @@ extern const VMStateInfo vmstate_info_qtailq; .offset = offsetof(_state, _field), \ } +/* Allocate a temporary of type 'tmp_type', set tmp->parent to _state + * and execute the vmsd on the temporary. Note that we're working with + * the whole of _state here, not a field within it. + * We compile time check that: + * That _tmp_type contains a 'parent' member that's a pointer to the + * '_state' type + * That the pointer is right at the start of _tmp_type. + */ +#define VMSTATE_WITH_TMP(_state, _tmp_type, _vmsd) { \ + .name = "tmp", \ + .size = sizeof(_tmp_type) + \ + QEMU_BUILD_BUG_ON_ZERO(offsetof(_tmp_type, parent) != 0) + \ + type_check_pointer(_state, \ + typeof_field(_tmp_type, parent)), \ + .vmsd = &(_vmsd), \ + .info = &vmstate_info_tmp, \ +} + #define VMSTATE_UNUSED_BUFFER(_test, _version, _size) { \ .name = "unused", \ .field_exists = (_test), \ diff --git a/migration/vmstate.c b/migration/vmstate.c index 520341a2de..b4d8ae982a 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -935,6 +935,46 @@ const VMStateInfo vmstate_info_unused_buffer = { .put = put_unused_buffer, }; +/* vmstate_info_tmp, see VMSTATE_WITH_TMP, the idea is that we allocate + * a temporary buffer and the pre_load/pre_save methods in the child vmsd + * copy stuff from the parent into the child and do calculations to fill + * in fields that don't really exist in the parent but need to be in the + * stream. 
+ */ +static int get_tmp(QEMUFile *f, void *pv, size_t size, VMStateField *field) +{ + int ret; + const VMStateDescription *vmsd = field->vmsd; + int version_id = field->version_id; + void *tmp = g_malloc(size); + + /* Writes the parent field which is at the start of the tmp */ + *(void **)tmp = pv; + ret = vmstate_load_state(f, vmsd, tmp, version_id); + g_free(tmp); + return ret; +} + +static int put_tmp(QEMUFile *f, void *pv, size_t size, VMStateField *field, + QJSON *vmdesc) +{ + const VMStateDescription *vmsd = field->vmsd; + void *tmp = g_malloc(size); + + /* Writes the parent field which is at the start of the tmp */ + *(void **)tmp = pv; + vmstate_save_state(f, vmsd, tmp, vmdesc); + g_free(tmp); + + return 0; +} + +const VMStateInfo vmstate_info_tmp = { + .name = "tmp", + .get = get_tmp, + .put = put_tmp, +}; + /* bitmaps (as defined by bitmap.h). Note that size here is the size * of the bitmap in bits. The on-the-wire format of a bitmap is 64 * bit words with the bits in big endian order. The in-memory format From 5c379d9031931fb1be52b96b25589abc7311f200 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 3 Feb 2017 16:06:50 +0000 Subject: [PATCH 13/14] tests/migration: Add test for VMSTATE_WITH_TMP Add a test for VMSTATE_WITH_TMP to tests/test-vmstate.c Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Message-Id: <20170203160651.19917-4-dgilbert@redhat.com> Signed-off-by: Dr. David Alan Gilbert --- tests/test-vmstate.c | 98 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 92 insertions(+), 6 deletions(-) diff --git a/tests/test-vmstate.c b/tests/test-vmstate.c index 9d87faf12b..d0dd390006 100644 --- a/tests/test-vmstate.c +++ b/tests/test-vmstate.c @@ -90,7 +90,7 @@ static void save_buffer(const uint8_t *buf, size_t buf_size) qemu_fclose(fsave); } -static void compare_vmstate(uint8_t *wire, size_t size) +static void compare_vmstate(const uint8_t *wire, size_t size) { QEMUFile *f = open_test_file(false); uint8_t result[size]; @@ -113,7 +113,7 @@ static void compare_vmstate(uint8_t *wire, size_t size) } static int load_vmstate_one(const VMStateDescription *desc, void *obj, - int version, uint8_t *wire, size_t size) + int version, const uint8_t *wire, size_t size) { QEMUFile *f; int ret; @@ -137,7 +137,7 @@ static int load_vmstate_one(const VMStateDescription *desc, void *obj, static int load_vmstate(const VMStateDescription *desc, void *obj, void *obj_clone, void (*obj_copy)(void *, void*), - int version, uint8_t *wire, size_t size) + int version, const uint8_t *wire, size_t size) { /* We test with zero size */ obj_copy(obj_clone, obj); @@ -289,7 +289,6 @@ static void test_simple_primitive(void) FIELD_EQUAL(i64_1); FIELD_EQUAL(i64_2); } -#undef FIELD_EQUAL typedef struct TestStruct { uint32_t a, b, c, e; @@ -474,7 +473,6 @@ static void test_load_skip(void) qemu_fclose(loading); } - typedef struct { int32_t i; } TestStructTriv; @@ -688,6 +686,94 @@ static void test_load_q(void) qemu_fclose(fload); } +typedef struct TmpTestStruct { + TestStruct *parent; + int64_t diff; +} TmpTestStruct; + +static void tmp_child_pre_save(void *opaque) +{ + struct TmpTestStruct *tts = opaque; + + tts->diff = tts->parent->b - tts->parent->a; +} + +static int tmp_child_post_load(void *opaque, int version_id) +{ + struct TmpTestStruct *tts = opaque; + + tts->parent->b = tts->parent->a + tts->diff; + + return 0; +} + +static const VMStateDescription vmstate_tmp_back_to_parent = { + .name = "test/tmp_child_parent", + .fields = (VMStateField[]) { + VMSTATE_UINT64(f, 
TestStruct), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_tmp_child = { + .name = "test/tmp_child", + .pre_save = tmp_child_pre_save, + .post_load = tmp_child_post_load, + .fields = (VMStateField[]) { + VMSTATE_INT64(diff, TmpTestStruct), + VMSTATE_STRUCT_POINTER(parent, TmpTestStruct, + vmstate_tmp_back_to_parent, TestStruct), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_with_tmp = { + .name = "test/with_tmp", + .version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(a, TestStruct), + VMSTATE_UINT64(d, TestStruct), + VMSTATE_WITH_TMP(TestStruct, TmpTestStruct, vmstate_tmp_child), + VMSTATE_END_OF_LIST() + } +}; + +static void obj_tmp_copy(void *target, void *source) +{ + memcpy(target, source, sizeof(TestStruct)); +} + +static void test_tmp_struct(void) +{ + TestStruct obj, obj_clone; + + uint8_t const wire_with_tmp[] = { + /* u32 a */ 0x00, 0x00, 0x00, 0x02, + /* u64 d */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + /* diff */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, + /* u64 f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, + QEMU_VM_EOF, /* just to ensure we won't get EOF reported prematurely */ + }; + + memset(&obj, 0, sizeof(obj)); + obj.a = 2; + obj.b = 4; + obj.d = 1; + obj.f = 8; + save_vmstate(&vmstate_with_tmp, &obj); + + compare_vmstate(wire_with_tmp, sizeof(wire_with_tmp)); + + memset(&obj, 0, sizeof(obj)); + SUCCESS(load_vmstate(&vmstate_with_tmp, &obj, &obj_clone, + obj_tmp_copy, 1, wire_with_tmp, + sizeof(wire_with_tmp))); + g_assert_cmpint(obj.a, ==, 2); /* From top level vmsd */ + g_assert_cmpint(obj.b, ==, 4); /* from the post_load */ + g_assert_cmpint(obj.d, ==, 1); /* From top level vmsd */ + g_assert_cmpint(obj.f, ==, 8); /* From the child->parent */ +} + int main(int argc, char **argv) { temp_fd = mkstemp(temp_file); @@ -708,7 +794,7 @@ int main(int argc, char **argv) test_arr_ptr_str_no0_load); g_test_add_func("/vmstate/qtailq/save/saveq", test_save_q); g_test_add_func("/vmstate/qtailq/load/loadq", test_load_q); - + g_test_add_func("/vmstate/tmp_struct", test_tmp_struct); g_test_run(); close(temp_fd); From 982b78c5e37864c06fd7b5f156d80bf02628a855 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 3 Feb 2017 16:06:51 +0000 Subject: [PATCH 14/14] virtio/migration: Migrate virtio-net to VMState Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Michael S. Tsirkin Message-Id: <20170203160651.19917-5-dgilbert@redhat.com> Signed-off-by: Dr. 
David Alan Gilbert Merge fix against Halil's removal of the '_start' field in VMSTATE_VBUFFER_MULTIPLY --- hw/net/virtio-net.c | 316 ++++++++++++++++++++++----------- include/hw/virtio/virtio-net.h | 4 +- 2 files changed, 213 insertions(+), 107 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 7b3ad4a9f0..354a19eab8 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1557,119 +1557,22 @@ static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue) virtio_net_set_queues(n); } -static void virtio_net_save_device(VirtIODevice *vdev, QEMUFile *f) +static int virtio_net_post_load_device(void *opaque, int version_id) { - VirtIONet *n = VIRTIO_NET(vdev); - int i; - - qemu_put_buffer(f, n->mac, ETH_ALEN); - qemu_put_be32(f, n->vqs[0].tx_waiting); - qemu_put_be32(f, n->mergeable_rx_bufs); - qemu_put_be16(f, n->status); - qemu_put_byte(f, n->promisc); - qemu_put_byte(f, n->allmulti); - qemu_put_be32(f, n->mac_table.in_use); - qemu_put_buffer(f, n->mac_table.macs, n->mac_table.in_use * ETH_ALEN); - qemu_put_buffer(f, (uint8_t *)n->vlans, MAX_VLAN >> 3); - qemu_put_be32(f, n->has_vnet_hdr); - qemu_put_byte(f, n->mac_table.multi_overflow); - qemu_put_byte(f, n->mac_table.uni_overflow); - qemu_put_byte(f, n->alluni); - qemu_put_byte(f, n->nomulti); - qemu_put_byte(f, n->nouni); - qemu_put_byte(f, n->nobcast); - qemu_put_byte(f, n->has_ufo); - if (n->max_queues > 1) { - qemu_put_be16(f, n->max_queues); - qemu_put_be16(f, n->curr_queues); - for (i = 1; i < n->curr_queues; i++) { - qemu_put_be32(f, n->vqs[i].tx_waiting); - } - } - - if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) { - qemu_put_be64(f, n->curr_guest_offloads); - } -} - -static int virtio_net_load_device(VirtIODevice *vdev, QEMUFile *f, - int version_id) -{ - VirtIONet *n = VIRTIO_NET(vdev); + VirtIONet *n = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(n); int i, link_down; - qemu_get_buffer(f, n->mac, ETH_ALEN); - n->vqs[0].tx_waiting = qemu_get_be32(f); - - virtio_net_set_mrg_rx_bufs(n, qemu_get_be32(f), + virtio_net_set_mrg_rx_bufs(n, n->mergeable_rx_bufs, virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)); - n->status = qemu_get_be16(f); - - n->promisc = qemu_get_byte(f); - n->allmulti = qemu_get_byte(f); - - n->mac_table.in_use = qemu_get_be32(f); /* MAC_TABLE_ENTRIES may be different from the saved image */ - if (n->mac_table.in_use <= MAC_TABLE_ENTRIES) { - qemu_get_buffer(f, n->mac_table.macs, - n->mac_table.in_use * ETH_ALEN); - } else { - int64_t i; - - /* Overflow detected - can happen if source has a larger MAC table. - * We simply set overflow flag so there's no need to maintain the - * table of addresses, discard them all. - * Note: 64 bit math to avoid integer overflow. 
- */ - for (i = 0; i < (int64_t)n->mac_table.in_use * ETH_ALEN; ++i) { - qemu_get_byte(f); - } - n->mac_table.multi_overflow = n->mac_table.uni_overflow = 1; + if (n->mac_table.in_use > MAC_TABLE_ENTRIES) { n->mac_table.in_use = 0; } - - qemu_get_buffer(f, (uint8_t *)n->vlans, MAX_VLAN >> 3); - if (qemu_get_be32(f) && !peer_has_vnet_hdr(n)) { - error_report("virtio-net: saved image requires vnet_hdr=on"); - return -1; - } - - n->mac_table.multi_overflow = qemu_get_byte(f); - n->mac_table.uni_overflow = qemu_get_byte(f); - - n->alluni = qemu_get_byte(f); - n->nomulti = qemu_get_byte(f); - n->nouni = qemu_get_byte(f); - n->nobcast = qemu_get_byte(f); - - if (qemu_get_byte(f) && !peer_has_ufo(n)) { - error_report("virtio-net: saved image requires TUN_F_UFO support"); - return -1; - } - - if (n->max_queues > 1) { - if (n->max_queues != qemu_get_be16(f)) { - error_report("virtio-net: different max_queues "); - return -1; - } - - n->curr_queues = qemu_get_be16(f); - if (n->curr_queues > n->max_queues) { - error_report("virtio-net: curr_queues %x > max_queues %x", - n->curr_queues, n->max_queues); - return -1; - } - for (i = 1; i < n->curr_queues; i++) { - n->vqs[i].tx_waiting = qemu_get_be32(f); - } - } - - if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) { - n->curr_guest_offloads = qemu_get_be64(f); - } else { + if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) { n->curr_guest_offloads = virtio_net_supported_guest_offloads(n); } @@ -1703,6 +1606,210 @@ static int virtio_net_load_device(VirtIODevice *vdev, QEMUFile *f, return 0; } +/* tx_waiting field of a VirtIONetQueue */ +static const VMStateDescription vmstate_virtio_net_queue_tx_waiting = { + .name = "virtio-net-queue-tx_waiting", + .fields = (VMStateField[]) { + VMSTATE_UINT32(tx_waiting, VirtIONetQueue), + VMSTATE_END_OF_LIST() + }, +}; + +static bool max_queues_gt_1(void *opaque, int version_id) +{ + return VIRTIO_NET(opaque)->max_queues > 1; +} + +static bool has_ctrl_guest_offloads(void *opaque, int version_id) +{ + return virtio_vdev_has_feature(VIRTIO_DEVICE(opaque), + VIRTIO_NET_F_CTRL_GUEST_OFFLOADS); +} + +static bool mac_table_fits(void *opaque, int version_id) +{ + return VIRTIO_NET(opaque)->mac_table.in_use <= MAC_TABLE_ENTRIES; +} + +static bool mac_table_doesnt_fit(void *opaque, int version_id) +{ + return !mac_table_fits(opaque, version_id); +} + +/* This temporary type is shared by all the WITH_TMP methods + * although only some fields are used by each. + */ +struct VirtIONetMigTmp { + VirtIONet *parent; + VirtIONetQueue *vqs_1; + uint16_t curr_queues_1; + uint8_t has_ufo; + uint32_t has_vnet_hdr; +}; + +/* The 2nd and subsequent tx_waiting flags are loaded later than + * the 1st entry in the queues and only if there's more than one + * entry. We use the tmp mechanism to calculate a temporary + * pointer and count and also validate the count. 
+ */ + +static void virtio_net_tx_waiting_pre_save(void *opaque) +{ + struct VirtIONetMigTmp *tmp = opaque; + + tmp->vqs_1 = tmp->parent->vqs + 1; + tmp->curr_queues_1 = tmp->parent->curr_queues - 1; + if (tmp->parent->curr_queues == 0) { + tmp->curr_queues_1 = 0; + } +} + +static int virtio_net_tx_waiting_pre_load(void *opaque) +{ + struct VirtIONetMigTmp *tmp = opaque; + + /* Reuse the pointer setup from save */ + virtio_net_tx_waiting_pre_save(opaque); + + if (tmp->parent->curr_queues > tmp->parent->max_queues) { + error_report("virtio-net: curr_queues %x > max_queues %x", + tmp->parent->curr_queues, tmp->parent->max_queues); + + return -EINVAL; + } + + return 0; /* all good */ +} + +static const VMStateDescription vmstate_virtio_net_tx_waiting = { + .name = "virtio-net-tx_waiting", + .pre_load = virtio_net_tx_waiting_pre_load, + .pre_save = virtio_net_tx_waiting_pre_save, + .fields = (VMStateField[]) { + VMSTATE_STRUCT_VARRAY_POINTER_UINT16(vqs_1, struct VirtIONetMigTmp, + curr_queues_1, + vmstate_virtio_net_queue_tx_waiting, + struct VirtIONetQueue), + VMSTATE_END_OF_LIST() + }, +}; + +/* the 'has_ufo' flag is just tested; if the incoming stream has the + * flag set we need to check that we have it + */ +static int virtio_net_ufo_post_load(void *opaque, int version_id) +{ + struct VirtIONetMigTmp *tmp = opaque; + + if (tmp->has_ufo && !peer_has_ufo(tmp->parent)) { + error_report("virtio-net: saved image requires TUN_F_UFO support"); + return -EINVAL; + } + + return 0; +} + +static void virtio_net_ufo_pre_save(void *opaque) +{ + struct VirtIONetMigTmp *tmp = opaque; + + tmp->has_ufo = tmp->parent->has_ufo; +} + +static const VMStateDescription vmstate_virtio_net_has_ufo = { + .name = "virtio-net-ufo", + .post_load = virtio_net_ufo_post_load, + .pre_save = virtio_net_ufo_pre_save, + .fields = (VMStateField[]) { + VMSTATE_UINT8(has_ufo, struct VirtIONetMigTmp), + VMSTATE_END_OF_LIST() + }, +}; + +/* the 'has_vnet_hdr' flag is just tested; if the incoming stream has the + * flag set we need to check that we have it + */ +static int virtio_net_vnet_post_load(void *opaque, int version_id) +{ + struct VirtIONetMigTmp *tmp = opaque; + + if (tmp->has_vnet_hdr && !peer_has_vnet_hdr(tmp->parent)) { + error_report("virtio-net: saved image requires vnet_hdr=on"); + return -EINVAL; + } + + return 0; +} + +static void virtio_net_vnet_pre_save(void *opaque) +{ + struct VirtIONetMigTmp *tmp = opaque; + + tmp->has_vnet_hdr = tmp->parent->has_vnet_hdr; +} + +static const VMStateDescription vmstate_virtio_net_has_vnet = { + .name = "virtio-net-vnet", + .post_load = virtio_net_vnet_post_load, + .pre_save = virtio_net_vnet_pre_save, + .fields = (VMStateField[]) { + VMSTATE_UINT32(has_vnet_hdr, struct VirtIONetMigTmp), + VMSTATE_END_OF_LIST() + }, +}; + +static const VMStateDescription vmstate_virtio_net_device = { + .name = "virtio-net-device", + .version_id = VIRTIO_NET_VM_VERSION, + .minimum_version_id = VIRTIO_NET_VM_VERSION, + .post_load = virtio_net_post_load_device, + .fields = (VMStateField[]) { + VMSTATE_UINT8_ARRAY(mac, VirtIONet, ETH_ALEN), + VMSTATE_STRUCT_POINTER(vqs, VirtIONet, + vmstate_virtio_net_queue_tx_waiting, + VirtIONetQueue), + VMSTATE_UINT32(mergeable_rx_bufs, VirtIONet), + VMSTATE_UINT16(status, VirtIONet), + VMSTATE_UINT8(promisc, VirtIONet), + VMSTATE_UINT8(allmulti, VirtIONet), + VMSTATE_UINT32(mac_table.in_use, VirtIONet), + + /* Guarded pair: If it fits we load it, else we throw it away + * - can happen if source has a larger MAC table.; post-load + * sets flags in this case. 
+ */ + VMSTATE_VBUFFER_MULTIPLY(mac_table.macs, VirtIONet, + 0, mac_table_fits, mac_table.in_use, + ETH_ALEN), + VMSTATE_UNUSED_VARRAY_UINT32(VirtIONet, mac_table_doesnt_fit, 0, + mac_table.in_use, ETH_ALEN), + + /* Note: This is an array of uint32's that's always been saved as a + * buffer; hold onto your endiannesses; it's actually used as a bitmap + * but based on the uint. + */ + VMSTATE_BUFFER_POINTER_UNSAFE(vlans, VirtIONet, 0, MAX_VLAN >> 3), + VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp, + vmstate_virtio_net_has_vnet), + VMSTATE_UINT8(mac_table.multi_overflow, VirtIONet), + VMSTATE_UINT8(mac_table.uni_overflow, VirtIONet), + VMSTATE_UINT8(alluni, VirtIONet), + VMSTATE_UINT8(nomulti, VirtIONet), + VMSTATE_UINT8(nouni, VirtIONet), + VMSTATE_UINT8(nobcast, VirtIONet), + VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp, + vmstate_virtio_net_has_ufo), + VMSTATE_SINGLE_TEST(max_queues, VirtIONet, max_queues_gt_1, 0, + vmstate_info_uint16_equal, uint16_t), + VMSTATE_UINT16_TEST(curr_queues, VirtIONet, max_queues_gt_1), + VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp, + vmstate_virtio_net_tx_waiting), + VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet, + has_ctrl_guest_offloads), + VMSTATE_END_OF_LIST() + }, +}; + static NetClientInfo net_virtio_info = { .type = NET_CLIENT_DRIVER_NIC, .size = sizeof(NICState), @@ -1989,9 +2096,8 @@ static void virtio_net_class_init(ObjectClass *klass, void *data) vdc->set_status = virtio_net_set_status; vdc->guest_notifier_mask = virtio_net_guest_notifier_mask; vdc->guest_notifier_pending = virtio_net_guest_notifier_pending; - vdc->load = virtio_net_load_device; - vdc->save = virtio_net_save_device; vdc->legacy_features |= (0x1 << VIRTIO_NET_F_GSO); + vdc->vmsd = &vmstate_virtio_net_device; } static const TypeInfo virtio_net_info = { diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h index 8ea56a8f60..1eec9a2da3 100644 --- a/include/hw/virtio/virtio-net.h +++ b/include/hw/virtio/virtio-net.h @@ -47,7 +47,7 @@ typedef struct VirtIONetQueue { VirtQueue *tx_vq; QEMUTimer *tx_timer; QEMUBH *tx_bh; - int tx_waiting; + uint32_t tx_waiting; struct { VirtQueueElement *elem; } async_tx; @@ -68,7 +68,7 @@ typedef struct VirtIONet { size_t guest_hdr_len; uint32_t host_features; uint8_t has_ufo; - int mergeable_rx_bufs; + uint32_t mergeable_rx_bufs; uint8_t promisc; uint8_t allmulti; uint8_t alluni;
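[Editor's note] The VMSTATE_WITH_TMP entries used throughout the virtio-net conversion above can be hard to follow from the diff alone. Below is a minimal, hedged sketch of the pattern, modelled on the test-vmstate and VirtIONetMigTmp code in this series; DemoState, DemoTmp, vmstate_demo and the derived "diff" field are hypothetical names invented for illustration, not part of the patches. The macros, the pre_save/post_load hook signatures, and the convention that the temporary struct's first member is a pointer back to the real state are taken from the code shown above.

```c
#include "qemu/osdep.h"
#include "migration/vmstate.h"

typedef struct DemoState {
    uint32_t a;
    uint64_t b;
} DemoState;

/* Scratch struct that only exists while saving/loading; as in the
 * patches above, its first member is a pointer back to the real state
 * so the hooks can reach it.
 */
typedef struct DemoTmp {
    DemoState *parent;
    int64_t diff;       /* derived value that actually goes on the wire */
} DemoTmp;

static void demo_tmp_pre_save(void *opaque)
{
    DemoTmp *tmp = opaque;

    /* Compute the wire representation from the real state. */
    tmp->diff = (int64_t)tmp->parent->b - tmp->parent->a;
}

static int demo_tmp_post_load(void *opaque, int version_id)
{
    DemoTmp *tmp = opaque;

    /* Validate what was loaded, then rebuild the real state from it. */
    if (tmp->diff < 0) {
        return -EINVAL;
    }
    tmp->parent->b = tmp->parent->a + tmp->diff;
    return 0;
}

static const VMStateDescription vmstate_demo_tmp = {
    .name = "demo/tmp",
    .pre_save = demo_tmp_pre_save,
    .post_load = demo_tmp_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(diff, DemoTmp),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription vmstate_demo = {
    .name = "demo",
    .version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(a, DemoState),
        /* Migration core allocates a DemoTmp, points its parent field at
         * the DemoState being migrated, and runs vmstate_demo_tmp against
         * it - so derived or reshaped data can be sent without storing it
         * permanently in DemoState.
         */
        VMSTATE_WITH_TMP(DemoState, DemoTmp, vmstate_demo_tmp),
        VMSTATE_END_OF_LIST()
    }
};
```

This is the same shape used twice in the series: the test-vmstate patch sends a computed difference of two fields, and the virtio-net patch uses one shared VirtIONetMigTmp to send the tx_waiting array for queues beyond the first, the has_ufo byte, and the has_vnet_hdr word, each through its own small VMStateDescription.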