From f22f928ec929c425c5aa402b5cc4d6d1fa4be238 Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Thu, 22 Mar 2018 21:17:22 +0300 Subject: [PATCH 01/16] migration: introduce postcopy-blocktime capability Right now it can be used on the destination side to enable vCPU blocktime calculation for postcopy live migration. vCPU blocktime is the time from the moment a vCPU thread is put into interruptible sleep until the memory page is copied and the thread is woken up. Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Alexey Perevalov Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela Message-Id: <1521742647-25550-2-git-send-email-a.perevalov@samsung.com> Signed-off-by: Dr. David Alan Gilbert --- migration/migration.c | 9 +++++++++ migration/migration.h | 1 + qapi/migration.json | 6 +++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/migration/migration.c b/migration/migration.c index 52a5092add..bf2e8db9c7 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1541,6 +1541,15 @@ bool migrate_zero_blocks(void) return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS]; } +bool migrate_postcopy_blocktime(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME]; +} + bool migrate_use_compression(void) { MigrationState *s; diff --git a/migration/migration.h b/migration/migration.h index 8d2f320c48..46a50bc789 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -230,6 +230,7 @@ int migrate_compress_level(void); int migrate_compress_threads(void); int migrate_decompress_threads(void); bool migrate_use_events(void); +bool migrate_postcopy_blocktime(void); /* Sending on the return path - generic and then for each message type */ void migrate_send_rp_shut(MigrationIncomingState *mis, diff --git a/qapi/migration.json b/qapi/migration.json index 9d0bf82cf4..24bfc190ba 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -354,16 +354,20 @@ # # @x-multifd: Use more than one fd for migration (since 2.11) # +# # @dirty-bitmaps: If enabled, QEMU will migrate named dirty bitmaps. # (since 2.12) # +# @postcopy-blocktime: Calculate downtime for postcopy live migration +# (since 2.13) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks', 'compress', 'events', 'postcopy-ram', 'x-colo', 'release-ram', 'block', 'return-path', 'pause-before-switchover', 'x-multifd', - 'dirty-bitmaps' ] } + 'dirty-bitmaps', 'postcopy-blocktime' ] } ## # @MigrationCapabilityStatus: From 2a4c42f18c987496c2c48764d4785a9d6448874a Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Thu, 22 Mar 2018 21:17:23 +0300 Subject: [PATCH 02/16] migration: add postcopy blocktime ctx into MigrationIncomingState This patch adds a request to kernel space for UFFD_FEATURE_THREAD_ID, in case the kernel provides this feature. PostcopyBlocktimeContext is encapsulated inside postcopy-ram.c, due to it being a postcopy-only feature. It also defines the lifetime of a PostcopyBlocktimeContext instance. Information from the PostcopyBlocktimeContext instance will be requested well after postcopy migration ends, so the instance lives until QEMU exits, but the parts of it used only during calculation (vcpu_addr, page_fault_vcpu_time) are released when postcopy ends or fails. To enable postcopy blocktime calculation on the destination, the proper capability needs to be requested (a patch for the documentation is at the tail of the patch set).
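For context, the UFFD_FEATURE_THREAD_ID probe is a standard UFFDIO_API handshake. A minimal sketch of how a process can discover whether the running kernel supports that feature is shown below (illustrative only, not part of the patch: error handling is trimmed, and because UFFDIO_API may only be issued once per descriptor the probe uses a throwaway fd, much like QEMU's receive_ufd_features() does):

#include <fcntl.h>
#include <stdbool.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static bool kernel_has_uffd_thread_id(void)
{
    struct uffdio_api api = { .api = UFFD_API, .features = 0 };
    int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    bool ret = false;

    if (ufd == -1) {
        return false;           /* no userfaultfd support at all */
    }
    /* with .features = 0 the kernel reports every feature it supports */
    if (ioctl(ufd, UFFDIO_API, &api) == 0) {
#ifdef UFFD_FEATURE_THREAD_ID
        ret = !!(api.features & UFFD_FEATURE_THREAD_ID);
#endif
    }
    close(ufd);
    return ret;
}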
As an example, the following command enables that capability, assuming QEMU was started with the -chardev socket,id=charmonitor,path=/var/lib/migrate-vm-monitor.sock option to control it: [root@host]# printf "{\"execute\" : \"qmp_capabilities\"}\r\n \ {\"execute\": \"migrate-set-capabilities\" , \"arguments\": { \"capabilities\": [ { \"capability\": \"postcopy-blocktime\", \"state\": true } ] } }" | nc -U /var/lib/migrate-vm-monitor.sock Or simply with HMP: (qemu) migrate_set_capability postcopy-blocktime on Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Alexey Perevalov Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela Message-Id: <1521742647-25550-3-git-send-email-a.perevalov@samsung.com> Signed-off-by: Dr. David Alan Gilbert --- migration/migration.h | 8 ++++++ migration/postcopy-ram.c | 61 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/migration/migration.h b/migration/migration.h index 46a50bc789..6d9aaeb480 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -22,6 +22,8 @@ #include "hw/qdev.h" #include "io/channel.h" +struct PostcopyBlocktimeContext; + /* State for the incoming migration */ struct MigrationIncomingState { QEMUFile *from_src_file; @@ -65,6 +67,12 @@ struct MigrationIncomingState { /* The coroutine we should enter (back) after failover */ Coroutine *migration_incoming_co; QemuSemaphore colo_incoming_sem; + + /* + * PostcopyBlocktimeContext to keep information for postcopy + * live migration, to calculate vCPU block time + */ + struct PostcopyBlocktimeContext *blocktime_ctx; }; MigrationIncomingState *migration_incoming_get_current(void); diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 4a0b33b373..eddba05b57 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -90,6 +90,54 @@ int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp) #include #include +typedef struct PostcopyBlocktimeContext { + /* time when page fault initiated per vCPU */ + uint32_t *page_fault_vcpu_time; + /* page address per vCPU */ + uintptr_t *vcpu_addr; + uint32_t total_blocktime; + /* blocktime per vCPU */ + uint32_t *vcpu_blocktime; + /* point in time when last page fault was initiated */ + uint32_t last_begin; + /* number of vCPUs that are suspended */ + int smp_cpus_down; + uint64_t start_time; + + /* + * Handler for exit event, necessary for + * releasing whole blocktime_ctx + */ + Notifier exit_notifier; +} PostcopyBlocktimeContext; + +static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx) +{ + g_free(ctx->page_fault_vcpu_time); + g_free(ctx->vcpu_addr); + g_free(ctx->vcpu_blocktime); + g_free(ctx); +} + +static void migration_exit_cb(Notifier *n, void *data) +{ + PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext, + exit_notifier); + destroy_blocktime_context(ctx); +} + +static struct PostcopyBlocktimeContext *blocktime_context_new(void) +{ + PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1); + ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus); + ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus); + ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus); + + ctx->exit_notifier.notify = migration_exit_cb; + ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + qemu_add_exit_notifier(&ctx->exit_notifier); + return ctx; +} /** * receive_ufd_features: check userfault fd features, to request only supported * features in the future. @@ -182,6 +230,19 @@ static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis) } } +#ifdef UFFD_FEATURE_THREAD_ID + if
(migrate_postcopy_blocktime() && mis && + UFFD_FEATURE_THREAD_ID & supported_features) { + /* kernel supports that feature */ + /* don't create blocktime_context if it exists */ + if (!mis->blocktime_ctx) { + mis->blocktime_ctx = blocktime_context_new(); + } + + asked_features |= UFFD_FEATURE_THREAD_ID; + } +#endif + /* * request features, even if asked_features is 0, due to * kernel expects UFFD_API before UFFDIO_REGISTER, per From 575b0b332ea25986fd348c45a91e642bca912137 Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Thu, 22 Mar 2018 21:17:24 +0300 Subject: [PATCH 03/16] migration: calculate vCPU blocktime on dst side This patch provides blocktime calculation per vCPU, both as a summary and as an overlapped value for all vCPUs. This approach was suggested by Peter Xu as an improvement over the previous approach, where QEMU kept a tree with the faulted page address and a bitmask of CPUs in it. Now QEMU keeps an array with the faulted page address as value and the vCPU as index. That makes it possible to find the proper vCPU at UFFD_COPY time. It also keeps a list of blocktime per vCPU (which can be traced with page_fault_addr). Blocktime will not be calculated if the postcopy_blocktime field of MigrationIncomingState wasn't initialized. Signed-off-by: Alexey Perevalov Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela Message-Id: <1521742647-25550-4-git-send-email-a.perevalov@samsung.com> Signed-off-by: Dr. David Alan Gilbert --- migration/postcopy-ram.c | 151 ++++++++++++++++++++++++++++++++++++++- migration/trace-events | 5 +- 2 files changed, 154 insertions(+), 2 deletions(-) diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index eddba05b57..6b34a7148e 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -636,6 +636,148 @@ int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb, return 0; } +static int get_mem_fault_cpu_index(uint32_t pid) +{ + CPUState *cpu_iter; + + CPU_FOREACH(cpu_iter) { + if (cpu_iter->thread_id == pid) { + trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid); + return cpu_iter->cpu_index; + } + } + trace_get_mem_fault_cpu_index(-1, pid); + return -1; +} + +static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc) +{ + int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - + dc->start_time; + return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX; } + +/* + * This function is being called when a pagefault occurs. It + * tracks down vCPU blocking time.
+ * + * @addr: faulted host virtual address + * @ptid: faulted process thread id + * @rb: ramblock appropriate to addr + */ +static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid, + RAMBlock *rb) +{ + int cpu, already_received; + MigrationIncomingState *mis = migration_incoming_get_current(); + PostcopyBlocktimeContext *dc = mis->blocktime_ctx; + uint32_t low_time_offset; + + if (!dc || ptid == 0) { + return; + } + cpu = get_mem_fault_cpu_index(ptid); + if (cpu < 0) { + return; + } + + low_time_offset = get_low_time_offset(dc); + if (dc->vcpu_addr[cpu] == 0) { + atomic_inc(&dc->smp_cpus_down); + } + + atomic_xchg(&dc->last_begin, low_time_offset); + atomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset); + atomic_xchg(&dc->vcpu_addr[cpu], addr); + + /* check it here, not at the beginning of the function, * because the check could occur earlier than bitmap_set in * qemu_ufd_copy_ioctl */ + already_received = ramblock_recv_bitmap_test(rb, (void *)addr); + if (already_received) { + atomic_xchg(&dc->vcpu_addr[cpu], 0); + atomic_xchg(&dc->page_fault_vcpu_time[cpu], 0); + atomic_dec(&dc->smp_cpus_down); + } + trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu], + cpu, already_received); +} + +/* + * This function just provides the calculated blocktime per vCPU and traces it. + * Total blocktime is calculated in mark_postcopy_blocktime_end. + * + * + * Assume we have 3 CPUs + * + * S1 E1 S1 E1 + * -----***********------------xxx***************------------------------> CPU1 + * + * S2 E2 + * ------------****************xxx---------------------------------------> CPU2 + * + * S3 E3 + * ------------------------****xxx********-------------------------------> CPU3 + * + * We have sequence S1,S2,E1,S3,S1,E2,E3,E1 + * S2,E1 - doesn't match the condition, because the sequence S1,S2,E1 doesn't include CPU3 + * S3,S1,E2 - sequence includes all CPUs, in this case overlap will be S1,E2 - + * it's a part of total blocktime.
+ * S1 - here is last_begin + * The legend of the picture is as follows: + * * - means blocktime per vCPU + * x - means overlapped blocktime (total blocktime) + * + * @addr: host virtual address + */ +static void mark_postcopy_blocktime_end(uintptr_t addr) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + PostcopyBlocktimeContext *dc = mis->blocktime_ctx; + int i, affected_cpu = 0; + bool vcpu_total_blocktime = false; + uint32_t read_vcpu_time, low_time_offset; + + if (!dc) { + return; + } + + low_time_offset = get_low_time_offset(dc); + /* look up the cpu in order to clear it; + * this algorithm looks straightforward, but it's not + * optimal - a more optimal algorithm would keep a tree or hash + * where the key is the address and the value is a list of */ + for (i = 0; i < smp_cpus; i++) { + uint32_t vcpu_blocktime = 0; + + read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0); + if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr || + read_vcpu_time == 0) { + continue; + } + atomic_xchg(&dc->vcpu_addr[i], 0); + vcpu_blocktime = low_time_offset - read_vcpu_time; + affected_cpu += 1; + /* we need to know whether this mark_postcopy_blocktime_end was due to + * a faulted page; the other possible case is a prefetched + * page, and in that case we shouldn't be here */ + if (!vcpu_total_blocktime && + atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) { + vcpu_total_blocktime = true; + } + /* continue the loop, because one page could affect several vCPUs */ + dc->vcpu_blocktime[i] += vcpu_blocktime; + } + + atomic_sub(&dc->smp_cpus_down, affected_cpu); + if (vcpu_total_blocktime) { + dc->total_blocktime += low_time_offset - atomic_fetch_add( + &dc->last_begin, 0); + } + trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime, + affected_cpu); +} + /* * Handle faults detected by the USERFAULT markings */ @@ -742,7 +884,12 @@ static void *postcopy_ram_fault_thread(void *opaque) rb_offset &= ~(qemu_ram_pagesize(rb) - 1); trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address, qemu_ram_get_idstr(rb), - rb_offset); + rb_offset, + msg.arg.pagefault.feat.ptid); + mark_postcopy_blocktime_begin( + (uintptr_t)(msg.arg.pagefault.address), + msg.arg.pagefault.feat.ptid, rb); + /* * Send the request to the source - we want to request one * of our host page sizes (which is >= TPS) @@ -890,6 +1037,8 @@ static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr, if (!ret) { ramblock_recv_bitmap_set_range(rb, host_addr, pagesize / qemu_target_page_size()); + mark_postcopy_blocktime_end((uintptr_t)host_addr); + } return ret; } diff --git a/migration/trace-events b/migration/trace-events index a180d7b008..368bc4b663 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -115,6 +115,8 @@ process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d" process_incoming_migration_co_postcopy_end_main(void) "" migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s" migration_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname, void *err) "ioc=%p ioctype=%s hostname=%s err=%p" +mark_postcopy_blocktime_begin(uint64_t addr, void *dd, uint32_t time, int cpu, int received) "addr: 0x%" PRIx64 ", dd: %p, time: %u, cpu: %d, already_received: %d" +mark_postcopy_blocktime_end(uint64_t addr, void *dd, uint32_t time, int affected_cpu) "addr: 0x%" PRIx64 ", dd: %p, time: %u, affected_cpu: %d" # migration/rdma.c qemu_rdma_accept_incoming_migration(void) "" @@ -193,7 +195,7 @@ postcopy_ram_fault_thread_exit(void) "" postcopy_ram_fault_thread_fds_core(int
baseufd, int quitfd) "ufd: %d quitfd: %d" postcopy_ram_fault_thread_fds_extra(size_t index, const char *name, int fd) "%zd/%s: %d" postcopy_ram_fault_thread_quit(void) "" -postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=0x%" PRIx64 " rb=%s offset=0x%zx" +postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset, uint32_t pid) "Request for HVA=0x%" PRIx64 " rb=%s offset=0x%zx pid=%u" postcopy_ram_incoming_cleanup_closeuf(void) "" postcopy_ram_incoming_cleanup_entry(void) "" postcopy_ram_incoming_cleanup_exit(void) "" @@ -206,6 +208,7 @@ save_xbzrle_page_skipping(void) "" save_xbzrle_page_overflow(void) "" ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations" ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64 +get_mem_fault_cpu_index(int cpu, uint32_t pid) "cpu: %d, pid: %u" # migration/exec.c migration_exec_outgoing(const char *cmd) "cmd=%s" From 9ed01779e8984b71cf62e4732de8d05ff091df36 Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Thu, 22 Mar 2018 21:17:25 +0300 Subject: [PATCH 04/16] migration: postcopy_blocktime documentation Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Alexey Perevalov Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela Message-Id: <1521742647-25550-5-git-send-email-a.perevalov@samsung.com> Signed-off-by: Dr. David Alan Gilbert --- docs/devel/migration.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/devel/migration.rst b/docs/devel/migration.rst index e32b087f6e..9342a8af06 100644 --- a/docs/devel/migration.rst +++ b/docs/devel/migration.rst @@ -401,6 +401,20 @@ will now cause the transition from precopy to postcopy. It can be issued immediately after migration is started or any time later on. Issuing it after the end of a migration is harmless. +Blocktime is a postcopy live migration metric, intended to show how +long the vCPU was in a state of interruptible sleep due to a pagefault. +The metric is calculated both for all vCPUs as an overlapped value, and +separately for each vCPU. These values are calculated on the destination +side. To enable postcopy blocktime calculation, enter the following +command on the destination monitor: + +``migrate_set_capability postcopy-blocktime on`` + +Postcopy blocktime can be retrieved with the query-migrate QMP command. +The postcopy-blocktime value shows the overlapped blocking +time for all vCPUs, and postcopy-vcpu-blocktime shows the list of blocking +times per vCPU. + .. note:: During the postcopy phase, the bandwidth limits set using ``migrate_set_speed`` is ignored (to avoid delaying requested pages that From 346f3dab04ee2391e521fd276883e63959eb9ea9 Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Thu, 22 Mar 2018 21:17:26 +0300 Subject: [PATCH 05/16] migration: add blocktime calculation into migration-test This patch just requests blocktime calculation, and checks it when the UFFD_FEATURE_THREAD_ID feature is set on the host. Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Alexey Perevalov Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela Message-Id: <1521742647-25550-6-git-send-email-a.perevalov@samsung.com> Signed-off-by: Dr.
David Alan Gilbert --- tests/migration-test.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/migration-test.c b/tests/migration-test.c index 422bf1afdf..dde7c464c3 100644 --- a/tests/migration-test.c +++ b/tests/migration-test.c @@ -26,6 +26,7 @@ const unsigned start_address = 1024 * 1024; const unsigned end_address = 100 * 1024 * 1024; bool got_stop; +static bool uffd_feature_thread_id; #if defined(__linux__) #include @@ -55,6 +56,7 @@ static bool ufd_version_check(void) g_test_message("Skipping test: UFFDIO_API failed"); return false; } + uffd_feature_thread_id = api_struct.features & UFFD_FEATURE_THREAD_ID; ioctl_mask = (__u64)1 << _UFFDIO_REGISTER | (__u64)1 << _UFFDIO_UNREGISTER; @@ -223,6 +225,16 @@ static uint64_t get_migration_pass(QTestState *who) return result; } +static void read_blocktime(QTestState *who) +{ + QDict *rsp, *rsp_return; + + rsp = wait_command(who, "{ 'execute': 'query-migrate' }"); + rsp_return = qdict_get_qdict(rsp, "return"); + g_assert(qdict_haskey(rsp_return, "postcopy-blocktime")); + QDECREF(rsp); +} + static void wait_for_migration_complete(QTestState *who) { while (true) { @@ -533,6 +545,7 @@ static void test_migrate(void) migrate_set_capability(from, "postcopy-ram", "true"); migrate_set_capability(to, "postcopy-ram", "true"); + migrate_set_capability(to, "postcopy-blocktime", "true"); /* We want to pick a speed slow enough that the test completes * quickly, but that it doesn't complete precopy even on a slow @@ -559,6 +572,9 @@ static void test_migrate(void) wait_for_serial("dest_serial"); wait_for_migration_complete(from); + if (uffd_feature_thread_id) { + read_blocktime(to); + } g_free(uri); test_migrate_end(from, to, true); From 65ace060455122a461cdc9302238b914084bcd42 Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Thu, 22 Mar 2018 21:17:27 +0300 Subject: [PATCH 06/16] migration: add postcopy total blocktime into query-migrate Postcopy total blocktime is available on the destination side only, but query-migrate was previously possible only on the source. This patch adds the ability to call query-migrate on the destination. To be able to see postcopy blocktime, the postcopy-blocktime capability needs to be requested. The query-migrate command will show the following sample result: {"return": {"postcopy-vcpu-blocktime": [115, 100], "status": "completed", "postcopy-blocktime": 100}} postcopy-vcpu-blocktime contains a list, where the first item corresponds to the first vCPU in QEMU. This patch has a drawback: it combines the states of incoming and outgoing migration, and the outgoing migration state will overwrite the incoming state. It would probably be better to separate query-migrate for incoming and outgoing migration, or to add a parameter indicating the type of migration. Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Alexey Perevalov Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela Message-Id: <1521742647-25550-7-git-send-email-a.perevalov@samsung.com> Signed-off-by: Dr.
David Alan Gilbert --- hmp.c | 15 +++++++++++ migration/migration.c | 42 +++++++++++++++++++++++++++--- migration/migration.h | 4 +++ migration/postcopy-ram.c | 56 ++++++++++++++++++++++++++++++++++++++++ migration/trace-events | 1 + qapi/migration.json | 11 +++++++- 6 files changed, 124 insertions(+), 5 deletions(-) diff --git a/hmp.c b/hmp.c index a25c7bd9a8..898e25f3e1 100644 --- a/hmp.c +++ b/hmp.c @@ -274,6 +274,21 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) info->cpu_throttle_percentage); } + if (info->has_postcopy_blocktime) { + monitor_printf(mon, "postcopy blocktime: %u\n", + info->postcopy_blocktime); + } + + if (info->has_postcopy_vcpu_blocktime) { + Visitor *v; + char *str; + v = string_output_visitor_new(false, &str); + visit_type_uint32List(v, NULL, &info->postcopy_vcpu_blocktime, NULL); + visit_complete(v, &str); + monitor_printf(mon, "postcopy vcpu blocktime: %s\n", str); + g_free(str); + visit_free(v); + } qapi_free_MigrationInfo(info); qapi_free_MigrationCapabilityStatusList(caps); } diff --git a/migration/migration.c b/migration/migration.c index bf2e8db9c7..0bdb28e144 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -630,14 +630,15 @@ static void populate_disk_info(MigrationInfo *info) } } -MigrationInfo *qmp_query_migrate(Error **errp) +static void fill_source_migration_info(MigrationInfo *info) { - MigrationInfo *info = g_malloc0(sizeof(*info)); MigrationState *s = migrate_get_current(); switch (s->state) { case MIGRATION_STATUS_NONE: /* no migration has happened ever */ + /* do not overwrite destination migration status */ + return; break; case MIGRATION_STATUS_SETUP: info->has_status = true; @@ -688,8 +689,6 @@ MigrationInfo *qmp_query_migrate(Error **errp) break; } info->status = s->state; - - return info; } /** @@ -753,6 +752,41 @@ static bool migrate_caps_check(bool *cap_list, return true; } +static void fill_destination_migration_info(MigrationInfo *info) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + + switch (mis->state) { + case MIGRATION_STATUS_NONE: + return; + break; + case MIGRATION_STATUS_SETUP: + case MIGRATION_STATUS_CANCELLING: + case MIGRATION_STATUS_CANCELLED: + case MIGRATION_STATUS_ACTIVE: + case MIGRATION_STATUS_POSTCOPY_ACTIVE: + case MIGRATION_STATUS_FAILED: + case MIGRATION_STATUS_COLO: + info->has_status = true; + break; + case MIGRATION_STATUS_COMPLETED: + info->has_status = true; + fill_destination_postcopy_migration_info(info); + break; + } + info->status = mis->state; +} + +MigrationInfo *qmp_query_migrate(Error **errp) +{ + MigrationInfo *info = g_malloc0(sizeof(*info)); + + fill_destination_migration_info(info); + fill_source_migration_info(info); + + return info; +} + void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, Error **errp) { diff --git a/migration/migration.h b/migration/migration.h index 6d9aaeb480..7c69598c54 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -77,6 +77,10 @@ struct MigrationIncomingState { MigrationIncomingState *migration_incoming_get_current(void); void migration_incoming_state_destroy(void); +/* + * Functions to work with blocktime context + */ +void fill_destination_postcopy_migration_info(MigrationInfo *info); #define TYPE_MIGRATION "migration" diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 6b34a7148e..8ceeaa2a93 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -139,6 +139,55 @@ static struct PostcopyBlocktimeContext *blocktime_context_new(void) return ctx; } 
+static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx) +{ + uint32List *list = NULL, *entry = NULL; + int i; + + for (i = smp_cpus - 1; i >= 0; i--) { + entry = g_new0(uint32List, 1); + entry->value = ctx->vcpu_blocktime[i]; + entry->next = list; + list = entry; + } + + return list; +} + +/* + * This function just populates MigrationInfo from postcopy's + * blocktime context. It will not populate MigrationInfo, + * unless postcopy-blocktime capability was set. + * + * @info: pointer to MigrationInfo to populate + */ +void fill_destination_postcopy_migration_info(MigrationInfo *info) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + PostcopyBlocktimeContext *bc = mis->blocktime_ctx; + + if (!bc) { + return; + } + + info->has_postcopy_blocktime = true; + info->postcopy_blocktime = bc->total_blocktime; + info->has_postcopy_vcpu_blocktime = true; + info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc); +} + +static uint32_t get_postcopy_total_blocktime(void) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + PostcopyBlocktimeContext *bc = mis->blocktime_ctx; + + if (!bc) { + return 0; + } + + return bc->total_blocktime; +} + /** * receive_ufd_features: check userfault fd features, to request only supported * features in the future. @@ -512,6 +561,9 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size); mis->postcopy_tmp_zero_page = NULL; } + trace_postcopy_ram_incoming_cleanup_blocktime( + get_postcopy_total_blocktime()); + trace_postcopy_ram_incoming_cleanup_exit(); return 0; } @@ -1157,6 +1209,10 @@ void *postcopy_get_tmp_page(MigrationIncomingState *mis) #else /* No target OS support, stubs just fail */ +void fill_destination_postcopy_migration_info(MigrationInfo *info) +{ +} + bool postcopy_ram_supported_by_host(MigrationIncomingState *mis) { error_report("%s: No OS support", __func__); diff --git a/migration/trace-events b/migration/trace-events index 368bc4b663..d6be74b7a7 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -200,6 +200,7 @@ postcopy_ram_incoming_cleanup_closeuf(void) "" postcopy_ram_incoming_cleanup_entry(void) "" postcopy_ram_incoming_cleanup_exit(void) "" postcopy_ram_incoming_cleanup_join(void) "" +postcopy_ram_incoming_cleanup_blocktime(uint64_t total) "total blocktime %" PRIu64 postcopy_request_shared_page(const char *sharer, const char *rb, uint64_t rb_offset) "for %s in %s offset 0x%"PRIx64 postcopy_request_shared_page_present(const char *sharer, const char *rb, uint64_t rb_offset) "%s already %s offset 0x%"PRIx64 postcopy_wake_shared(uint64_t client_addr, const char *rb) "at 0x%"PRIx64" in %s" diff --git a/qapi/migration.json b/qapi/migration.json index 24bfc190ba..f3974c6807 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -155,6 +155,13 @@ # @error-desc: the human readable error description string, when # @status is 'failed'. Clients should not attempt to parse the # error strings. 
(Since 2.7) +# +# @postcopy-blocktime: total time when all vCPUs were blocked during postcopy +# live migration (Since 2.13) +# +# @postcopy-vcpu-blocktime: list of the postcopy blocktime per vCPU (Since 2.13) +# + # # Since: 0.14.0 ## @@ -167,7 +174,9 @@ '*downtime': 'int', '*setup-time': 'int', '*cpu-throttle-percentage': 'int', - '*error-desc': 'str'} } + '*error-desc': 'str', + '*postcopy-blocktime' : 'uint32', + '*postcopy-vcpu-blocktime': ['uint32']} } ## # @query-migrate: From 263a289ae61c8344a417a95b0142650fdff3af56 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 30 Mar 2018 15:51:19 +0800 Subject: [PATCH 07/16] migration: stop compressing page in migration thread As compression is heavy work, do not do it in the migration thread; instead, we post the page out as a normal page. Reviewed-by: Wei Wang Reviewed-by: Peter Xu Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Xiao Guangrong Message-Id: <20180330075128.26919-2-xiaoguangrong@tencent.com> Signed-off-by: Dr. David Alan Gilbert --- migration/ram.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 0e90efa092..409c847a76 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1137,7 +1137,7 @@ static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss, int pages = -1; uint64_t bytes_xmit = 0; uint8_t *p; - int ret, blen; + int ret; RAMBlock *block = pss->block; ram_addr_t offset = pss->page << TARGET_PAGE_BITS; @@ -1167,23 +1167,23 @@ static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss, if (block != rs->last_sent_block) { flush_compressed_data(rs); pages = save_zero_page(rs, block, offset); - if (pages == -1) { - /* Make sure the first page is sent out before other pages */ - bytes_xmit = save_page_header(rs, rs->f, block, offset | - RAM_SAVE_FLAG_COMPRESS_PAGE); - blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE, - migrate_compress_level()); - if (blen > 0) { - ram_counters.transferred += bytes_xmit + blen; - ram_counters.normal++; - pages = 1; - } else { - qemu_file_set_error(rs->f, blen); - error_report("compressed data failed!"); - } - } if (pages > 0) { ram_release_pages(block->idstr, offset, pages); + } else { + /* + * Make sure the first page is sent out before other pages. + * + * we post it as normal page as compression will take much + * CPU resource. + */ + ram_counters.transferred += save_page_header(rs, rs->f, block, + offset | RAM_SAVE_FLAG_PAGE); + qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE, + migrate_release_ram() & + migration_in_postcopy()); + ram_counters.transferred += TARGET_PAGE_SIZE; + ram_counters.normal++; + pages = 1; } } else { pages = save_zero_page(rs, block, offset); From dcaf446ebda5d87e05eb41cdbafb7ae4a7cc4a62 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 30 Mar 2018 15:51:20 +0800 Subject: [PATCH 08/16] migration: stop compression to allocate and free memory frequently The current code uses compress2(), which manages memory internally; that causes large amounts of memory to be allocated and freed very frequently. Worse, frequently returning memory to the kernel will flush TLBs and trigger invalidation callbacks on mmu-notifiers which interact with the KVM MMU, dramatically reducing the performance of the VM. So we maintain the memory by ourselves and reuse it for each compression. Reviewed-by: Peter Xu Reviewed-by: Jiang Biao Signed-off-by: Xiao Guangrong Message-Id: <20180330075128.26919-3-xiaoguangrong@tencent.com> Signed-off-by: Dr.
David Alan Gilbert --- migration/qemu-file.c | 39 ++++++++++++++++++++++++++++++++------- migration/qemu-file.h | 6 ++++-- migration/ram.c | 41 ++++++++++++++++++++++++++++++++--------- 3 files changed, 68 insertions(+), 18 deletions(-) diff --git a/migration/qemu-file.c b/migration/qemu-file.c index bb63c779cc..bafe3a0c0d 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -658,8 +658,32 @@ uint64_t qemu_get_be64(QEMUFile *f) return v; } -/* Compress size bytes of data start at p with specific compression - * level and store the compressed data to the buffer of f. +/* return the size after compression, or negative value on error */ +static int qemu_compress_data(z_stream *stream, uint8_t *dest, size_t dest_len, + const uint8_t *source, size_t source_len) +{ + int err; + + err = deflateReset(stream); + if (err != Z_OK) { + return -1; + } + + stream->avail_in = source_len; + stream->next_in = (uint8_t *)source; + stream->avail_out = dest_len; + stream->next_out = dest; + + err = deflate(stream, Z_FINISH); + if (err != Z_STREAM_END) { + return -1; + } + + return stream->next_out - dest; +} + +/* Compress size bytes of data start at p and store the compressed + * data to the buffer of f. * * When f is not writable, return -1 if f has no space to save the * compressed data. @@ -667,9 +691,8 @@ uint64_t qemu_get_be64(QEMUFile *f) * do fflush first, if f still has no space to save the compressed * data, return -1. */ - -ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, - int level) +ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream, + const uint8_t *p, size_t size) { ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t); @@ -683,8 +706,10 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, return -1; } } - if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *)&blen, - (Bytef *)p, size, level) != Z_OK) { + + blen = qemu_compress_data(stream, f->buf + f->buf_index + sizeof(int32_t), + blen, p, size); + if (blen < 0) { error_report("Compress Failed!"); return 0; } diff --git a/migration/qemu-file.h b/migration/qemu-file.h index f4f356ab12..2ccfcfb2a8 100644 --- a/migration/qemu-file.h +++ b/migration/qemu-file.h @@ -25,6 +25,8 @@ #ifndef MIGRATION_QEMU_FILE_H #define MIGRATION_QEMU_FILE_H +#include + /* Read a chunk of data from a file at the given position. The pos argument * can be ignored if the file is only be used for streaming. The number of * bytes actually read should be returned. 
@@ -132,8 +134,8 @@ bool qemu_file_is_writable(QEMUFile *f); size_t qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t size, size_t offset); size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size); -ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, - int level); +ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream, + const uint8_t *p, size_t size); int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src); /* diff --git a/migration/ram.c b/migration/ram.c index 409c847a76..a21514a469 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -269,6 +269,7 @@ struct CompressParam { QemuCond cond; RAMBlock *block; ram_addr_t offset; + z_stream stream; }; typedef struct CompressParam CompressParam; @@ -299,7 +300,7 @@ static QemuThread *decompress_threads; static QemuMutex decomp_done_lock; static QemuCond decomp_done_cond; -static int do_compress_ram_page(QEMUFile *f, RAMBlock *block, +static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, ram_addr_t offset); static void *do_data_compress(void *opaque) @@ -316,7 +317,7 @@ static void *do_data_compress(void *opaque) param->block = NULL; qemu_mutex_unlock(¶m->mutex); - do_compress_ram_page(param->file, block, offset); + do_compress_ram_page(param->file, ¶m->stream, block, offset); qemu_mutex_lock(&comp_done_lock); param->done = true; @@ -357,10 +358,19 @@ static void compress_threads_save_cleanup(void) terminate_compression_threads(); thread_count = migrate_compress_threads(); for (i = 0; i < thread_count; i++) { + /* + * we use it as a indicator which shows if the thread is + * properly init'd or not + */ + if (!comp_param[i].file) { + break; + } qemu_thread_join(compress_threads + i); - qemu_fclose(comp_param[i].file); qemu_mutex_destroy(&comp_param[i].mutex); qemu_cond_destroy(&comp_param[i].cond); + deflateEnd(&comp_param[i].stream); + qemu_fclose(comp_param[i].file); + comp_param[i].file = NULL; } qemu_mutex_destroy(&comp_done_lock); qemu_cond_destroy(&comp_done_cond); @@ -370,12 +380,12 @@ static void compress_threads_save_cleanup(void) comp_param = NULL; } -static void compress_threads_save_setup(void) +static int compress_threads_save_setup(void) { int i, thread_count; if (!migrate_use_compression()) { - return; + return 0; } thread_count = migrate_compress_threads(); compress_threads = g_new0(QemuThread, thread_count); @@ -383,6 +393,11 @@ static void compress_threads_save_setup(void) qemu_cond_init(&comp_done_cond); qemu_mutex_init(&comp_done_lock); for (i = 0; i < thread_count; i++) { + if (deflateInit(&comp_param[i].stream, + migrate_compress_level()) != Z_OK) { + goto exit; + } + /* comp_param[i].file is just used as a dummy buffer to save data, * set its ops to empty. 
*/ @@ -395,6 +410,11 @@ do_data_compress, comp_param + i, QEMU_THREAD_JOINABLE); } + return 0; + +exit: + compress_threads_save_cleanup(); + return -1; } /* Multiple fd's */ @@ -1031,7 +1051,7 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage) return pages; } -static int do_compress_ram_page(QEMUFile *f, RAMBlock *block, +static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, ram_addr_t offset) { RAMState *rs = ram_state; @@ -1040,8 +1060,7 @@ static int do_compress_ram_page(QEMUFile *f, RAMBlock *block, bytes_sent = save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); - blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE, - migrate_compress_level()); + blen = qemu_put_compression_data(f, stream, p, TARGET_PAGE_SIZE); if (blen < 0) { bytes_sent = 0; qemu_file_set_error(migrate_get_current()->to_dst_file, blen); @@ -2214,9 +2233,14 @@ static int ram_save_setup(QEMUFile *f, void *opaque) RAMState **rsp = opaque; RAMBlock *block; + if (compress_threads_save_setup()) { + return -1; + } + /* migration has already setup the bitmap, reuse it. */ if (!migration_in_colo_state()) { if (ram_init_all(rsp) != 0) { + compress_threads_save_cleanup(); return -1; } } @@ -2236,7 +2260,6 @@ static int ram_save_setup(QEMUFile *f, void *opaque) } rcu_read_unlock(); - compress_threads_save_setup(); ram_control_before_iterate(f, RAM_CONTROL_SETUP); ram_control_after_iterate(f, RAM_CONTROL_SETUP); From 797ca154b4c68dbd8e93382f714388ab311f672d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 30 Mar 2018 15:51:21 +0800 Subject: [PATCH 09/16] migration: stop decompression to allocate and free memory frequently The current code uses uncompress(), which manages memory internally; that causes large amounts of memory to be allocated and freed very frequently. Worse, frequently returning memory to the kernel will flush TLBs. So we maintain the memory by ourselves and reuse it for each decompression. Reviewed-by: Peter Xu Reviewed-by: Jiang Biao Signed-off-by: Xiao Guangrong Message-Id: <20180330075128.26919-4-xiaoguangrong@tencent.com> Signed-off-by: Dr.
David Alan Gilbert --- migration/ram.c | 112 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 82 insertions(+), 30 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index a21514a469..fb24b2f32f 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -281,6 +281,7 @@ struct DecompressParam { void *des; uint8_t *compbuf; int len; + z_stream stream; }; typedef struct DecompressParam DecompressParam; @@ -2524,6 +2525,31 @@ void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) } } +/* return the size after decompression, or negative value on error */ +static int +qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, + const uint8_t *source, size_t source_len) +{ + int err; + + err = inflateReset(stream); + if (err != Z_OK) { + return -1; + } + + stream->avail_in = source_len; + stream->next_in = (uint8_t *)source; + stream->avail_out = dest_len; + stream->next_out = dest; + + err = inflate(stream, Z_NO_FLUSH); + if (err != Z_STREAM_END) { + return -1; + } + + return stream->total_out; +} + static void *do_data_decompress(void *opaque) { DecompressParam *param = opaque; @@ -2540,13 +2566,13 @@ static void *do_data_decompress(void *opaque) qemu_mutex_unlock(¶m->mutex); pagesize = TARGET_PAGE_SIZE; - /* uncompress() will return failed in some case, especially - * when the page is dirted when doing the compression, it's - * not a problem because the dirty page will be retransferred + /* qemu_uncompress_data() will return failed in some case, + * especially when the page is dirtied when doing the compression, + * it's not a problem because the dirty page will be retransferred * and uncompress() won't break the data in other pages. */ - uncompress((Bytef *)des, &pagesize, - (const Bytef *)param->compbuf, len); + qemu_uncompress_data(¶m->stream, des, pagesize, param->compbuf, + len); qemu_mutex_lock(&decomp_done_lock); param->done = true; @@ -2581,30 +2607,6 @@ static void wait_for_decompress_done(void) qemu_mutex_unlock(&decomp_done_lock); } -static void compress_threads_load_setup(void) -{ - int i, thread_count; - - if (!migrate_use_compression()) { - return; - } - thread_count = migrate_decompress_threads(); - decompress_threads = g_new0(QemuThread, thread_count); - decomp_param = g_new0(DecompressParam, thread_count); - qemu_mutex_init(&decomp_done_lock); - qemu_cond_init(&decomp_done_cond); - for (i = 0; i < thread_count; i++) { - qemu_mutex_init(&decomp_param[i].mutex); - qemu_cond_init(&decomp_param[i].cond); - decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); - decomp_param[i].done = true; - decomp_param[i].quit = false; - qemu_thread_create(decompress_threads + i, "decompress", - do_data_decompress, decomp_param + i, - QEMU_THREAD_JOINABLE); - } -} - static void compress_threads_load_cleanup(void) { int i, thread_count; @@ -2614,16 +2616,30 @@ static void compress_threads_load_cleanup(void) } thread_count = migrate_decompress_threads(); for (i = 0; i < thread_count; i++) { + /* + * we use it as a indicator which shows if the thread is + * properly init'd or not + */ + if (!decomp_param[i].compbuf) { + break; + } + qemu_mutex_lock(&decomp_param[i].mutex); decomp_param[i].quit = true; qemu_cond_signal(&decomp_param[i].cond); qemu_mutex_unlock(&decomp_param[i].mutex); } for (i = 0; i < thread_count; i++) { + if (!decomp_param[i].compbuf) { + break; + } + qemu_thread_join(decompress_threads + i); qemu_mutex_destroy(&decomp_param[i].mutex); qemu_cond_destroy(&decomp_param[i].cond); + inflateEnd(&decomp_param[i].stream); 
g_free(decomp_param[i].compbuf); + decomp_param[i].compbuf = NULL; } g_free(decompress_threads); g_free(decomp_param); @@ -2631,6 +2647,39 @@ static void compress_threads_load_cleanup(void) decomp_param = NULL; } +static int compress_threads_load_setup(void) +{ + int i, thread_count; + + if (!migrate_use_compression()) { + return 0; + } + + thread_count = migrate_decompress_threads(); + decompress_threads = g_new0(QemuThread, thread_count); + decomp_param = g_new0(DecompressParam, thread_count); + qemu_mutex_init(&decomp_done_lock); + qemu_cond_init(&decomp_done_cond); + for (i = 0; i < thread_count; i++) { + if (inflateInit(&decomp_param[i].stream) != Z_OK) { + goto exit; + } + + decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); + qemu_mutex_init(&decomp_param[i].mutex); + qemu_cond_init(&decomp_param[i].cond); + decomp_param[i].done = true; + decomp_param[i].quit = false; + qemu_thread_create(decompress_threads + i, "decompress", + do_data_decompress, decomp_param + i, + QEMU_THREAD_JOINABLE); + } + return 0; +exit: + compress_threads_load_cleanup(); + return -1; +} + static void decompress_data_with_multi_threads(QEMUFile *f, void *host, int len) { @@ -2670,8 +2719,11 @@ static void decompress_data_with_multi_threads(QEMUFile *f, */ static int ram_load_setup(QEMUFile *f, void *opaque) { + if (compress_threads_load_setup()) { + return -1; + } + xbzrle_load_setup(); - compress_threads_load_setup(); ramblock_recv_map_init(); return 0; } From 34ab9e9743aeaf265929d930747f101fa5c76fea Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 30 Mar 2018 15:51:22 +0800 Subject: [PATCH 10/16] migration: detect compression and decompression errors Currently the page being compressed is allowed to be updated by the VM on the source QEMU; correspondingly, the destination QEMU just ignores decompression errors. However, we then completely miss the chance to catch real errors and the VM is silently corrupted. To make the migration more robust, we copy the page to a buffer first to avoid it being written by the VM, then detect and handle both compression and decompression errors properly. Reviewed-by: Peter Xu Signed-off-by: Xiao Guangrong Message-Id: <20180330075128.26919-5-xiaoguangrong@tencent.com> Signed-off-by: Dr.
David Alan Gilbert --- migration/qemu-file.c | 4 ++-- migration/ram.c | 56 ++++++++++++++++++++++++++++++------------- 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/migration/qemu-file.c b/migration/qemu-file.c index bafe3a0c0d..0463f4c321 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -710,9 +710,9 @@ ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream, blen = qemu_compress_data(stream, f->buf + f->buf_index + sizeof(int32_t), blen, p, size); if (blen < 0) { - error_report("Compress Failed!"); - return 0; + return -1; } + qemu_put_be32(f, blen); if (f->ops->writev_buffer) { add_to_iovec(f, f->buf + f->buf_index, blen, false); diff --git a/migration/ram.c b/migration/ram.c index fb24b2f32f..72cb8dfb66 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -269,7 +269,10 @@ struct CompressParam { QemuCond cond; RAMBlock *block; ram_addr_t offset; + + /* internally used fields */ z_stream stream; + uint8_t *originbuf; }; typedef struct CompressParam CompressParam; @@ -296,13 +299,14 @@ static QemuCond comp_done_cond; /* The empty QEMUFileOps will be used by file in CompressParam */ static const QEMUFileOps empty_ops = { }; +static QEMUFile *decomp_file; static DecompressParam *decomp_param; static QemuThread *decompress_threads; static QemuMutex decomp_done_lock; static QemuCond decomp_done_cond; static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, - ram_addr_t offset); + ram_addr_t offset, uint8_t *source_buf); static void *do_data_compress(void *opaque) { @@ -318,7 +322,8 @@ static void *do_data_compress(void *opaque) param->block = NULL; qemu_mutex_unlock(&param->mutex); - do_compress_ram_page(param->file, &param->stream, block, offset); + do_compress_ram_page(param->file, &param->stream, block, offset, + param->originbuf); qemu_mutex_lock(&comp_done_lock); param->done = true; @@ -370,6 +375,7 @@ static void compress_threads_save_cleanup(void) qemu_mutex_destroy(&comp_param[i].mutex); qemu_cond_destroy(&comp_param[i].cond); deflateEnd(&comp_param[i].stream); + g_free(comp_param[i].originbuf); qemu_fclose(comp_param[i].file); comp_param[i].file = NULL; } @@ -394,8 +400,14 @@ static int compress_threads_save_setup(void) qemu_cond_init(&comp_done_cond); qemu_mutex_init(&comp_done_lock); for (i = 0; i < thread_count; i++) { + comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE); + if (!comp_param[i].originbuf) { + goto exit; + } + if (deflateInit(&comp_param[i].stream, migrate_compress_level()) != Z_OK) { + g_free(comp_param[i].originbuf); goto exit; } @@ -1053,7 +1065,7 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage) } static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, - ram_addr_t offset) + ram_addr_t offset, uint8_t *source_buf) { RAMState *rs = ram_state; int bytes_sent, blen; @@ -1061,7 +1073,14 @@ static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, bytes_sent = save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); + + /* + * copy it to an internal buffer to avoid it being modified by the VM, + * so that we can catch errors during compression and + * decompression + */ + memcpy(source_buf, p, TARGET_PAGE_SIZE); + blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE); if (blen < 0) { bytes_sent = 0; qemu_file_set_error(migrate_get_current()->to_dst_file, blen); @@ -2555,7 +2574,7 @@ static void *do_data_decompress(void *opaque)
DecompressParam *param = opaque; unsigned long pagesize; uint8_t *des; - int len; + int len, ret; qemu_mutex_lock(&param->mutex); while (!param->quit) { @@ -2566,13 +2585,13 @@ static void *do_data_decompress(void *opaque) qemu_mutex_unlock(&param->mutex); pagesize = TARGET_PAGE_SIZE; - /* qemu_uncompress_data() will return failed in some case, - * especially when the page is dirtied when doing the compression, - * it's not a problem because the dirty page will be retransferred - * and uncompress() won't break the data in other pages. - */ - qemu_uncompress_data(&param->stream, des, pagesize, param->compbuf, - len); + + ret = qemu_uncompress_data(&param->stream, des, pagesize, + param->compbuf, len); + if (ret < 0) { + error_report("decompress data failed"); + qemu_file_set_error(decomp_file, ret); + } qemu_mutex_lock(&decomp_done_lock); param->done = true; @@ -2589,12 +2608,12 @@ static void *do_data_decompress(void *opaque) return NULL; } -static void wait_for_decompress_done(void) +static int wait_for_decompress_done(void) { int idx, thread_count; if (!migrate_use_compression()) { - return; + return 0; } thread_count = migrate_decompress_threads(); @@ -2605,6 +2624,7 @@ static void wait_for_decompress_done(void) } } qemu_mutex_unlock(&decomp_done_lock); + return qemu_file_get_error(decomp_file); } static void compress_threads_load_cleanup(void) @@ -2645,9 +2665,10 @@ static void compress_threads_load_cleanup(void) g_free(decomp_param); decompress_threads = NULL; decomp_param = NULL; + decomp_file = NULL; } -static int compress_threads_load_setup(void) +static int compress_threads_load_setup(QEMUFile *f) { int i, thread_count; @@ -2660,6 +2681,7 @@ static int compress_threads_load_setup(QEMUFile *f) decomp_param = g_new0(DecompressParam, thread_count); qemu_mutex_init(&decomp_done_lock); qemu_cond_init(&decomp_done_cond); + decomp_file = f; for (i = 0; i < thread_count; i++) { if (inflateInit(&decomp_param[i].stream) != Z_OK) { goto exit; @@ -2719,7 +2741,7 @@ static void decompress_data_with_multi_threads(QEMUFile *f, */ static int ram_load_setup(QEMUFile *f, void *opaque) { - if (compress_threads_load_setup()) { + if (compress_threads_load_setup(f)) { return -1; } @@ -3074,7 +3096,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) } } - wait_for_decompress_done(); + ret |= wait_for_decompress_done(); rcu_read_unlock(); trace_ram_load_complete(ret, seq_iter); return ret; From 059ff0fb29dd3a56ac2843676915efc279938c6b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 30 Mar 2018 15:51:23 +0800 Subject: [PATCH 11/16] migration: introduce control_save_page() Abstract the common function control_save_page() to clean up the code; no logic is changed. Reviewed-by: Peter Xu Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Xiao Guangrong Message-Id: <20180330075128.26919-6-xiaoguangrong@tencent.com> Signed-off-by: Dr. David Alan Gilbert --- migration/ram.c | 174 +++++++++++++++++++++++++----------------------- 1 file changed, 89 insertions(+), 85 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 72cb8dfb66..79c7958993 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -974,6 +974,44 @@ static void ram_release_pages(const char *rbname, uint64_t offset, int pages) ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS); } +/* + * @pages: the number of pages written by the control path, + * < 0 - error + * > 0 - number of pages written + * + * Return true if the page has been saved, otherwise false is returned.
+ */ +static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, + int *pages) +{ + uint64_t bytes_xmit = 0; + int ret; + + *pages = -1; + ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, + &bytes_xmit); + if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { + return false; + } + + if (bytes_xmit) { + ram_counters.transferred += bytes_xmit; + *pages = 1; + } + + if (ret == RAM_SAVE_CONTROL_DELAYED) { + return true; + } + + if (bytes_xmit > 0) { + ram_counters.normal++; + } else if (bytes_xmit == 0) { + ram_counters.duplicate++; + } + + return true; +} + /** * ram_save_page: send the given page to the stream * @@ -990,56 +1028,36 @@ static void ram_release_pages(const char *rbname, uint64_t offset, int pages) static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage) { int pages = -1; - uint64_t bytes_xmit; - ram_addr_t current_addr; uint8_t *p; - int ret; bool send_async = true; RAMBlock *block = pss->block; ram_addr_t offset = pss->page << TARGET_PAGE_BITS; + ram_addr_t current_addr = block->offset + offset; p = block->host + offset; trace_ram_save_page(block->idstr, (uint64_t)offset, p); - /* In doubt sent page as normal */ - bytes_xmit = 0; - ret = ram_control_save_page(rs->f, block->offset, - offset, TARGET_PAGE_SIZE, &bytes_xmit); - if (bytes_xmit) { - ram_counters.transferred += bytes_xmit; - pages = 1; + if (control_save_page(rs, block, offset, &pages)) { + return pages; } XBZRLE_cache_lock(); - - current_addr = block->offset + offset; - - if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { - if (ret != RAM_SAVE_CONTROL_DELAYED) { - if (bytes_xmit > 0) { - ram_counters.normal++; - } else if (bytes_xmit == 0) { - ram_counters.duplicate++; - } - } - } else { - pages = save_zero_page(rs, block, offset); - if (pages > 0) { - /* Must let xbzrle know, otherwise a previous (now 0'd) cached - * page would be stale + pages = save_zero_page(rs, block, offset); + if (pages > 0) { + /* Must let xbzrle know, otherwise a previous (now 0'd) cached + * page would be stale + */ + xbzrle_cache_zero_page(rs, current_addr); + ram_release_pages(block->idstr, offset, pages); + } else if (!rs->ram_bulk_stage && + !migration_in_postcopy() && migrate_use_xbzrle()) { + pages = save_xbzrle_page(rs, &p, current_addr, block, + offset, last_stage); + if (!last_stage) { + /* Can't send this cached data async, since the cache page + * might get updated before it gets to the wire */ - xbzrle_cache_zero_page(rs, current_addr); - ram_release_pages(block->idstr, offset, pages); - } else if (!rs->ram_bulk_stage && - !migration_in_postcopy() && migrate_use_xbzrle()) { - pages = save_xbzrle_page(rs, &p, current_addr, block, - offset, last_stage); - if (!last_stage) { - /* Can't send this cached data async, since the cache page - * might get updated before it gets to the wire - */ - send_async = false; - } + send_async = false; } } @@ -1174,63 +1192,49 @@ static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss, bool last_stage) { int pages = -1; - uint64_t bytes_xmit = 0; uint8_t *p; - int ret; RAMBlock *block = pss->block; ram_addr_t offset = pss->page << TARGET_PAGE_BITS; p = block->host + offset; - ret = ram_control_save_page(rs->f, block->offset, - offset, TARGET_PAGE_SIZE, &bytes_xmit); - if (bytes_xmit) { - ram_counters.transferred += bytes_xmit; - pages = 1; + if (control_save_page(rs, block, offset, &pages)) { + return pages; } - if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { - if (ret != RAM_SAVE_CONTROL_DELAYED) { - if (bytes_xmit > 0) { - 
ram_counters.normal++; - } else if (bytes_xmit == 0) { - ram_counters.duplicate++; - } + + /* When starting the process of a new block, the first page of + * the block should be sent out before other pages in the same + * block, and all the pages in last block should have been sent + * out, keeping this order is important, because the 'cont' flag + * is used to avoid resending the block name. + */ + if (block != rs->last_sent_block) { + flush_compressed_data(rs); + pages = save_zero_page(rs, block, offset); + if (pages > 0) { + ram_release_pages(block->idstr, offset, pages); + } else { + /* + * Make sure the first page is sent out before other pages. + * + * we post it as normal page as compression will take much + * CPU resource. + */ + ram_counters.transferred += save_page_header(rs, rs->f, block, + offset | RAM_SAVE_FLAG_PAGE); + qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE, + migrate_release_ram() & + migration_in_postcopy()); + ram_counters.transferred += TARGET_PAGE_SIZE; + ram_counters.normal++; + pages = 1; } } else { - /* When starting the process of a new block, the first page of - * the block should be sent out before other pages in the same - * block, and all the pages in last block should have been sent - * out, keeping this order is important, because the 'cont' flag - * is used to avoid resending the block name. - */ - if (block != rs->last_sent_block) { - flush_compressed_data(rs); - pages = save_zero_page(rs, block, offset); - if (pages > 0) { - ram_release_pages(block->idstr, offset, pages); - } else { - /* - * Make sure the first page is sent out before other pages. - * - * we post it as normal page as compression will take much - * CPU resource. - */ - ram_counters.transferred += save_page_header(rs, rs->f, block, - offset | RAM_SAVE_FLAG_PAGE); - qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE, - migrate_release_ram() & - migration_in_postcopy()); - ram_counters.transferred += TARGET_PAGE_SIZE; - ram_counters.normal++; - pages = 1; - } + pages = save_zero_page(rs, block, offset); + if (pages == -1) { + pages = compress_page_with_multi_thread(rs, block, offset); } else { - pages = save_zero_page(rs, block, offset); - if (pages == -1) { - pages = compress_page_with_multi_thread(rs, block, offset); - } else { - ram_release_pages(block->idstr, offset, pages); - } + ram_release_pages(block->idstr, offset, pages); } } From 1faa5665c0f1df2eff291454a3a85625a3bc93dd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 30 Mar 2018 15:51:24 +0800 Subject: [PATCH 12/16] migration: move some code to ram_save_host_page Move some code from ram_save_target_page() to ram_save_host_page() to make it be more readable for latter patches that dramatically clean ram_save_target_page() up Reviewed-by: Peter Xu Signed-off-by: Xiao Guangrong Message-Id: <20180330075128.26919-7-xiaoguangrong@tencent.com> Signed-off-by: Dr. 
---
 migration/ram.c | 43 +++++++++++++++++++------------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 79c7958993..c3628b020e 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1483,38 +1483,23 @@ err:
  * Returns the number of pages written
  *
  * @rs: current RAM state
- * @ms: current migration state
  * @pss: data about the page we want to send
  * @last_stage: if we are at the completion stage
  */
 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
                                 bool last_stage)
 {
-    int res = 0;
-
-    /* Check the pages is dirty and if it is send it */
-    if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
-        /*
-         * If xbzrle is on, stop using the data compression after first
-         * round of migration even if compression is enabled. In theory,
-         * xbzrle can do better than compression.
-         */
-        if (migrate_use_compression() &&
-            (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
-            res = ram_save_compressed_page(rs, pss, last_stage);
-        } else {
-            res = ram_save_page(rs, pss, last_stage);
-        }
-
-        if (res < 0) {
-            return res;
-        }
-        if (pss->block->unsentmap) {
-            clear_bit(pss->page, pss->block->unsentmap);
-        }
+    /*
+     * If xbzrle is on, stop using the data compression after first
+     * round of migration even if compression is enabled. In theory,
+     * xbzrle can do better than compression.
+     */
+    if (migrate_use_compression() &&
+        (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
+        return ram_save_compressed_page(rs, pss, last_stage);
     }
 
-    return res;
+    return ram_save_page(rs, pss, last_stage);
 }
 
 /**
@@ -1543,12 +1528,22 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                               qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
 
     do {
+        /* Check the pages is dirty and if it is send it */
+        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
+            pss->page++;
+            continue;
+        }
+
         tmppages = ram_save_target_page(rs, pss, last_stage);
         if (tmppages < 0) {
             return tmppages;
         }
 
         pages += tmppages;
+        if (pss->block->unsentmap) {
+            clear_bit(pss->page, pss->block->unsentmap);
+        }
+
         pss->page++;
     } while ((pss->page & (pagesize_bits - 1)) &&
              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));

From a8ec91f941c5f83123796331c09333d3557eb5fc Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Fri, 30 Mar 2018 15:51:25 +0800
Subject: [PATCH 13/16] migration: move calling control_save_page to the
 common place

The function is called by both ram_save_page and
ram_save_compressed_page, so move it to the common caller to clean up
the code.

Reviewed-by: Peter Xu
Signed-off-by: Xiao Guangrong
Message-Id: <20180330075128.26919-8-xiaoguangrong@tencent.com>
Signed-off-by: Dr. David Alan Gilbert
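The deduplication is again easiest to see in a toy model: rather than each backend probing the control path itself, the single common caller checks once and then dispatches. All names below are stand-ins invented for the sketch, not QEMU code:

    #include <stdbool.h>
    #include <stdio.h>

    static bool use_compression; /* stand-in for save_page_use_compression() */

    static bool control_path(int *res) { *res = -1; return false; } /* stub */
    static int save_plain(void)      { puts("plain page path");      return 1; }
    static int save_compressed(void) { puts("compressed page path"); return 1; }

    /* The common caller: one control-path check instead of one per backend. */
    static int sketch_save_target_page(void)
    {
        int res;

        if (control_path(&res)) {
            return res;   /* control path (e.g. RDMA) already handled it */
        }
        return use_compression ? save_compressed() : save_plain();
    }

    int main(void)
    {
        use_compression = false;
        sketch_save_target_page();   /* prints "plain page path" */
        use_compression = true;
        sketch_save_target_page();   /* prints "compressed page path" */
        return 0;
    }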
---
 migration/ram.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index c3628b020e..e0caf7182b 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1037,10 +1037,6 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
     p = block->host + offset;
     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 
-    if (control_save_page(rs, block, offset, &pages)) {
-        return pages;
-    }
-
     XBZRLE_cache_lock();
     pages = save_zero_page(rs, block, offset);
     if (pages > 0) {
@@ -1198,10 +1194,6 @@ static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
 
     p = block->host + offset;
 
-    if (control_save_page(rs, block, offset, &pages)) {
-        return pages;
-    }
-
     /* When starting the process of a new block, the first page of
      * the block should be sent out before other pages in the same
      * block, and all the pages in last block should have been sent
@@ -1489,6 +1481,14 @@ err:
 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
                                 bool last_stage)
 {
+    RAMBlock *block = pss->block;
+    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
+    int res;
+
+    if (control_save_page(rs, block, offset, &res)) {
+        return res;
+    }
+
     /*
      * If xbzrle is on, stop using the data compression after first
      * round of migration even if compression is enabled. In theory,

From d7400a3409982a52ac451cd3ca9caee9db670ca7 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Fri, 30 Mar 2018 15:51:26 +0800
Subject: [PATCH 14/16] migration: move calling save_zero_page to the common
 place

save_zero_page() is always our first approach, so move the call to the
common place before calling ram_save_compressed_page and
ram_save_page.

Reviewed-by: Peter Xu
Reviewed-by: Dr. David Alan Gilbert
Signed-off-by: Xiao Guangrong
Message-Id: <20180330075128.26919-9-xiaoguangrong@tencent.com>
Signed-off-by: Dr. David Alan Gilbert
---
 migration/ram.c | 105 +++++++++++++++++++++++++++---------------------
 1 file changed, 59 insertions(+), 46 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index e0caf7182b..97917542c5 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1038,15 +1038,8 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 
     XBZRLE_cache_lock();
-    pages = save_zero_page(rs, block, offset);
-    if (pages > 0) {
-        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
-         * page would be stale
-         */
-        xbzrle_cache_zero_page(rs, current_addr);
-        ram_release_pages(block->idstr, offset, pages);
-    } else if (!rs->ram_bulk_stage &&
-               !migration_in_postcopy() && migrate_use_xbzrle()) {
+    if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
+        migrate_use_xbzrle()) {
         pages = save_xbzrle_page(rs, &p, current_addr, block,
                                  offset, last_stage);
         if (!last_stage) {
@@ -1194,40 +1187,23 @@ static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
 
     p = block->host + offset;
 
-    /* When starting the process of a new block, the first page of
-     * the block should be sent out before other pages in the same
-     * block, and all the pages in last block should have been sent
-     * out, keeping this order is important, because the 'cont' flag
-     * is used to avoid resending the block name.
-     */
     if (block != rs->last_sent_block) {
-        flush_compressed_data(rs);
-        pages = save_zero_page(rs, block, offset);
-        if (pages > 0) {
-            ram_release_pages(block->idstr, offset, pages);
-        } else {
-            /*
-             * Make sure the first page is sent out before other pages.
-             *
-             * we post it as normal page as compression will take much
-             * CPU resource.
-             */
-            ram_counters.transferred += save_page_header(rs, rs->f, block,
-                                            offset | RAM_SAVE_FLAG_PAGE);
-            qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
-                                  migrate_release_ram() &
-                                  migration_in_postcopy());
-            ram_counters.transferred += TARGET_PAGE_SIZE;
-            ram_counters.normal++;
-            pages = 1;
-        }
+        /*
+         * Make sure the first page is sent out before other pages.
+         *
+         * we post it as normal page as compression will take much
+         * CPU resource.
+         */
+        ram_counters.transferred += save_page_header(rs, rs->f, block,
+                                        offset | RAM_SAVE_FLAG_PAGE);
+        qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
+                              migrate_release_ram() &
+                              migration_in_postcopy());
+        ram_counters.transferred += TARGET_PAGE_SIZE;
+        ram_counters.normal++;
+        pages = 1;
     } else {
-        pages = save_zero_page(rs, block, offset);
-        if (pages == -1) {
-            pages = compress_page_with_multi_thread(rs, block, offset);
-        } else {
-            ram_release_pages(block->idstr, offset, pages);
-        }
+        pages = compress_page_with_multi_thread(rs, block, offset);
     }
 
     return pages;
@@ -1469,6 +1445,24 @@ err:
     return -1;
 }
 
+static bool save_page_use_compression(RAMState *rs)
+{
+    if (!migrate_use_compression()) {
+        return false;
+    }
+
+    /*
+     * If xbzrle is on, stop using the data compression after first
+     * round of migration even if compression is enabled. In theory,
+     * xbzrle can do better than compression.
+     */
+    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
+        return true;
+    }
+
+    return false;
+}
+
 /**
  * ram_save_target_page: save one target page
  *
@@ -1490,12 +1484,31 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
     }
 
     /*
-     * If xbzrle is on, stop using the data compression after first
-     * round of migration even if compression is enabled. In theory,
-     * xbzrle can do better than compression.
+     * When starting the process of a new block, the first page of
+     * the block should be sent out before other pages in the same
+     * block, and all the pages in last block should have been sent
+     * out, keeping this order is important, because the 'cont' flag
+     * is used to avoid resending the block name.
      */
-    if (migrate_use_compression() &&
-        (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
+    if (block != rs->last_sent_block && save_page_use_compression(rs)) {
+        flush_compressed_data(rs);
+    }
+
+    res = save_zero_page(rs, block, offset);
+    if (res > 0) {
+        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
+         * page would be stale
+         */
+        if (!save_page_use_compression(rs)) {
+            XBZRLE_cache_lock();
+            xbzrle_cache_zero_page(rs, block->offset + offset);
+            XBZRLE_cache_unlock();
+        }
+        ram_release_pages(block->idstr, offset, res);
+        return res;
+    }
+
+    if (save_page_use_compression(rs)) {
         return ram_save_compressed_page(rs, pss, last_stage);
     }

From 65dacaa04fa7e6104cbcee9251c7845355769a10 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Fri, 30 Mar 2018 15:51:27 +0800
Subject: [PATCH 15/16] migration: introduce save_normal_page()

It sends the page directly to the stream, neither checking whether the
page is zero nor using xbzrle or compression.

Reviewed-by: Peter Xu
Reviewed-by: Dr. David Alan Gilbert
Signed-off-by: Xiao Guangrong
Message-Id: <20180330075128.26919-10-xiaoguangrong@tencent.com>
Signed-off-by: Dr. David Alan Gilbert
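A self-contained sketch of the helper's idea, with toy I/O and counters standing in for the QEMU stream API and ram_counters (PAGE_SIZE, put_buffer and the fixed header cost are invented for the example):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE 4096

    static uint64_t transferred;
    static uint64_t normal_pages;

    /* Stand-in for qemu_put_buffer()/qemu_put_buffer_async(). */
    static void put_buffer(const uint8_t *buf, size_t len, bool async)
    {
        (void)buf;
        printf("%s send of %zu bytes\n", async ? "async" : "sync", len);
    }

    /* Directly send one page, with no zero/xbzrle/compression checks;
     * returns the number of pages written, i.e. always 1. */
    static int sketch_save_normal_page(const uint8_t *page, bool async)
    {
        transferred += 8;   /* stand-in for the save_page_header() bytes */
        put_buffer(page, PAGE_SIZE, async);
        transferred += PAGE_SIZE;
        normal_pages++;
        return 1;
    }

    int main(void)
    {
        static uint8_t page[PAGE_SIZE];

        memset(page, 0xab, sizeof(page));
        sketch_save_normal_page(page, true);   /* e.g. first page of a block */
        sketch_save_normal_page(page, false);  /* e.g. xbzrle overflow */
        printf("transferred=%llu normal=%llu\n",
               (unsigned long long)transferred,
               (unsigned long long)normal_pages);
        return 0;
    }

With the sync and async variants behind one helper, the two open-coded copies in ram_save_page() and ram_save_compressed_page() collapse into single calls, as the diff below shows.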
---
 migration/ram.c | 50 +++++++++++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 20 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 97917542c5..2eb4c0bf49 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1012,6 +1012,34 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
     return true;
 }
 
+/*
+ * directly send the page to the stream
+ *
+ * Returns the number of pages written.
+ *
+ * @rs: current RAM state
+ * @block: block that contains the page we want to send
+ * @offset: offset inside the block for the page
+ * @buf: the page to be sent
+ * @async: send the page asynchronously
+ */
+static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
+                            uint8_t *buf, bool async)
+{
+    ram_counters.transferred += save_page_header(rs, rs->f, block,
+                                                 offset | RAM_SAVE_FLAG_PAGE);
+    if (async) {
+        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
+                              migrate_release_ram() &
+                              migration_in_postcopy());
+    } else {
+        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
+    }
+    ram_counters.transferred += TARGET_PAGE_SIZE;
+    ram_counters.normal++;
+    return 1;
+}
+
 /**
  * ram_save_page: send the given page to the stream
  *
@@ -1052,18 +1080,7 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
 
     /* XBZRLE overflow or normal page */
     if (pages == -1) {
-        ram_counters.transferred +=
-            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
-        if (send_async) {
-            qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
-                                  migrate_release_ram() &
-                                  migration_in_postcopy());
-        } else {
-            qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
-        }
-        ram_counters.transferred += TARGET_PAGE_SIZE;
-        pages = 1;
-        ram_counters.normal++;
+        pages = save_normal_page(rs, block, offset, p, send_async);
     }
 
     XBZRLE_cache_unlock();
@@ -1194,14 +1211,7 @@ static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
          * we post it as normal page as compression will take much
          * CPU resource.
          */
-        ram_counters.transferred += save_page_header(rs, rs->f, block,
-                                        offset | RAM_SAVE_FLAG_PAGE);
-        qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
-                              migrate_release_ram() &
-                              migration_in_postcopy());
-        ram_counters.transferred += TARGET_PAGE_SIZE;
-        ram_counters.normal++;
-        pages = 1;
+        pages = save_normal_page(rs, block, offset, p, true);
     } else {
         pages = compress_page_with_multi_thread(rs, block, offset);
     }

From da3f56cb2e767016d3f204837a77caf35b463f90 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Fri, 30 Mar 2018 15:51:28 +0800
Subject: [PATCH 16/16] migration: remove ram_save_compressed_page()

Now we can reuse the path in ram_save_page() to post the page out as a
normal page, so the only thing left in ram_save_compressed_page() is
the compression itself, which we can move out to the caller.

Reviewed-by: Peter Xu
Reviewed-by: Dr. David Alan Gilbert
Signed-off-by: Xiao Guangrong
Message-Id: <20180330075128.26919-11-xiaoguangrong@tencent.com>
Signed-off-by: Dr. David Alan Gilbert
---
 migration/ram.c | 45 ++++++++-------------------------------------
 1 file changed, 8 insertions(+), 37 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 2eb4c0bf49..912810c18e 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1184,41 +1184,6 @@ static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
     return pages;
 }
 
-/**
- * ram_save_compressed_page: compress the given page and send it to the stream
- *
- * Returns the number of pages written.
- *
- * @rs: current RAM state
- * @block: block that contains the page we want to send
- * @offset: offset inside the block for the page
- * @last_stage: if we are at the completion stage
- */
-static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
-                                    bool last_stage)
-{
-    int pages = -1;
-    uint8_t *p;
-    RAMBlock *block = pss->block;
-    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
-
-    p = block->host + offset;
-
-    if (block != rs->last_sent_block) {
-        /*
-         * Make sure the first page is sent out before other pages.
-         *
-         * we post it as normal page as compression will take much
-         * CPU resource.
-         */
-        pages = save_normal_page(rs, block, offset, p, true);
-    } else {
-        pages = compress_page_with_multi_thread(rs, block, offset);
-    }
-
-    return pages;
-}
-
 /**
  * find_dirty_block: find the next dirty page and update any state
  * associated with the search process.
@@ -1518,8 +1483,14 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
         return res;
     }
 
-    if (save_page_use_compression(rs)) {
-        return ram_save_compressed_page(rs, pss, last_stage);
+    /*
+     * Make sure the first page is sent out before other pages.
+     *
+     * we post it as normal page as compression will take much
+     * CPU resource.
+     */
+    if (block == rs->last_sent_block && save_page_use_compression(rs)) {
+        return compress_page_with_multi_thread(rs, block, offset);
     }
 
     return ram_save_page(rs, pss, last_stage);
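Taken together, the series leaves ram_save_target_page() as the single place that sequences every decision. Here is a compact, self-contained sketch of the resulting dispatch order; every function below is a hard-wired stand-in for the QEMU original, written only to show the ordering:

    #include <stdbool.h>
    #include <stdio.h>

    static bool control_save(int *res)      { *res = -1; return false; }
    static int  save_zero(void)             { return -1; } /* -1: not zero */
    static bool use_compression(void)       { return true; }
    static bool same_block_as_last(void)    { return true; }
    static void flush_compressed(void)      { puts("flush compressed data"); }
    static int  compress_multi_thread(void) { puts("compress"); return 1; }
    static int  save_page(void)             { puts("plain/xbzrle"); return 1; }

    static int sketch_save_target_page(void)
    {
        int res;

        /* 1. Control (e.g. RDMA) path first. */
        if (control_save(&res)) {
            return res;
        }
        /* 2. New block while compressing: flush pending compressed data
         *    so the block-name ('cont' flag) ordering stays correct. */
        if (!same_block_as_last() && use_compression()) {
            flush_compressed();
        }
        /* 3. Zero-page check, now common to every path. */
        res = save_zero();
        if (res > 0) {
            return res;
        }
        /* 4. Compress, but never for the first page of a block. */
        if (same_block_as_last() && use_compression()) {
            return compress_multi_thread();
        }
        /* 5. Otherwise the plain (or xbzrle) path. */
        return save_page();
    }

    int main(void)
    {
        return sketch_save_target_page() > 0 ? 0 : 1;
    }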