diff --git a/migration/migration.c b/migration/migration.c
index 5ff7ba9d5c..2d306582eb 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1855,6 +1855,15 @@ void migrate_set_error(MigrationState *s, const Error *error)
     }
 }
 
+static void migrate_error_free(MigrationState *s)
+{
+    QEMU_LOCK_GUARD(&s->error_mutex);
+    if (s->error) {
+        error_free(s->error);
+        s->error = NULL;
+    }
+}
+
 void migrate_fd_error(MigrationState *s, const Error *error)
 {
     trace_migrate_fd_error(error_get_pretty(error));
@@ -2818,12 +2827,12 @@ out:
          * Maybe there is something we can do: it looks like a
          * network down issue, and we pause for a recovery.
          */
+        qemu_fclose(rp);
+        ms->rp_state.from_dst_file = NULL;
+        rp = NULL;
         if (postcopy_pause_return_path_thread(ms)) {
             /* Reload rp, reset the rest */
-            if (rp != ms->rp_state.from_dst_file) {
-                qemu_fclose(rp);
-                rp = ms->rp_state.from_dst_file;
-            }
+            rp = ms->rp_state.from_dst_file;
             ms->rp_state.error = false;
             goto retry;
         }
@@ -3701,6 +3710,10 @@ static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
         while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
             qemu_sem_timedwait(&s->wait_unplug_sem, 250);
         }
+        if (qemu_savevm_state_guest_unplug_pending()) {
+            warn_report("migration: partially unplugged device on "
+                        "failure");
+        }
     }
 
     migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state);
@@ -3966,6 +3979,13 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
     int64_t rate_limit;
     bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
 
+    /*
+     * If there's a previous error, free it and prepare for another one.
+     * Meanwhile, if migration completes successfully, no stale error will
+     * be dumped when migrate_fd_cleanup() is called.
+     */
+    migrate_error_free(s);
+
     s->expected_downtime = s->parameters.downtime_limit;
     if (resume) {
         assert(s->cleanup_bh);
@@ -3975,7 +3995,18 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
     }
     if (error_in) {
         migrate_fd_error(s, error_in);
-        migrate_fd_cleanup(s);
+        if (resume) {
+            /*
+             * Don't do cleanup for resume if the channel is invalid; only
+             * dump the error. We wait for another channel connect from the
+             * user. The error_report still gives the HMP user a hint on what
+             * failed. It's normally done in migrate_fd_cleanup(), but call it
+             * here explicitly.
+             */
+            error_report_err(error_copy(s->error));
+        } else {
+            migrate_fd_cleanup(s);
+        }
         return;
     }
 
diff --git a/migration/ram.c b/migration/ram.c
index 88ff34f574..b5fc454b2f 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -795,8 +795,6 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 {
     bool ret;
 
-    QEMU_LOCK_GUARD(&rs->bitmap_mutex);
-
     /*
      * Clear dirty bitmap if needed. This _must_ be called before we
      * send any of the page in the chunk because we need to make sure
@@ -2834,6 +2832,14 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
         goto out;
     }
 
+    /*
+     * We'll hold this lock for a while, but that's okay for two reasons.
+     * First, the only other thread that may take it is the one calling
+     * qemu_guest_free_page_hint(), which should be rare; second, see
+     * MAX_WAIT below (if curious, see also commit 4508bd9ed8053ce), which
+     * guarantees that we'll release it on a regular basis.
+     */
+    qemu_mutex_lock(&rs->bitmap_mutex);
     WITH_RCU_READ_LOCK_GUARD() {
         if (ram_list.version != rs->last_version) {
             ram_state_reset(rs);
@@ -2893,6 +2899,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
             i++;
         }
     }
+    qemu_mutex_unlock(&rs->bitmap_mutex);
 
     /*
      * Must occur before EOS (or any QEMUFile operation)
@@ -3682,6 +3689,7 @@ void colo_flush_ram_cache(void)
     unsigned long offset = 0;
 
     memory_global_dirty_log_sync();
+    qemu_mutex_lock(&ram_state->bitmap_mutex);
     WITH_RCU_READ_LOCK_GUARD() {
         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
             ramblock_sync_dirty_bitmap(ram_state, block);
@@ -3710,6 +3718,7 @@ void colo_flush_ram_cache(void)
         }
     }
     trace_colo_flush_ram_cache_end();
+    qemu_mutex_unlock(&ram_state->bitmap_mutex);
 }
 
 /**
diff --git a/migration/rdma.c b/migration/rdma.c
index 38a099f7ee..5c2d113aa9 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1143,6 +1143,7 @@ static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
 
     for (i--; i >= 0; i--) {
         ibv_dereg_mr(local->block[i].mr);
+        local->block[i].mr = NULL;
         rdma->total_registrations--;
     }
 
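Note on the ram.c locking change: the patch moves bitmap_mutex out of the
per-page helper migration_bitmap_clear_dirty() and instead holds it across the
whole send loop in ram_save_iterate(). Below is a minimal standalone sketch of
that fine-grained vs. coarse-grained trade-off, using plain pthread mutexes
rather than QEMU's qemu_mutex API; all names (test_and_clear_dirty,
scan_dirty_coarse, ...) are hypothetical, not QEMU code.

#include <limits.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

static pthread_mutex_t bitmap_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Test-and-clear one bit; the caller must hold bitmap_mutex. */
static bool test_and_clear_dirty(unsigned long *bitmap, unsigned long page)
{
    unsigned long *word = &bitmap[page / BITS_PER_LONG];
    unsigned long mask = 1UL << (page % BITS_PER_LONG);
    bool was_set = *word & mask;

    *word &= ~mask;
    return was_set;
}

/* Old scheme: every page pays a lock/unlock round trip. */
static bool clear_dirty_fine(unsigned long *bitmap, unsigned long page)
{
    bool was_set;

    pthread_mutex_lock(&bitmap_mutex);
    was_set = test_and_clear_dirty(bitmap, page);
    pthread_mutex_unlock(&bitmap_mutex);
    return was_set;
}

/* New scheme: the caller holds the mutex across the whole scan. */
static unsigned long scan_dirty_coarse(unsigned long *bitmap,
                                       unsigned long npages)
{
    unsigned long cleared = 0;

    pthread_mutex_lock(&bitmap_mutex);
    for (unsigned long i = 0; i < npages; i++) {
        cleared += test_and_clear_dirty(bitmap, i);
    }
    pthread_mutex_unlock(&bitmap_mutex);
    return cleared;
}

int main(void)
{
    unsigned long bitmap[4] = { ~0UL, 0, ~0UL, 0 };

    printf("cleared %lu pages\n",
           scan_dirty_coarse(bitmap, 4 * BITS_PER_LONG));
    printf("page 0 dirty again? %d\n", clear_dirty_fine(bitmap, 0));
    return 0;
}

The longer hold time is acceptable for the reasons the in-patch comment gives:
the only contender (qemu_guest_free_page_hint()) is rare, and the MAX_WAIT
check inside the loop bounds how long the lock is held before it is released.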
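Note on migrate_error_free(): it relies on QEMU_LOCK_GUARD, which unlocks the
mutex automatically when the enclosing scope ends, so every return path is
covered. The following is a rough standalone analogue of that pattern built on
the GCC/Clang cleanup attribute and pthread mutexes, not QEMU's actual
QEMU_LOCK_GUARD implementation; the names (LOCK_GUARD, err_mutex, clear_error)
are hypothetical.

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t err_mutex = PTHREAD_MUTEX_INITIALIZER;
static char *err_string;   /* guarded by err_mutex */

static void unlock_cleanup(pthread_mutex_t **m)
{
    pthread_mutex_unlock(*m);
}

/* Scope-based guard: the mutex is released when lock_guard_ goes out of scope. */
#define LOCK_GUARD(m)                                                   \
    pthread_mutex_lock(m);                                              \
    __attribute__((cleanup(unlock_cleanup))) pthread_mutex_t *lock_guard_ = (m)

/* Analogous to migrate_error_free(): free the error and clear the pointer. */
static void clear_error(void)
{
    LOCK_GUARD(&err_mutex);
    if (err_string) {
        free(err_string);
        err_string = NULL;   /* don't leave a dangling pointer behind */
    }
}   /* mutex unlocked here, on every exit path */

int main(void)
{
    err_string = strdup("previous failure");
    clear_error();   /* frees and clears under the lock */
    clear_error();   /* safe to call again: err_string is now NULL */
    return 0;
}

Clearing the pointer after freeing is the same idea as the rdma.c hunk above,
where local->block[i].mr is set to NULL after ibv_dereg_mr() so a later
cleanup pass cannot deregister the same memory region twice.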