migration: allow dst vm pause on postcopy
When there is an IO error on the incoming channel (e.g., network down), instead of bailing out immediately, we allow the dst vm to switch to the new POSTCOPY_PAUSE state. Currently it is still simple - it waits on the new semaphore until someone pokes it for another attempt. One note is that here on the ram loading thread we cannot simply check for the POSTCOPY_ACTIVE state; instead we need to detect the more specific POSTCOPY_INCOMING_RUNNING state, to make sure we have already loaded all the device states. Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> Signed-off-by: Peter Xu <peterx@redhat.com> Message-Id: <20180502104740.12123-5-peterx@redhat.com> Signed-off-by: Juan Quintela <quintela@redhat.com>
This commit is contained in:
parent
b23c2ade25
commit
b411b844fb
@ -159,6 +159,7 @@ MigrationIncomingState *migration_incoming_get_current(void)
|
||||
sizeof(struct PostCopyFD));
|
||||
qemu_mutex_init(&mis_current.rp_mutex);
|
||||
qemu_event_init(&mis_current.main_thread_load_event, false);
|
||||
qemu_sem_init(&mis_current.postcopy_pause_sem_dst, 0);
|
||||
|
||||
init_dirty_bitmap_incoming_migration();
|
||||
|
||||
|
@ -73,6 +73,9 @@ struct MigrationIncomingState {
|
||||
* live migration, to calculate vCPU block time
|
||||
* */
|
||||
struct PostcopyBlocktimeContext *blocktime_ctx;
|
||||
|
||||
/* notify PAUSED postcopy incoming migrations to try to continue */
|
||||
QemuSemaphore postcopy_pause_sem_dst;
|
||||
};
|
||||
|
||||
MigrationIncomingState *migration_incoming_get_current(void);
|
||||
|
@ -1564,8 +1564,8 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
|
||||
*/
|
||||
static void *postcopy_ram_listen_thread(void *opaque)
|
||||
{
|
||||
QEMUFile *f = opaque;
|
||||
MigrationIncomingState *mis = migration_incoming_get_current();
|
||||
QEMUFile *f = mis->from_src_file;
|
||||
int load_res;
|
||||
|
||||
migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
|
||||
@ -1579,6 +1579,14 @@ static void *postcopy_ram_listen_thread(void *opaque)
|
||||
*/
|
||||
qemu_file_set_blocking(f, true);
|
||||
load_res = qemu_loadvm_state_main(f, mis);
|
||||
|
||||
/*
|
||||
* This is tricky, but, mis->from_src_file can change after it
|
||||
* returns, when postcopy recovery happened. In the future, we may
|
||||
* want a wrapper for the QEMUFile handle.
|
||||
*/
|
||||
f = mis->from_src_file;
|
||||
|
||||
/* And non-blocking again so we don't block in any cleanup */
|
||||
qemu_file_set_blocking(f, false);
|
||||
|
||||
@ -1668,7 +1676,7 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
|
||||
/* Start up the listening thread and wait for it to signal ready */
|
||||
qemu_sem_init(&mis->listen_thread_sem, 0);
|
||||
qemu_thread_create(&mis->listen_thread, "postcopy/listen",
|
||||
postcopy_ram_listen_thread, mis->from_src_file,
|
||||
postcopy_ram_listen_thread, NULL,
|
||||
QEMU_THREAD_DETACHED);
|
||||
qemu_sem_wait(&mis->listen_thread_sem);
|
||||
qemu_sem_destroy(&mis->listen_thread_sem);
|
||||
@ -2055,11 +2063,44 @@ void qemu_loadvm_state_cleanup(void)
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Pause the incoming (destination) side of a postcopy migration after an
 * I/O failure on the migration channel.
 *
 * The function moves mis->state from POSTCOPY_ACTIVE to POSTCOPY_PAUSED,
 * shuts down and closes both mis->from_src_file and mis->to_src_file
 * (leaving both pointers NULL), reports the failure, and then blocks on
 * mis->postcopy_pause_sem_dst until mis->state is no longer
 * POSTCOPY_PAUSED.
 *
 * It is called from qemu_loadvm_state_main() when loading failed with -EIO
 * while the postcopy state is POSTCOPY_INCOMING_RUNNING; on a true return
 * that caller re-reads mis->from_src_file and retries the load loop.
 *
 * Return true if we should continue the migration, or false.  As written,
 * the only return statement returns true.
 */
|
||||
static bool postcopy_pause_incoming(MigrationIncomingState *mis)
|
||||
{
|
||||
trace_postcopy_pause_incoming();
|
||||
|
||||
/* Switch the incoming migration state: POSTCOPY_ACTIVE -> POSTCOPY_PAUSED. */
migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
|
||||
MIGRATION_STATUS_POSTCOPY_PAUSED);
|
||||
|
||||
/*
 * Tear down the failed source->destination channel.  The pointer is
 * cleared after the close so that no stale handle is left in mis.
 */
assert(mis->from_src_file);
|
||||
qemu_file_shutdown(mis->from_src_file);
|
||||
qemu_fclose(mis->from_src_file);
|
||||
mis->from_src_file = NULL;
|
||||
|
||||
/*
 * Tear down the return path as well.  The shutdown is issued before
 * rp_mutex is taken, while the close and the pointer reset happen with
 * rp_mutex held.
 * NOTE(review): presumably the early shutdown is meant to kick out a
 * writer that is blocked on the channel while holding rp_mutex - confirm
 * against the users of rp_mutex.
 */
assert(mis->to_src_file);
|
||||
qemu_file_shutdown(mis->to_src_file);
|
||||
qemu_mutex_lock(&mis->rp_mutex);
|
||||
qemu_fclose(mis->to_src_file);
|
||||
mis->to_src_file = NULL;
|
||||
qemu_mutex_unlock(&mis->rp_mutex);
|
||||
|
||||
error_report("Detected IO failure for postcopy. "
|
||||
"Migration paused.");
|
||||
|
||||
/*
 * Sleep until the state has left POSTCOPY_PAUSED.  The state is
 * re-checked after every wakeup, so a semaphore post that arrives while
 * the state is still PAUSED just sends us back to waiting.
 * NOTE(review): the code that changes mis->state and posts
 * postcopy_pause_sem_dst is not visible here - verify it against the
 * recovery path.
 */
while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
|
||||
qemu_sem_wait(&mis->postcopy_pause_sem_dst);
|
||||
}
|
||||
|
||||
trace_postcopy_pause_incoming_continued();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
|
||||
{
|
||||
uint8_t section_type;
|
||||
int ret = 0;
|
||||
|
||||
retry:
|
||||
while (true) {
|
||||
section_type = qemu_get_byte(f);
|
||||
|
||||
@ -2104,6 +2145,24 @@ static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
|
||||
out:
|
||||
if (ret < 0) {
|
||||
qemu_file_set_error(f, ret);
|
||||
|
||||
/*
|
||||
* Detect whether it is:
|
||||
*
|
||||
* 1. postcopy running (after receiving all device data, which
|
||||
* must be in POSTCOPY_INCOMING_RUNNING state. Note that
|
||||
* POSTCOPY_INCOMING_LISTENING is still not enough, it's
|
||||
* still receiving device states).
|
||||
* 2. network failure (-EIO)
|
||||
*
|
||||
* If so, we try to wait for a recovery.
|
||||
*/
|
||||
if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
|
||||
ret == -EIO && postcopy_pause_incoming(mis)) {
|
||||
/* Reset f to point to the newly created channel */
|
||||
f = mis->from_src_file;
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -100,6 +100,8 @@ open_return_path_on_source(void) ""
|
||||
open_return_path_on_source_continue(void) ""
|
||||
postcopy_start(void) ""
|
||||
postcopy_pause_continued(void) ""
|
||||
postcopy_pause_incoming(void) ""
|
||||
postcopy_pause_incoming_continued(void) ""
|
||||
postcopy_start_set_run(void) ""
|
||||
source_return_path_thread_bad_end(void) ""
|
||||
source_return_path_thread_end(void) ""
|
||||
|
Loading…
Reference in New Issue
Block a user