migration: allow fault thread to pause

Allow the fault thread to stop handling page faults temporarily. When a
network failure happens (and we expect a recovery afterwards), we should
not let the fault thread keep sending page requests to the source;
instead, it should halt until the connection is rebuilt.

When the destination main thread notices the failure, it kicks the fault
thread so that it switches to the paused state.

Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <20180502104740.12123-7-peterx@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
This commit is contained in:
Peter Xu 2018-05-02 18:47:22 +08:00 committed by Juan Quintela
parent 14b1742eaa
commit 3a7804c306
5 changed files with 57 additions and 4 deletions

View File

@ -160,6 +160,7 @@ MigrationIncomingState *migration_incoming_get_current(void)
qemu_mutex_init(&mis_current.rp_mutex);
qemu_event_init(&mis_current.main_thread_load_event, false);
qemu_sem_init(&mis_current.postcopy_pause_sem_dst, 0);
qemu_sem_init(&mis_current.postcopy_pause_sem_fault, 0);
init_dirty_bitmap_incoming_migration();

View File

@ -76,6 +76,7 @@ struct MigrationIncomingState {
/* notify PAUSED postcopy incoming migrations to try to continue */
QemuSemaphore postcopy_pause_sem_dst;
QemuSemaphore postcopy_pause_sem_fault;
};
MigrationIncomingState *migration_incoming_get_current(void);

View File

@ -830,6 +830,17 @@ static void mark_postcopy_blocktime_end(uintptr_t addr)
affected_cpu);
}
/*
 * Put the calling thread to sleep on mis->postcopy_pause_sem_fault.
 *
 * A trace event is emitted right before blocking and another one right
 * after the wait returns.
 *
 * Returns: always true once the semaphore wait has completed.
 */
static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fault_thread();

    /* Block here until the pause semaphore is posted. */
    qemu_sem_wait(&mis->postcopy_pause_sem_fault);

    trace_postcopy_pause_fault_thread_continued();

    return true;
}
/*
* Handle faults detected by the USERFAULT markings
*/
@ -880,6 +891,22 @@ static void *postcopy_ram_fault_thread(void *opaque)
break;
}
if (!mis->to_src_file) {
/*
* Possibly someone tells us that the return path is
* broken already using the event. We should hold until
* the channel is rebuilt.
*/
if (postcopy_pause_fault_thread(mis)) {
mis->last_rb = NULL;
/* Continue to read the userfaultfd */
} else {
error_report("%s: paused but don't allow to continue",
__func__);
break;
}
}
if (pfd[1].revents) {
uint64_t tmp64 = 0;
@ -942,18 +969,37 @@ static void *postcopy_ram_fault_thread(void *opaque)
(uintptr_t)(msg.arg.pagefault.address),
msg.arg.pagefault.feat.ptid, rb);
retry:
/*
* Send the request to the source - we want to request one
* of our host page sizes (which is >= TPS)
*/
if (rb != mis->last_rb) {
mis->last_rb = rb;
migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
rb_offset, qemu_ram_pagesize(rb));
ret = migrate_send_rp_req_pages(mis,
qemu_ram_get_idstr(rb),
rb_offset,
qemu_ram_pagesize(rb));
} else {
/* Save some space */
migrate_send_rp_req_pages(mis, NULL,
rb_offset, qemu_ram_pagesize(rb));
ret = migrate_send_rp_req_pages(mis,
NULL,
rb_offset,
qemu_ram_pagesize(rb));
}
if (ret) {
/* May be network failure, try to wait for recovery */
if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
/* We got reconnected somehow, try to continue */
mis->last_rb = NULL;
goto retry;
} else {
/* This is an unavoidable fault */
error_report("%s: migrate_send_rp_req_pages() get %d",
__func__, ret);
break;
}
}
}

View File

@ -2083,6 +2083,9 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis)
mis->to_src_file = NULL;
qemu_mutex_unlock(&mis->rp_mutex);
/* Notify the fault thread for the invalidated file handle */
postcopy_fault_thread_notify(mis);
error_report("Detected IO failure for postcopy. "
"Migration paused.");

View File

@ -101,6 +101,8 @@ open_return_path_on_source_continue(void) ""
postcopy_start(void) ""
postcopy_pause_return_path(void) ""
postcopy_pause_return_path_continued(void) ""
postcopy_pause_fault_thread(void) ""
postcopy_pause_fault_thread_continued(void) ""
postcopy_pause_continued(void) ""
postcopy_pause_incoming(void) ""
postcopy_pause_incoming_continued(void) ""