migration: Postcopy preemption enablement

This patch enables postcopy-preempt feature. It contains two major changes to the migration logic: (1) Postcopy requests are now sent via a different socket from precopy background migration stream, so as to be isolated from very high page request delays. (2) For huge page enabled hosts: when there's postcopy requests, they can now intercept a partial sending of huge host pages on src QEMU. After this patch, we'll live migrate a VM with two channels for postcopy: (1) PRECOPY channel, which is the default channel that transfers background pages; and (2) POSTCOPY channel, which only transfers requested pages. There's no strict rule of which channel to use, e.g., if a requested page is already being transferred on precopy channel, then we will keep using the same precopy channel to transfer the page even if it's explicitly requested. In 99% of the cases we'll prioritize the channels so we send requested page via the postcopy channel as long as possible. On the source QEMU, when we found a postcopy request, we'll interrupt the PRECOPY channel sending process and quickly switch to the POSTCOPY channel. After we serviced all the high priority postcopy pages, we'll switch back to PRECOPY channel so that we'll continue to send the interrupted huge page again. There's no new thread introduced on src QEMU. On the destination QEMU, one new thread is introduced to receive page data from the postcopy specific socket (done in the preparation patch). This patch has a side effect: after sending postcopy pages, previously we'll assume the guest will access follow up pages so we'll keep sending from there. Now it's changed. Instead of going on with a postcopy requested page, we'll go back and continue sending the precopy huge page (which can be intercepted by a postcopy request so the huge page can be sent partially before). Whether that's a problem is debatable, because "assuming the guest will continue to access the next page" may not really suite when huge pages are used, especially if the huge page is large (e.g. 1GB pages). So that locality hint is much meaningless if huge pages are used. Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> Signed-off-by: Peter Xu <peterx@redhat.com> Message-Id: <20220707185504.27203-1-peterx@redhat.com> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2022-07-07 14:55:04 -04:00 · 2022-07-07 14:55:04 -04:00 · c01b16edf6
commit c01b16edf6
parent 36f62f11e4
4 changed files with 253 additions and 9 deletions
--- a/migration/migration.c
+++ b/migration/migration.c
@ -3190,6 +3190,8 @@ static int postcopy_start(MigrationState *ms)
                              MIGRATION_STATUS_FAILED);
    }

+    trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
+
    return ret;

 fail_closefb:
--- a/migration/migration.h
+++ b/migration/migration.h
@ -68,7 +68,7 @@ typedef struct {
 struct MigrationIncomingState {
    QEMUFile *from_src_file;
    /* Previously received RAM's RAMBlock pointer */
-    RAMBlock *last_recv_block;
+    RAMBlock *last_recv_block[RAM_CHANNEL_MAX];
    /* A hook to allow cleanup at the end of incoming migration */
    void *transport_data;
    void (*transport_cleanup)(void *data);
--- a/migration/ram.c
+++ b/migration/ram.c
@ -296,6 +296,20 @@ struct RAMSrcPageRequest {
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 };

+typedef struct {
+    /*
+     * Cached ramblock/offset values if preempted.  They're only meaningful if
+     * preempted==true below.
+     */
+    RAMBlock *ram_block;
+    unsigned long ram_page;
+    /*
+     * Whether a postcopy preemption just happened.  Will be reset after
+     * precopy recovered to background migration.
+     */
+    bool preempted;
+} PostcopyPreemptState;
+
 /* State of RAM for migration */
 struct RAMState {
    /* QEMUFile used for this migration */
@ -350,6 +364,14 @@ struct RAMState {
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
+
+    /* Postcopy preemption informations */
+    PostcopyPreemptState postcopy_preempt_state;
+    /*
+     * Current channel we're using on src VM.  Only valid if postcopy-preempt
+     * is enabled.
+     */
+    unsigned int postcopy_channel;
 };
 typedef struct RAMState RAMState;

@ -357,6 +379,11 @@ static RAMState *ram_state;

 static NotifierWithReturnList precopy_notifier_list;

+static void postcopy_preempt_reset(RAMState *rs)
+{
+    memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
+}
+
 /* Whether postcopy has queued requests? */
 static bool postcopy_has_request(RAMState *rs)
 {
@ -1947,6 +1974,55 @@ void ram_write_tracking_stop(void)
 }
 #endif /* defined(__linux__) */

+/*
+ * Check whether two addr/offset of the ramblock falls onto the same host huge
+ * page.  Returns true if so, false otherwise.
+ */
+static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1,
+                                     uint64_t addr2)
+{
+    size_t page_size = qemu_ram_pagesize(rb);
+
+    addr1 = ROUND_DOWN(addr1, page_size);
+    addr2 = ROUND_DOWN(addr2, page_size);
+
+    return addr1 == addr2;
+}
+
+/*
+ * Whether a previous preempted precopy huge page contains current requested
+ * page?  Returns true if so, false otherwise.
+ *
+ * This should really happen very rarely, because it means when we were sending
+ * during background migration for postcopy we're sending exactly the page that
+ * some vcpu got faulted on on dest node.  When it happens, we probably don't
+ * need to do much but drop the request, because we know right after we restore
+ * the precopy stream it'll be serviced.  It'll slightly affect the order of
+ * postcopy requests to be serviced (e.g. it'll be the same as we move current
+ * request to the end of the queue) but it shouldn't be a big deal.  The most
+ * imporant thing is we can _never_ try to send a partial-sent huge page on the
+ * POSTCOPY channel again, otherwise that huge page will got "split brain" on
+ * two channels (PRECOPY, POSTCOPY).
+ */
+static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block,
+                                        ram_addr_t offset)
+{
+    PostcopyPreemptState *state = &rs->postcopy_preempt_state;
+
+    /* No preemption at all? */
+    if (!state->preempted) {
+        return false;
+    }
+
+    /* Not even the same ramblock? */
+    if (state->ram_block != block) {
+        return false;
+    }
+
+    return offset_on_same_huge_page(block, offset,
+                                    state->ram_page << TARGET_PAGE_BITS);
+}
+
 /**
 * get_queued_page: unqueue a page from the postcopy requests
 *
@ -1962,9 +2038,17 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
    RAMBlock  *block;
    ram_addr_t offset;

+again:
    block = unqueue_page(rs, &offset);

-    if (!block) {
+    if (block) {
+        /* See comment above postcopy_preempted_contains() */
+        if (postcopy_preempted_contains(rs, block, offset)) {
+            trace_postcopy_preempt_hit(block->idstr, offset);
+            /* This request is dropped */
+            goto again;
+        }
+    } else {
        /*
         * Poll write faults too if background snapshot is enabled; that's
         * when we have vcpus got blocked by the write protected pages.
@ -2180,6 +2264,117 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
    return ram_save_page(rs, pss);
 }

+static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
+{
+    /* Not enabled eager preempt?  Then never do that. */
+    if (!migrate_postcopy_preempt()) {
+        return false;
+    }
+
+    /* If the ramblock we're sending is a small page?  Never bother. */
+    if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
+        return false;
+    }
+
+    /* Not in postcopy at all? */
+    if (!migration_in_postcopy()) {
+        return false;
+    }
+
+    /*
+     * If we're already handling a postcopy request, don't preempt as this page
+     * has got the same high priority.
+     */
+    if (pss->postcopy_requested) {
+        return false;
+    }
+
+    /* If there's postcopy requests, then check it up! */
+    return postcopy_has_request(rs);
+}
+
+/* Returns true if we preempted precopy, false otherwise */
+static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
+{
+    PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;
+
+    trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);
+
+    /*
+     * Time to preempt precopy. Cache current PSS into preempt state, so that
+     * after handling the postcopy pages we can recover to it.  We need to do
+     * so because the dest VM will have partial of the precopy huge page kept
+     * over in its tmp huge page caches; better move on with it when we can.
+     */
+    p_state->ram_block = pss->block;
+    p_state->ram_page = pss->page;
+    p_state->preempted = true;
+}
+
+/* Whether we're preempted by a postcopy request during sending a huge page */
+static bool postcopy_preempt_triggered(RAMState *rs)
+{
+    return rs->postcopy_preempt_state.preempted;
+}
+
+static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss)
+{
+    PostcopyPreemptState *state = &rs->postcopy_preempt_state;
+
+    assert(state->preempted);
+
+    pss->block = state->ram_block;
+    pss->page = state->ram_page;
+    /* This is not a postcopy request but restoring previous precopy */
+    pss->postcopy_requested = false;
+
+    trace_postcopy_preempt_restored(pss->block->idstr, pss->page);
+
+    /* Reset preempt state, most importantly, set preempted==false */
+    postcopy_preempt_reset(rs);
+}
+
+static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
+{
+    MigrationState *s = migrate_get_current();
+    unsigned int channel;
+    QEMUFile *next;
+
+    channel = pss->postcopy_requested ?
+        RAM_CHANNEL_POSTCOPY : RAM_CHANNEL_PRECOPY;
+
+    if (channel != rs->postcopy_channel) {
+        if (channel == RAM_CHANNEL_PRECOPY) {
+            next = s->to_dst_file;
+        } else {
+            next = s->postcopy_qemufile_src;
+        }
+        /* Update and cache the current channel */
+        rs->f = next;
+        rs->postcopy_channel = channel;
+
+        /*
+         * If channel switched, reset last_sent_block since the old sent block
+         * may not be on the same channel.
+         */
+        rs->last_sent_block = NULL;
+
+        trace_postcopy_preempt_switch_channel(channel);
+    }
+
+    trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
+}
+
+/* We need to make sure rs->f always points to the default channel elsewhere */
+static void postcopy_preempt_reset_channel(RAMState *rs)
+{
+    if (migrate_postcopy_preempt() && migration_in_postcopy()) {
+        rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
+        rs->f = migrate_get_current()->to_dst_file;
+        trace_postcopy_preempt_reset_channel();
+    }
+}
+
 /**
 * ram_save_host_page: save a whole host page
 *
@ -2211,7 +2406,16 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
        return 0;
    }

+    if (migrate_postcopy_preempt() && migration_in_postcopy()) {
+        postcopy_preempt_choose_channel(rs, pss);
+    }
+
    do {
+        if (postcopy_needs_preempt(rs, pss)) {
+            postcopy_do_preempt(rs, pss);
+            break;
+        }
+
        /* Check the pages is dirty and if it is send it */
        if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
            tmppages = ram_save_target_page(rs, pss);
@ -2235,6 +2439,19 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
    /* The offset we leave with is the min boundary of host page and block */
    pss->page = MIN(pss->page, hostpage_boundary);

+    /*
+     * When with postcopy preempt mode, flush the data as soon as possible for
+     * postcopy requests, because we've already sent a whole huge page, so the
+     * dst node should already have enough resource to atomically filling in
+     * the current missing page.
+     *
+     * More importantly, when using separate postcopy channel, we must do
+     * explicit flush or it won't flush until the buffer is full.
+     */
+    if (migrate_postcopy_preempt() && pss->postcopy_requested) {
+        qemu_fflush(rs->f);
+    }
+
    res = ram_save_release_protection(rs, pss, start_page);
    return (res < 0 ? res : pages);
 }
@ -2276,8 +2493,17 @@ static int ram_find_and_save_block(RAMState *rs)
        found = get_queued_page(rs, &pss);

        if (!found) {
-            /* priority queue empty, so just search for something dirty */
-            found = find_dirty_block(rs, &pss, &again);
+            /*
+             * Recover previous precopy ramblock/offset if postcopy has
+             * preempted precopy.  Otherwise find the next dirty bit.
+             */
+            if (postcopy_preempt_triggered(rs)) {
+                postcopy_preempt_restore(rs, &pss);
+                found = true;
+            } else {
+                /* priority queue empty, so just search for something dirty */
+                found = find_dirty_block(rs, &pss, &again);
+            }
        }

        if (found) {
@ -2405,6 +2631,8 @@ static void ram_state_reset(RAMState *rs)
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    rs->xbzrle_enabled = false;
+    postcopy_preempt_reset(rs);
+    rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
 }

 #define MAX_WAIT 50 /* ms, half buffered_file limit */
@ -3048,6 +3276,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

+    postcopy_preempt_reset_channel(rs);
+
    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
@ -3125,6 +3355,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
        return ret;
    }

+    postcopy_preempt_reset_channel(rs);
+
    ret = multifd_send_sync_main(rs->f);
    if (ret < 0) {
        return ret;
@ -3209,11 +3441,13 @@ static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
 * @mis: the migration incoming state pointer
 * @f: QEMUFile where to read the data from
 * @flags: Page flags (mostly to see if it's a continuation of previous block)
+ * @channel: the channel we're using
 */
 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
-                                              QEMUFile *f, int flags)
+                                              QEMUFile *f, int flags,
+                                              int channel)
 {
-    RAMBlock *block = mis->last_recv_block;
+    RAMBlock *block = mis->last_recv_block[channel];
    char id[256];
    uint8_t len;

@ -3240,7 +3474,7 @@ static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
        return NULL;
    }

-    mis->last_recv_block = block;
+    mis->last_recv_block[channel] = block;

    return block;
 }
@ -3694,7 +3928,7 @@ int ram_load_postcopy(QEMUFile *f, int channel)
        trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
-            block = ram_block_from_stream(mis, f, flags);
+            block = ram_block_from_stream(mis, f, flags, channel);
            if (!block) {
                ret = -EINVAL;
                break;
@ -3945,7 +4179,8 @@ static int ram_load_precopy(QEMUFile *f)

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
-            RAMBlock *block = ram_block_from_stream(mis, f, flags);
+            RAMBlock *block = ram_block_from_stream(mis, f, flags,
+                                                    RAM_CHANNEL_PRECOPY);

            host = host_from_ram_block_offset(block, addr);
            /*
--- a/migration/trace-events
+++ b/migration/trace-events
@ -111,6 +111,12 @@ ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRI
 ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
 ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
 unqueue_page(char *block, uint64_t offset, bool dirty) "ramblock '%s' offset 0x%"PRIx64" dirty %d"
+postcopy_preempt_triggered(char *str, unsigned long page) "during sending ramblock %s offset 0x%lx"
+postcopy_preempt_restored(char *str, unsigned long page) "ramblock %s offset 0x%lx"
+postcopy_preempt_hit(char *str, uint64_t offset) "ramblock %s offset 0x%"PRIx64
+postcopy_preempt_send_host_page(char *str, uint64_t offset) "ramblock %s offset 0x%"PRIx64
+postcopy_preempt_switch_channel(int channel) "%d"
+postcopy_preempt_reset_channel(void) ""

 # multifd.c
 multifd_new_send_channel_async(uint8_t id) "channel %u"
@ -176,6 +182,7 @@ migration_thread_low_pending(uint64_t pending) "%" PRIu64
 migrate_transferred(uint64_t tranferred, uint64_t time_spent, uint64_t bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " max_size %" PRId64
 process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
 process_incoming_migration_co_postcopy_end_main(void) ""
+postcopy_preempt_enabled(bool value) "%d"

 # channel.c
 migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"