diff --git a/exec.c b/exec.c index 518064530b..1d4f3784d6 100644 --- a/exec.c +++ b/exec.c @@ -1972,6 +1972,21 @@ const char *qemu_ram_get_idstr(RAMBlock *rb) return rb->idstr; } +void *qemu_ram_get_host_addr(RAMBlock *rb) +{ + return rb->host; +} + +ram_addr_t qemu_ram_get_offset(RAMBlock *rb) +{ + return rb->offset; +} + +ram_addr_t qemu_ram_get_used_length(RAMBlock *rb) +{ + return rb->used_length; +} + bool qemu_ram_is_shared(RAMBlock *rb) { return rb->flags & RAM_SHARED; @@ -3961,28 +3976,7 @@ int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque) rcu_read_lock(); RAMBLOCK_FOREACH(block) { - ret = func(block->idstr, block->host, block->offset, - block->used_length, opaque); - if (ret) { - break; - } - } - rcu_read_unlock(); - return ret; -} - -int qemu_ram_foreach_migratable_block(RAMBlockIterFunc func, void *opaque) -{ - RAMBlock *block; - int ret = 0; - - rcu_read_lock(); - RAMBLOCK_FOREACH(block) { - if (!qemu_ram_is_migratable(block)) { - continue; - } - ret = func(block->idstr, block->host, block->offset, - block->used_length, opaque); + ret = func(block, opaque); if (ret) { break; } diff --git a/hmp.c b/hmp.c index 5f13b16e24..c2ad3f8251 100644 --- a/hmp.c +++ b/hmp.c @@ -166,6 +166,27 @@ void hmp_info_mice(Monitor *mon, const QDict *qdict) qapi_free_MouseInfoList(mice_list); } +static char *SocketAddress_to_str(SocketAddress *addr) +{ + switch (addr->type) { + case SOCKET_ADDRESS_TYPE_INET: + return g_strdup_printf("tcp:%s:%s", + addr->u.inet.host, + addr->u.inet.port); + case SOCKET_ADDRESS_TYPE_UNIX: + return g_strdup_printf("unix:%s", + addr->u.q_unix.path); + case SOCKET_ADDRESS_TYPE_FD: + return g_strdup_printf("fd:%s", addr->u.fd.str); + case SOCKET_ADDRESS_TYPE_VSOCK: + return g_strdup_printf("tcp:%s:%s", + addr->u.vsock.cid, + addr->u.vsock.port); + default: + return g_strdup("unknown address type"); + } +} + void hmp_info_migrate(Monitor *mon, const QDict *qdict) { MigrationInfo *info; @@ -306,6 +327,18 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) g_free(str); visit_free(v); } + if (info->has_socket_address) { + SocketAddressList *addr; + + monitor_printf(mon, "socket address: [\n"); + + for (addr = info->socket_address; addr; addr = addr->next) { + char *s = SocketAddress_to_str(addr->value); + monitor_printf(mon, "\t%s\n", s); + g_free(s); + } + monitor_printf(mon, "]\n"); + } qapi_free_MigrationInfo(info); qapi_free_MigrationCapabilityStatusList(caps); } diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index d3f2913a85..e3a65940ef 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -27,6 +27,7 @@ #include "qapi/visitor.h" #include "trace.h" #include "qemu/error-report.h" +#include "migration/misc.h" #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-access.h" @@ -378,6 +379,184 @@ out: } } +static void virtio_balloon_handle_free_page_vq(VirtIODevice *vdev, + VirtQueue *vq) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(vdev); + qemu_bh_schedule(s->free_page_bh); +} + +static bool get_free_page_hints(VirtIOBalloon *dev) +{ + VirtQueueElement *elem; + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtQueue *vq = dev->free_page_vq; + + while (dev->block_iothread) { + qemu_cond_wait(&dev->free_page_cond, &dev->free_page_lock); + } + + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); + if (!elem) { + return false; + } + + if (elem->out_num) { + uint32_t id; + size_t size = iov_to_buf(elem->out_sg, elem->out_num, 0, + &id, sizeof(id)); + virtqueue_push(vq, elem, size); + g_free(elem); + + virtio_tswap32s(vdev, &id); + if (unlikely(size != sizeof(id))) { + virtio_error(vdev, "received an incorrect cmd id"); + return false; + } + if (id == dev->free_page_report_cmd_id) { + dev->free_page_report_status = FREE_PAGE_REPORT_S_START; + } else { + /* + * Stop the optimization only when it has started. This + * avoids a stale stop sign for the previous command. + */ + if (dev->free_page_report_status == FREE_PAGE_REPORT_S_START) { + dev->free_page_report_status = FREE_PAGE_REPORT_S_STOP; + } + } + } + + if (elem->in_num) { + if (dev->free_page_report_status == FREE_PAGE_REPORT_S_START) { + qemu_guest_free_page_hint(elem->in_sg[0].iov_base, + elem->in_sg[0].iov_len); + } + virtqueue_push(vq, elem, 1); + g_free(elem); + } + + return true; +} + +static void virtio_ballloon_get_free_page_hints(void *opaque) +{ + VirtIOBalloon *dev = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtQueue *vq = dev->free_page_vq; + bool continue_to_get_hints; + + do { + qemu_mutex_lock(&dev->free_page_lock); + virtio_queue_set_notification(vq, 0); + continue_to_get_hints = get_free_page_hints(dev); + qemu_mutex_unlock(&dev->free_page_lock); + virtio_notify(vdev, vq); + /* + * Start to poll the vq once the reporting started. Otherwise, continue + * only when there are entries on the vq, which need to be given back. + */ + } while (continue_to_get_hints || + dev->free_page_report_status == FREE_PAGE_REPORT_S_START); + virtio_queue_set_notification(vq, 1); +} + +static bool virtio_balloon_free_page_support(void *opaque) +{ + VirtIOBalloon *s = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT); +} + +static void virtio_balloon_free_page_start(VirtIOBalloon *s) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + /* For the stop and copy phase, we don't need to start the optimization */ + if (!vdev->vm_running) { + return; + } + + if (s->free_page_report_cmd_id == UINT_MAX) { + s->free_page_report_cmd_id = + VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN; + } else { + s->free_page_report_cmd_id++; + } + + s->free_page_report_status = FREE_PAGE_REPORT_S_REQUESTED; + virtio_notify_config(vdev); +} + +static void virtio_balloon_free_page_stop(VirtIOBalloon *s) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + if (s->free_page_report_status != FREE_PAGE_REPORT_S_STOP) { + /* + * The lock also guarantees us that the + * virtio_ballloon_get_free_page_hints exits after the + * free_page_report_status is set to S_STOP. + */ + qemu_mutex_lock(&s->free_page_lock); + /* + * The guest hasn't done the reporting, so host sends a notification + * to the guest to actively stop the reporting. + */ + s->free_page_report_status = FREE_PAGE_REPORT_S_STOP; + qemu_mutex_unlock(&s->free_page_lock); + virtio_notify_config(vdev); + } +} + +static void virtio_balloon_free_page_done(VirtIOBalloon *s) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + s->free_page_report_status = FREE_PAGE_REPORT_S_DONE; + virtio_notify_config(vdev); +} + +static int +virtio_balloon_free_page_report_notify(NotifierWithReturn *n, void *data) +{ + VirtIOBalloon *dev = container_of(n, VirtIOBalloon, + free_page_report_notify); + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + PrecopyNotifyData *pnd = data; + + if (!virtio_balloon_free_page_support(dev)) { + /* + * This is an optimization provided to migration, so just return 0 to + * have the normal migration process not affected when this feature is + * not supported. + */ + return 0; + } + + switch (pnd->reason) { + case PRECOPY_NOTIFY_SETUP: + precopy_enable_free_page_optimization(); + break; + case PRECOPY_NOTIFY_COMPLETE: + case PRECOPY_NOTIFY_CLEANUP: + case PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC: + virtio_balloon_free_page_stop(dev); + break; + case PRECOPY_NOTIFY_AFTER_BITMAP_SYNC: + if (vdev->vm_running) { + virtio_balloon_free_page_start(dev); + } else { + virtio_balloon_free_page_done(dev); + } + break; + default: + virtio_error(vdev, "%s: %d reason unknown", __func__, pnd->reason); + } + + return 0; +} + static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) { VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); @@ -386,6 +565,17 @@ static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) config.num_pages = cpu_to_le32(dev->num_pages); config.actual = cpu_to_le32(dev->actual); + if (dev->free_page_report_status == FREE_PAGE_REPORT_S_REQUESTED) { + config.free_page_report_cmd_id = + cpu_to_le32(dev->free_page_report_cmd_id); + } else if (dev->free_page_report_status == FREE_PAGE_REPORT_S_STOP) { + config.free_page_report_cmd_id = + cpu_to_le32(VIRTIO_BALLOON_CMD_ID_STOP); + } else if (dev->free_page_report_status == FREE_PAGE_REPORT_S_DONE) { + config.free_page_report_cmd_id = + cpu_to_le32(VIRTIO_BALLOON_CMD_ID_DONE); + } + trace_virtio_balloon_get_config(config.num_pages, config.actual); memcpy(config_data, &config, sizeof(struct virtio_balloon_config)); } @@ -446,6 +636,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); f |= dev->host_features; virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ); + return f; } @@ -482,6 +673,18 @@ static int virtio_balloon_post_load_device(void *opaque, int version_id) return 0; } +static const VMStateDescription vmstate_virtio_balloon_free_page_report = { + .name = "virtio-balloon-device/free-page-report", + .version_id = 1, + .minimum_version_id = 1, + .needed = virtio_balloon_free_page_support, + .fields = (VMStateField[]) { + VMSTATE_UINT32(free_page_report_cmd_id, VirtIOBalloon), + VMSTATE_UINT32(free_page_report_status, VirtIOBalloon), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_virtio_balloon_device = { .name = "virtio-balloon-device", .version_id = 1, @@ -492,6 +695,10 @@ static const VMStateDescription vmstate_virtio_balloon_device = { VMSTATE_UINT32(actual, VirtIOBalloon), VMSTATE_END_OF_LIST() }, + .subsections = (const VMStateDescription * []) { + &vmstate_virtio_balloon_free_page_report, + NULL + } }; static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) @@ -516,6 +723,29 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); + if (virtio_has_feature(s->host_features, + VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE, + virtio_balloon_handle_free_page_vq); + s->free_page_report_status = FREE_PAGE_REPORT_S_STOP; + s->free_page_report_cmd_id = + VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN; + s->free_page_report_notify.notify = + virtio_balloon_free_page_report_notify; + precopy_add_notifier(&s->free_page_report_notify); + if (s->iothread) { + object_ref(OBJECT(s->iothread)); + s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread), + virtio_ballloon_get_free_page_hints, s); + qemu_mutex_init(&s->free_page_lock); + qemu_cond_init(&s->free_page_cond); + s->block_iothread = false; + } else { + /* Simply disable this feature if the iothread wasn't created. */ + s->host_features &= ~(1 << VIRTIO_BALLOON_F_FREE_PAGE_HINT); + virtio_error(vdev, "iothread is missing"); + } + } reset_stats(s); } @@ -524,6 +754,11 @@ static void virtio_balloon_device_unrealize(DeviceState *dev, Error **errp) VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOBalloon *s = VIRTIO_BALLOON(dev); + if (virtio_balloon_free_page_support(s)) { + qemu_bh_delete(s->free_page_bh); + virtio_balloon_free_page_stop(s); + precopy_remove_notifier(&s->free_page_report_notify); + } balloon_stats_destroy_timer(s); qemu_remove_balloon_handler(s); virtio_cleanup(vdev); @@ -533,6 +768,10 @@ static void virtio_balloon_device_reset(VirtIODevice *vdev) { VirtIOBalloon *s = VIRTIO_BALLOON(vdev); + if (virtio_balloon_free_page_support(s)) { + virtio_balloon_free_page_stop(s); + } + if (s->stats_vq_elem != NULL) { virtqueue_unpop(s->svq, s->stats_vq_elem, 0); g_free(s->stats_vq_elem); @@ -550,6 +789,26 @@ static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status) * was stopped */ virtio_balloon_receive_stats(vdev, s->svq); } + + if (virtio_balloon_free_page_support(s)) { + /* + * The VM is woken up and the iothread was blocked, so signal it to + * continue. + */ + if (vdev->vm_running && s->block_iothread) { + qemu_mutex_lock(&s->free_page_lock); + s->block_iothread = false; + qemu_cond_signal(&s->free_page_cond); + qemu_mutex_unlock(&s->free_page_lock); + } + + /* The VM is stopped, block the iothread. */ + if (!vdev->vm_running) { + qemu_mutex_lock(&s->free_page_lock); + s->block_iothread = true; + qemu_mutex_unlock(&s->free_page_lock); + } + } } static void virtio_balloon_instance_init(Object *obj) @@ -578,6 +837,10 @@ static const VMStateDescription vmstate_virtio_balloon = { static Property virtio_balloon_properties[] = { DEFINE_PROP_BIT("deflate-on-oom", VirtIOBalloon, host_features, VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false), + DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features, + VIRTIO_BALLOON_F_FREE_PAGE_HINT, false), + DEFINE_PROP_LINK("iothread", VirtIOBalloon, iothread, TYPE_IOTHREAD, + IOThread *), DEFINE_PROP_END_OF_LIST(), }; diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index 63ec1f9b37..cef8b88a2a 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -72,6 +72,9 @@ ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host); void qemu_ram_set_idstr(RAMBlock *block, const char *name, DeviceState *dev); void qemu_ram_unset_idstr(RAMBlock *block); const char *qemu_ram_get_idstr(RAMBlock *rb); +void *qemu_ram_get_host_addr(RAMBlock *rb); +ram_addr_t qemu_ram_get_offset(RAMBlock *rb); +ram_addr_t qemu_ram_get_used_length(RAMBlock *rb); bool qemu_ram_is_shared(RAMBlock *rb); bool qemu_ram_is_uf_zeroable(RAMBlock *rb); void qemu_ram_set_uf_zeroable(RAMBlock *rb); @@ -116,11 +119,9 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len); extern struct MemoryRegion io_mem_rom; extern struct MemoryRegion io_mem_notdirty; -typedef int (RAMBlockIterFunc)(const char *block_name, void *host_addr, - ram_addr_t offset, ram_addr_t length, void *opaque); +typedef int (RAMBlockIterFunc)(RAMBlock *rb, void *opaque); int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque); -int qemu_ram_foreach_migratable_block(RAMBlockIterFunc func, void *opaque); int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length); #endif diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h index 99dcd6d105..1afafb12f6 100644 --- a/include/hw/virtio/virtio-balloon.h +++ b/include/hw/virtio/virtio-balloon.h @@ -17,11 +17,14 @@ #include "standard-headers/linux/virtio_balloon.h" #include "hw/virtio/virtio.h" +#include "sysemu/iothread.h" #define TYPE_VIRTIO_BALLOON "virtio-balloon-device" #define VIRTIO_BALLOON(obj) \ OBJECT_CHECK(VirtIOBalloon, (obj), TYPE_VIRTIO_BALLOON) +#define VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN 0x80000000 + typedef struct virtio_balloon_stat VirtIOBalloonStat; typedef struct virtio_balloon_stat_modern { @@ -32,15 +35,38 @@ typedef struct virtio_balloon_stat_modern { typedef struct PartiallyBalloonedPage PartiallyBalloonedPage; +enum virtio_balloon_free_page_report_status { + FREE_PAGE_REPORT_S_STOP = 0, + FREE_PAGE_REPORT_S_REQUESTED = 1, + FREE_PAGE_REPORT_S_START = 2, + FREE_PAGE_REPORT_S_DONE = 3, +}; + typedef struct VirtIOBalloon { VirtIODevice parent_obj; - VirtQueue *ivq, *dvq, *svq; + VirtQueue *ivq, *dvq, *svq, *free_page_vq; + uint32_t free_page_report_status; uint32_t num_pages; uint32_t actual; + uint32_t free_page_report_cmd_id; uint64_t stats[VIRTIO_BALLOON_S_NR]; VirtQueueElement *stats_vq_elem; size_t stats_vq_offset; QEMUTimer *stats_timer; + IOThread *iothread; + QEMUBH *free_page_bh; + /* + * Lock to synchronize threads to access the free page reporting related + * fields (e.g. free_page_report_status). + */ + QemuMutex free_page_lock; + QemuCond free_page_cond; + /* + * Set to block iothread to continue reading free page hints as the VM is + * stopped. + */ + bool block_iothread; + NotifierWithReturn free_page_report_notify; int64_t stats_last_update; int64_t stats_poll_interval; uint32_t host_features; diff --git a/include/migration/misc.h b/include/migration/misc.h index 0471e04d1f..5cdbabd094 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -14,12 +14,34 @@ #ifndef MIGRATION_MISC_H #define MIGRATION_MISC_H +#include "exec/cpu-common.h" #include "qemu/notify.h" #include "qapi/qapi-types-net.h" /* migration/ram.c */ +typedef enum PrecopyNotifyReason { + PRECOPY_NOTIFY_SETUP = 0, + PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC = 1, + PRECOPY_NOTIFY_AFTER_BITMAP_SYNC = 2, + PRECOPY_NOTIFY_COMPLETE = 3, + PRECOPY_NOTIFY_CLEANUP = 4, + PRECOPY_NOTIFY_MAX = 5, +} PrecopyNotifyReason; + +typedef struct PrecopyNotifyData { + enum PrecopyNotifyReason reason; + Error **errp; +} PrecopyNotifyData; + +void precopy_infrastructure_init(void); +void precopy_add_notifier(NotifierWithReturn *n); +void precopy_remove_notifier(NotifierWithReturn *n); +int precopy_notify(PrecopyNotifyReason reason, Error **errp); +void precopy_enable_free_page_optimization(void); + void ram_mig_init(void); +void qemu_guest_free_page_hint(void *addr, size_t len); /* migration/block.c */ @@ -36,7 +58,7 @@ void dump_vmstate_json_to_file(FILE *out_fp); /* migration/migration.c */ void migration_object_init(void); -void migration_object_finalize(void); +void migration_shutdown(void); void qemu_start_incoming_migration(const char *uri, Error **errp); bool migration_is_idle(void); void add_migration_state_change_notifier(Notifier *notify); diff --git a/include/qemu/bitmap.h b/include/qemu/bitmap.h index 509eeddece..5c313346b9 100644 --- a/include/qemu/bitmap.h +++ b/include/qemu/bitmap.h @@ -221,6 +221,10 @@ static inline int bitmap_intersects(const unsigned long *src1, static inline long bitmap_count_one(const unsigned long *bitmap, long nbits) { + if (unlikely(!nbits)) { + return 0; + } + if (small_nbits(nbits)) { return ctpopl(*bitmap & BITMAP_LAST_WORD_MASK(nbits)); } else { @@ -228,6 +232,19 @@ static inline long bitmap_count_one(const unsigned long *bitmap, long nbits) } } +static inline long bitmap_count_one_with_offset(const unsigned long *bitmap, + long offset, long nbits) +{ + long aligned_offset = QEMU_ALIGN_DOWN(offset, BITS_PER_LONG); + long redundant_bits = offset - aligned_offset; + long bits_to_count = nbits + redundant_bits; + const unsigned long *bitmap_start = bitmap + + aligned_offset / BITS_PER_LONG; + + return bitmap_count_one(bitmap_start, bits_to_count) - + bitmap_count_one(bitmap_start, redundant_bits); +} + void bitmap_set(unsigned long *map, long i, long len); void bitmap_set_atomic(unsigned long *map, long i, long len); void bitmap_clear(unsigned long *map, long start, long nr); diff --git a/migration/colo.c b/migration/colo.c index 398b239d1c..5ba610dc01 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -872,8 +872,8 @@ out: /* Must be called after failover BH is completed */ if (mis->to_src_file) { qemu_fclose(mis->to_src_file); + mis->to_src_file = NULL; } - migration_incoming_disable_colo(); rcu_unregister_thread(); return NULL; diff --git a/migration/migration.c b/migration/migration.c index c39d3054ec..df6fd8e0e5 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -31,6 +31,8 @@ #include "migration/vmstate.h" #include "block/block.h" #include "qapi/error.h" +#include "qapi/clone-visitor.h" +#include "qapi/qapi-visit-sockets.h" #include "qapi/qapi-commands-migration.h" #include "qapi/qapi-events-migration.h" #include "qapi/qmp/qerror.h" @@ -126,6 +128,7 @@ static bool migration_object_check(MigrationState *ms, Error **errp); static int migration_maybe_pause(MigrationState *s, int *current_active_state, int new_state); +static void migrate_fd_cancel(MigrationState *s); void migration_object_init(void) { @@ -167,8 +170,13 @@ void migration_object_init(void) } } -void migration_object_finalize(void) +void migration_shutdown(void) { + /* + * Cancel the current migration - that will (eventually) + * stop the migration using this structure + */ + migrate_fd_cancel(current_migration); object_unref(OBJECT(current_migration)); } @@ -207,6 +215,11 @@ void migration_incoming_state_destroy(void) } qemu_event_reset(&mis->main_thread_load_event); + + if (mis->socket_address_list) { + qapi_free_SocketAddressList(mis->socket_address_list); + mis->socket_address_list = NULL; + } } static void migrate_generate_event(int new_state) @@ -322,6 +335,17 @@ void migration_incoming_enable_colo(void) migration_colo_enabled = true; } +void migrate_add_address(SocketAddress *address) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + SocketAddressList *addrs; + + addrs = g_new0(SocketAddressList, 1); + addrs->next = mis->socket_address_list; + mis->socket_address_list = addrs; + addrs->value = QAPI_CLONE(SocketAddress, address); +} + void qemu_start_incoming_migration(const char *uri, Error **errp) { const char *p; @@ -393,6 +417,9 @@ static void process_incoming_migration_bh(void *opaque) } else { runstate_set(RUN_STATE_PAUSED); } + } else if (migration_incoming_colo_enabled()) { + migration_incoming_disable_colo(); + vm_start(); } else { runstate_set(global_state_get_runstate()); } @@ -989,6 +1016,11 @@ static bool migrate_caps_check(bool *cap_list, error_setg(errp, "Postcopy is not supported"); return false; } + + if (cap_list[MIGRATION_CAPABILITY_X_IGNORE_SHARED]) { + error_setg(errp, "Postcopy is not compatible with ignore-shared"); + return false; + } } return true; @@ -998,6 +1030,12 @@ static void fill_destination_migration_info(MigrationInfo *info) { MigrationIncomingState *mis = migration_incoming_get_current(); + if (mis->socket_address_list) { + info->has_socket_address = true; + info->socket_address = + QAPI_CLONE(SocketAddressList, mis->socket_address_list); + } + switch (mis->state) { case MIGRATION_STATUS_NONE: return; @@ -2068,6 +2106,15 @@ bool migrate_dirty_bitmaps(void) return s->enabled_capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS]; } +bool migrate_ignore_shared(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_X_IGNORE_SHARED]; +} + bool migrate_use_events(void) { MigrationState *s; @@ -2911,6 +2958,13 @@ static MigThrError postcopy_pause(MigrationState *s) static MigThrError migration_detect_error(MigrationState *s) { int ret; + int state = s->state; + + if (state == MIGRATION_STATUS_CANCELLING || + state == MIGRATION_STATUS_CANCELLED) { + /* End the migration, but don't set the state to failed */ + return MIG_THR_ERR_FATAL; + } /* Try to detect any file errors */ ret = qemu_file_get_error(s->to_dst_file); @@ -2920,7 +2974,7 @@ static MigThrError migration_detect_error(MigrationState *s) return MIG_THR_ERR_NONE; } - if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) { + if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) { /* * For postcopy, we allow the network to be down for a * while. After that, it can be continued by a @@ -2932,7 +2986,7 @@ static MigThrError migration_detect_error(MigrationState *s) * For precopy (or postcopy with error outside IO), we fail * with no time. */ - migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); + migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED); trace_migration_thread_file_err(); /* Time to stop the migration, now. */ @@ -3127,6 +3181,7 @@ static void *migration_thread(void *opaque) rcu_register_thread(); + object_ref(OBJECT(s)); s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); qemu_savevm_state_header(s->to_dst_file); @@ -3223,6 +3278,7 @@ static void *migration_thread(void *opaque) trace_migration_thread_after_loop(); migration_iteration_finish(s); + object_unref(OBJECT(s)); rcu_unregister_thread(); return NULL; } diff --git a/migration/migration.h b/migration/migration.h index c99154dea2..99e99e56bd 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -84,6 +84,9 @@ struct MigrationIncomingState { bool postcopy_recover_triggered; QemuSemaphore postcopy_pause_sem_dst; QemuSemaphore postcopy_pause_sem_fault; + + /* List of listening socket addresses */ + SocketAddressList *socket_address_list; }; MigrationIncomingState *migration_incoming_get_current(void); @@ -265,6 +268,7 @@ bool migrate_release_ram(void); bool migrate_postcopy_ram(void); bool migrate_zero_blocks(void); bool migrate_dirty_bitmaps(void); +bool migrate_ignore_shared(void); bool migrate_auto_converge(void); bool migrate_use_multifd(void); @@ -304,9 +308,12 @@ void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value); void dirty_bitmap_mig_before_vm_start(void); void init_dirty_bitmap_incoming_migration(void); +void migrate_add_address(SocketAddress *address); + +int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque); #define qemu_ram_foreach_block \ - #warning "Use qemu_ram_foreach_block_migratable in migration code" + #warning "Use foreach_not_ignored_block in migration code" void migration_make_urgent_request(void); void migration_consume_urgent_request(void); diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index fa09dba534..e2aa57a701 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -319,10 +319,10 @@ static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis) /* Callback from postcopy_ram_supported_by_host block iterator. */ -static int test_ramblock_postcopiable(const char *block_name, void *host_addr, - ram_addr_t offset, ram_addr_t length, void *opaque) +static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque) { - RAMBlock *rb = qemu_ram_block_by_name(block_name); + const char *block_name = qemu_ram_get_idstr(rb); + ram_addr_t length = qemu_ram_get_used_length(rb); size_t pagesize = qemu_ram_pagesize(rb); if (length % pagesize) { @@ -374,7 +374,7 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis) } /* We don't support postcopy with shared RAM yet */ - if (qemu_ram_foreach_migratable_block(test_ramblock_postcopiable, NULL)) { + if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) { goto out; } @@ -443,9 +443,12 @@ out: * must be done right at the start prior to pre-copy. * opaque should be the MIS. */ -static int init_range(const char *block_name, void *host_addr, - ram_addr_t offset, ram_addr_t length, void *opaque) +static int init_range(RAMBlock *rb, void *opaque) { + const char *block_name = qemu_ram_get_idstr(rb); + void *host_addr = qemu_ram_get_host_addr(rb); + ram_addr_t offset = qemu_ram_get_offset(rb); + ram_addr_t length = qemu_ram_get_used_length(rb); trace_postcopy_init_range(block_name, host_addr, offset, length); /* @@ -465,9 +468,12 @@ static int init_range(const char *block_name, void *host_addr, * At the end of migration, undo the effects of init_range * opaque should be the MIS. */ -static int cleanup_range(const char *block_name, void *host_addr, - ram_addr_t offset, ram_addr_t length, void *opaque) +static int cleanup_range(RAMBlock *rb, void *opaque) { + const char *block_name = qemu_ram_get_idstr(rb); + void *host_addr = qemu_ram_get_host_addr(rb); + ram_addr_t offset = qemu_ram_get_offset(rb); + ram_addr_t length = qemu_ram_get_used_length(rb); MigrationIncomingState *mis = opaque; struct uffdio_range range_struct; trace_postcopy_cleanup_range(block_name, host_addr, offset, length); @@ -502,7 +508,7 @@ static int cleanup_range(const char *block_name, void *host_addr, */ int postcopy_ram_incoming_init(MigrationIncomingState *mis) { - if (qemu_ram_foreach_migratable_block(init_range, NULL)) { + if (foreach_not_ignored_block(init_range, NULL)) { return -1; } @@ -544,7 +550,7 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) return -1; } - if (qemu_ram_foreach_migratable_block(cleanup_range, mis)) { + if (foreach_not_ignored_block(cleanup_range, mis)) { return -1; } @@ -586,9 +592,12 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) /* * Disable huge pages on an area */ -static int nhp_range(const char *block_name, void *host_addr, - ram_addr_t offset, ram_addr_t length, void *opaque) +static int nhp_range(RAMBlock *rb, void *opaque) { + const char *block_name = qemu_ram_get_idstr(rb); + void *host_addr = qemu_ram_get_host_addr(rb); + ram_addr_t offset = qemu_ram_get_offset(rb); + ram_addr_t length = qemu_ram_get_used_length(rb); trace_postcopy_nhp_range(block_name, host_addr, offset, length); /* @@ -608,7 +617,7 @@ static int nhp_range(const char *block_name, void *host_addr, */ int postcopy_ram_prepare_discard(MigrationIncomingState *mis) { - if (qemu_ram_foreach_migratable_block(nhp_range, mis)) { + if (foreach_not_ignored_block(nhp_range, mis)) { return -1; } @@ -619,22 +628,20 @@ int postcopy_ram_prepare_discard(MigrationIncomingState *mis) /* * Mark the given area of RAM as requiring notification to unwritten areas - * Used as a callback on qemu_ram_foreach_migratable_block. + * Used as a callback on foreach_not_ignored_block. * host_addr: Base of area to mark * offset: Offset in the whole ram arena * length: Length of the section * opaque: MigrationIncomingState pointer * Returns 0 on success */ -static int ram_block_enable_notify(const char *block_name, void *host_addr, - ram_addr_t offset, ram_addr_t length, - void *opaque) +static int ram_block_enable_notify(RAMBlock *rb, void *opaque) { MigrationIncomingState *mis = opaque; struct uffdio_register reg_struct; - reg_struct.range.start = (uintptr_t)host_addr; - reg_struct.range.len = length; + reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb); + reg_struct.range.len = qemu_ram_get_used_length(rb); reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; /* Now tell our userfault_fd that it's responsible for this area */ @@ -647,7 +654,6 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr, return -1; } if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) { - RAMBlock *rb = qemu_ram_block_by_name(block_name); qemu_ram_set_uf_zeroable(rb); } @@ -1116,7 +1122,7 @@ int postcopy_ram_enable_notify(MigrationIncomingState *mis) mis->have_fault_thread = true; /* Mark so that we get notified of accesses to unwritten areas */ - if (qemu_ram_foreach_migratable_block(ram_block_enable_notify, mis)) { + if (foreach_not_ignored_block(ram_block_enable_notify, mis)) { error_report("ram_block_enable_notify failed"); return -1; } diff --git a/migration/ram.c b/migration/ram.c index 59191c1ed2..35bd6213e9 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -159,18 +159,44 @@ out: return ret; } +static bool ramblock_is_ignored(RAMBlock *block) +{ + return !qemu_ram_is_migratable(block) || + (migrate_ignore_shared() && qemu_ram_is_shared(block)); +} + /* Should be holding either ram_list.mutex, or the RCU lock. */ +#define RAMBLOCK_FOREACH_NOT_IGNORED(block) \ + INTERNAL_RAMBLOCK_FOREACH(block) \ + if (ramblock_is_ignored(block)) {} else + #define RAMBLOCK_FOREACH_MIGRATABLE(block) \ INTERNAL_RAMBLOCK_FOREACH(block) \ if (!qemu_ram_is_migratable(block)) {} else #undef RAMBLOCK_FOREACH +int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) +{ + RAMBlock *block; + int ret = 0; + + rcu_read_lock(); + RAMBLOCK_FOREACH_NOT_IGNORED(block) { + ret = func(block, opaque); + if (ret) { + break; + } + } + rcu_read_unlock(); + return ret; +} + static void ramblock_recv_map_init(void) { RAMBlock *rb; - RAMBLOCK_FOREACH_MIGRATABLE(rb) { + RAMBLOCK_FOREACH_NOT_IGNORED(rb) { assert(!rb->receivedmap); rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); } @@ -290,6 +316,8 @@ struct RAMState { uint32_t last_version; /* We are in the first round */ bool ram_bulk_stage; + /* The free page optimization is enabled */ + bool fpo_enabled; /* How many times we have dirty too many pages */ int dirty_rate_high_cnt; /* these variables are used for bitmap sync */ @@ -316,7 +344,7 @@ struct RAMState { uint64_t target_page_count; /* number of dirty bits in the bitmap */ uint64_t migration_dirty_pages; - /* protects modification of the bitmap */ + /* Protects modification of the bitmap and migration dirty pages */ QemuMutex bitmap_mutex; /* The RAMBlock used in the last src_page_requests */ RAMBlock *last_req_rb; @@ -328,6 +356,41 @@ typedef struct RAMState RAMState; static RAMState *ram_state; +static NotifierWithReturnList precopy_notifier_list; + +void precopy_infrastructure_init(void) +{ + notifier_with_return_list_init(&precopy_notifier_list); +} + +void precopy_add_notifier(NotifierWithReturn *n) +{ + notifier_with_return_list_add(&precopy_notifier_list, n); +} + +void precopy_remove_notifier(NotifierWithReturn *n) +{ + notifier_with_return_remove(n); +} + +int precopy_notify(PrecopyNotifyReason reason, Error **errp) +{ + PrecopyNotifyData pnd; + pnd.reason = reason; + pnd.errp = errp; + + return notifier_with_return_list_notify(&precopy_notifier_list, &pnd); +} + +void precopy_enable_free_page_optimization(void) +{ + if (!ram_state) { + return; + } + + ram_state->fpo_enabled = true; +} + uint64_t ram_bytes_remaining(void) { return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : @@ -1545,11 +1608,15 @@ unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, unsigned long *bitmap = rb->bmap; unsigned long next; - if (!qemu_ram_is_migratable(rb)) { + if (ramblock_is_ignored(rb)) { return size; } - if (rs->ram_bulk_stage && start > 0) { + /* + * When the free page optimization is enabled, we need to check the bitmap + * to send the non-free pages rather than all the pages in the bulk stage. + */ + if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) { next = start + 1; } else { next = find_next_bit(bitmap, size, start); @@ -1564,11 +1631,14 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs, { bool ret; + qemu_mutex_lock(&rs->bitmap_mutex); ret = test_and_clear_bit(page, rb->bmap); if (ret) { rs->migration_dirty_pages--; } + qemu_mutex_unlock(&rs->bitmap_mutex); + return ret; } @@ -1594,7 +1664,7 @@ uint64_t ram_pagesize_summary(void) RAMBlock *block; uint64_t summary = 0; - RAMBLOCK_FOREACH_MIGRATABLE(block) { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { summary |= block->page_size; } @@ -1664,7 +1734,7 @@ static void migration_bitmap_sync(RAMState *rs) qemu_mutex_lock(&rs->bitmap_mutex); rcu_read_lock(); - RAMBLOCK_FOREACH_MIGRATABLE(block) { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { migration_bitmap_sync_range(rs, block, 0, block->used_length); } ram_counters.remaining = ram_bytes_remaining(); @@ -1712,6 +1782,25 @@ static void migration_bitmap_sync(RAMState *rs) } } +static void migration_bitmap_sync_precopy(RAMState *rs) +{ + Error *local_err = NULL; + + /* + * The current notifier usage is just an optimization to migration, so we + * don't stop the normal migration process in the error case. + */ + if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { + error_report_err(local_err); + } + + migration_bitmap_sync(rs); + + if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { + error_report_err(local_err); + } +} + /** * save_zero_page_to_file: send the zero page to the file * @@ -2388,7 +2477,7 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, size_t pagesize_bits = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; - if (!qemu_ram_is_migratable(pss->block)) { + if (ramblock_is_ignored(pss->block)) { error_report("block %s should not be migrated !", pss->block->idstr); return 0; } @@ -2486,19 +2575,30 @@ void acct_update_position(QEMUFile *f, size_t size, bool zero) } } -uint64_t ram_bytes_total(void) +static uint64_t ram_bytes_total_common(bool count_ignored) { RAMBlock *block; uint64_t total = 0; rcu_read_lock(); - RAMBLOCK_FOREACH_MIGRATABLE(block) { - total += block->used_length; + if (count_ignored) { + RAMBLOCK_FOREACH_MIGRATABLE(block) { + total += block->used_length; + } + } else { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { + total += block->used_length; + } } rcu_read_unlock(); return total; } +uint64_t ram_bytes_total(void) +{ + return ram_bytes_total_common(false); +} + static void xbzrle_load_setup(void) { XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); @@ -2547,7 +2647,7 @@ static void ram_save_cleanup(void *opaque) */ memory_global_dirty_log_stop(); - RAMBLOCK_FOREACH_MIGRATABLE(block) { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { g_free(block->bmap); block->bmap = NULL; g_free(block->unsentmap); @@ -2566,6 +2666,7 @@ static void ram_state_reset(RAMState *rs) rs->last_page = 0; rs->last_version = ram_list.version; rs->ram_bulk_stage = true; + rs->fpo_enabled = false; } #define MAX_WAIT 50 /* ms, half buffered_file limit */ @@ -2610,7 +2711,7 @@ void ram_postcopy_migrated_memory_release(MigrationState *ms) { struct RAMBlock *block; - RAMBLOCK_FOREACH_MIGRATABLE(block) { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { unsigned long *bitmap = block->bmap; unsigned long range = block->used_length >> TARGET_PAGE_BITS; unsigned long run_start = find_next_zero_bit(bitmap, range, 0); @@ -2688,7 +2789,7 @@ static int postcopy_each_ram_send_discard(MigrationState *ms) struct RAMBlock *block; int ret; - RAMBLOCK_FOREACH_MIGRATABLE(block) { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { PostcopyDiscardState *pds = postcopy_discard_send_init(ms, block->idstr); @@ -2896,7 +2997,7 @@ int ram_postcopy_send_discard_bitmap(MigrationState *ms) rs->last_sent_block = NULL; rs->last_page = 0; - RAMBLOCK_FOREACH_MIGRATABLE(block) { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { unsigned long pages = block->used_length >> TARGET_PAGE_BITS; unsigned long *bitmap = block->bmap; unsigned long *unsentmap = block->unsentmap; @@ -3062,7 +3163,7 @@ static void ram_list_init_bitmaps(void) /* Skip setting bitmap if there is no RAM */ if (ram_bytes_total()) { - RAMBLOCK_FOREACH_MIGRATABLE(block) { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { pages = block->max_length >> TARGET_PAGE_BITS; block->bmap = bitmap_new(pages); bitmap_set(block->bmap, 0, pages); @@ -3083,7 +3184,7 @@ static void ram_init_bitmaps(RAMState *rs) ram_list_init_bitmaps(); memory_global_dirty_log_start(); - migration_bitmap_sync(rs); + migration_bitmap_sync_precopy(rs); rcu_read_unlock(); qemu_mutex_unlock_ramlist(); @@ -3117,7 +3218,7 @@ static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) * about dirty page logging as well. */ - RAMBLOCK_FOREACH_MIGRATABLE(block) { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { pages += bitmap_count_one(block->bmap, block->used_length >> TARGET_PAGE_BITS); } @@ -3141,6 +3242,53 @@ static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) trace_ram_state_resume_prepare(pages); } +/* + * This function clears bits of the free pages reported by the caller from the + * migration dirty bitmap. @addr is the host address corresponding to the + * start of the continuous guest free pages, and @len is the total bytes of + * those pages. + */ +void qemu_guest_free_page_hint(void *addr, size_t len) +{ + RAMBlock *block; + ram_addr_t offset; + size_t used_len, start, npages; + MigrationState *s = migrate_get_current(); + + /* This function is currently expected to be used during live migration */ + if (!migration_is_setup_or_active(s->state)) { + return; + } + + for (; len > 0; len -= used_len, addr += used_len) { + block = qemu_ram_block_from_host(addr, false, &offset); + if (unlikely(!block || offset >= block->used_length)) { + /* + * The implementation might not support RAMBlock resize during + * live migration, but it could happen in theory with future + * updates. So we add a check here to capture that case. + */ + error_report_once("%s unexpected error", __func__); + return; + } + + if (len <= block->used_length - offset) { + used_len = len; + } else { + used_len = block->used_length - offset; + } + + start = offset >> TARGET_PAGE_BITS; + npages = used_len >> TARGET_PAGE_BITS; + + qemu_mutex_lock(&ram_state->bitmap_mutex); + ram_state->migration_dirty_pages -= + bitmap_count_one_with_offset(block->bmap, start, npages); + bitmap_clear(block->bmap, start, npages); + qemu_mutex_unlock(&ram_state->bitmap_mutex); + } +} + /* * Each of ram_save_setup, ram_save_iterate and ram_save_complete has * long-running RCU critical section. When rcu-reclaims in the code @@ -3176,7 +3324,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) rcu_read_lock(); - qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE); + qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); RAMBLOCK_FOREACH_MIGRATABLE(block) { qemu_put_byte(f, strlen(block->idstr)); @@ -3185,6 +3333,10 @@ static int ram_save_setup(QEMUFile *f, void *opaque) if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) { qemu_put_be64(f, block->page_size); } + if (migrate_ignore_shared()) { + qemu_put_be64(f, block->mr->addr); + qemu_put_byte(f, ramblock_is_ignored(block) ? 1 : 0); + } } rcu_read_unlock(); @@ -3312,7 +3464,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) rcu_read_lock(); if (!migration_in_postcopy()) { - migration_bitmap_sync(rs); + migration_bitmap_sync_precopy(rs); } ram_control_before_iterate(f, RAM_CONTROL_FINISH); @@ -3361,7 +3513,7 @@ static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, remaining_size < max_size) { qemu_mutex_lock_iothread(); rcu_read_lock(); - migration_bitmap_sync(rs); + migration_bitmap_sync_precopy(rs); rcu_read_unlock(); qemu_mutex_unlock_iothread(); remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; @@ -3443,7 +3595,7 @@ static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags) return NULL; } - if (!qemu_ram_is_migratable(block)) { + if (ramblock_is_ignored(block)) { error_report("block %s should not be migrated !", id); return NULL; } @@ -3698,7 +3850,7 @@ int colo_init_ram_cache(void) RAMBlock *block; rcu_read_lock(); - RAMBLOCK_FOREACH_MIGRATABLE(block) { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { block->colo_cache = qemu_anon_ram_alloc(block->used_length, NULL, false); @@ -3719,7 +3871,7 @@ int colo_init_ram_cache(void) if (ram_bytes_total()) { RAMBlock *block; - RAMBLOCK_FOREACH_MIGRATABLE(block) { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { unsigned long pages = block->max_length >> TARGET_PAGE_BITS; block->bmap = bitmap_new(pages); @@ -3734,7 +3886,7 @@ int colo_init_ram_cache(void) out_locked: - RAMBLOCK_FOREACH_MIGRATABLE(block) { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { if (block->colo_cache) { qemu_anon_ram_free(block->colo_cache, block->used_length); block->colo_cache = NULL; @@ -3751,14 +3903,14 @@ void colo_release_ram_cache(void) RAMBlock *block; memory_global_dirty_log_stop(); - RAMBLOCK_FOREACH_MIGRATABLE(block) { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { g_free(block->bmap); block->bmap = NULL; } rcu_read_lock(); - RAMBLOCK_FOREACH_MIGRATABLE(block) { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { if (block->colo_cache) { qemu_anon_ram_free(block->colo_cache, block->used_length); block->colo_cache = NULL; @@ -3794,7 +3946,7 @@ static int ram_load_cleanup(void *opaque) { RAMBlock *rb; - RAMBLOCK_FOREACH_MIGRATABLE(rb) { + RAMBLOCK_FOREACH_NOT_IGNORED(rb) { if (ramblock_is_pmem(rb)) { pmem_persist(rb->host, rb->used_length); } @@ -3803,7 +3955,7 @@ static int ram_load_cleanup(void *opaque) xbzrle_load_cleanup(); compress_threads_load_cleanup(); - RAMBLOCK_FOREACH_MIGRATABLE(rb) { + RAMBLOCK_FOREACH_NOT_IGNORED(rb) { g_free(rb->receivedmap); rb->receivedmap = NULL; } @@ -4003,7 +4155,7 @@ static void colo_flush_ram_cache(void) memory_global_dirty_log_sync(); rcu_read_lock(); - RAMBLOCK_FOREACH_MIGRATABLE(block) { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { migration_bitmap_sync_range(ram_state, block, 0, block->used_length); } rcu_read_unlock(); @@ -4146,6 +4298,23 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) ret = -EINVAL; } } + if (migrate_ignore_shared()) { + hwaddr addr = qemu_get_be64(f); + bool ignored = qemu_get_byte(f); + if (ignored != ramblock_is_ignored(block)) { + error_report("RAM block %s should %s be migrated", + id, ignored ? "" : "not"); + ret = -EINVAL; + } + if (ramblock_is_ignored(block) && + block->mr->addr != addr) { + error_report("Mismatched GPAs for block %s " + "%" PRId64 "!= %" PRId64, + id, (uint64_t)addr, + (uint64_t)block->mr->addr); + ret = -EINVAL; + } + } ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, block->idstr); } else { @@ -4216,7 +4385,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) static bool ram_has_postcopy(void *opaque) { RAMBlock *rb; - RAMBLOCK_FOREACH_MIGRATABLE(rb) { + RAMBLOCK_FOREACH_NOT_IGNORED(rb) { if (ramblock_is_pmem(rb)) { info_report("Block: %s, host: %p is a nvdimm memory, postcopy" "is not supported now!", rb->idstr, rb->host); @@ -4236,7 +4405,7 @@ static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) trace_ram_dirty_bitmap_sync_start(); - RAMBLOCK_FOREACH_MIGRATABLE(block) { + RAMBLOCK_FOREACH_NOT_IGNORED(block) { qemu_savevm_send_recv_bitmap(file, block->idstr); trace_ram_dirty_bitmap_request(block->idstr); ramblock_count++; diff --git a/migration/rdma.c b/migration/rdma.c index 54a3c11540..63c118af09 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -624,9 +624,12 @@ static int rdma_add_block(RDMAContext *rdma, const char *block_name, * in advanced before the migration starts. This tells us where the RAM blocks * are so that we can register them individually. */ -static int qemu_rdma_init_one_block(const char *block_name, void *host_addr, - ram_addr_t block_offset, ram_addr_t length, void *opaque) +static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque) { + const char *block_name = qemu_ram_get_idstr(rb); + void *host_addr = qemu_ram_get_host_addr(rb); + ram_addr_t block_offset = qemu_ram_get_offset(rb); + ram_addr_t length = qemu_ram_get_used_length(rb); return rdma_add_block(opaque, block_name, host_addr, block_offset, length); } @@ -641,7 +644,7 @@ static int qemu_rdma_init_ram_blocks(RDMAContext *rdma) assert(rdma->blockmap == NULL); memset(local, 0, sizeof *local); - qemu_ram_foreach_migratable_block(qemu_rdma_init_one_block, rdma); + foreach_not_ignored_block(qemu_rdma_init_one_block, rdma); trace_qemu_rdma_init_ram_blocks(local->nb_blocks); rdma->dest_blocks = g_new0(RDMADestBlock, rdma->local_ram_blocks.nb_blocks); @@ -2321,7 +2324,9 @@ static void qemu_rdma_cleanup(RDMAContext *rdma) rdma->connected = false; } - qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL); + if (rdma->channel) { + qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL); + } g_free(rdma->dest_blocks); rdma->dest_blocks = NULL; @@ -3611,13 +3616,16 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque) } chunk_start = ram_chunk_start(block, chunk); chunk_end = ram_chunk_end(block, chunk + reg->chunks); + /* avoid "-Waddress-of-packed-member" warning */ + uint32_t tmp_rkey = 0; if (qemu_rdma_register_and_get_keys(rdma, block, - (uintptr_t)host_addr, NULL, ®_result->rkey, + (uintptr_t)host_addr, NULL, &tmp_rkey, chunk, chunk_start, chunk_end)) { error_report("cannot get rkey"); ret = -EINVAL; goto out; } + reg_result->rkey = tmp_rkey; reg_result->host_addr = (uintptr_t)block->local_host_addr; diff --git a/migration/savevm.c b/migration/savevm.c index b3868f7fb5..1415001d1c 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -57,6 +57,7 @@ #include "sysemu/replay.h" #include "qjson.h" #include "migration/colo.h" +#include "qemu/bitmap.h" #include "net/announce.h" const unsigned int postcopy_ram_discard_version = 0; @@ -249,6 +250,8 @@ typedef struct SaveState { uint32_t len; const char *name; uint32_t target_page_bits; + uint32_t caps_count; + MigrationCapability *capabilities; } SaveState; static SaveState savevm_state = { @@ -256,15 +259,51 @@ static SaveState savevm_state = { .global_section_id = 0, }; +static bool should_validate_capability(int capability) +{ + assert(capability >= 0 && capability < MIGRATION_CAPABILITY__MAX); + /* Validate only new capabilities to keep compatibility. */ + switch (capability) { + case MIGRATION_CAPABILITY_X_IGNORE_SHARED: + return true; + default: + return false; + } +} + +static uint32_t get_validatable_capabilities_count(void) +{ + MigrationState *s = migrate_get_current(); + uint32_t result = 0; + int i; + for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) { + if (should_validate_capability(i) && s->enabled_capabilities[i]) { + result++; + } + } + return result; +} + static int configuration_pre_save(void *opaque) { SaveState *state = opaque; const char *current_name = MACHINE_GET_CLASS(current_machine)->name; + MigrationState *s = migrate_get_current(); + int i, j; state->len = strlen(current_name); state->name = current_name; state->target_page_bits = qemu_target_page_bits(); + state->caps_count = get_validatable_capabilities_count(); + state->capabilities = g_renew(MigrationCapability, state->capabilities, + state->caps_count); + for (i = j = 0; i < MIGRATION_CAPABILITY__MAX; i++) { + if (should_validate_capability(i) && s->enabled_capabilities[i]) { + state->capabilities[j++] = i; + } + } + return 0; } @@ -280,6 +319,40 @@ static int configuration_pre_load(void *opaque) return 0; } +static bool configuration_validate_capabilities(SaveState *state) +{ + bool ret = true; + MigrationState *s = migrate_get_current(); + unsigned long *source_caps_bm; + int i; + + source_caps_bm = bitmap_new(MIGRATION_CAPABILITY__MAX); + for (i = 0; i < state->caps_count; i++) { + MigrationCapability capability = state->capabilities[i]; + set_bit(capability, source_caps_bm); + } + + for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) { + bool source_state, target_state; + if (!should_validate_capability(i)) { + continue; + } + source_state = test_bit(i, source_caps_bm); + target_state = s->enabled_capabilities[i]; + if (source_state != target_state) { + error_report("Capability %s is %s, but received capability is %s", + MigrationCapability_str(i), + target_state ? "on" : "off", + source_state ? "on" : "off"); + ret = false; + /* Don't break here to report all failed capabilities */ + } + } + + g_free(source_caps_bm); + return ret; +} + static int configuration_post_load(void *opaque, int version_id) { SaveState *state = opaque; @@ -297,9 +370,53 @@ static int configuration_post_load(void *opaque, int version_id) return -EINVAL; } + if (!configuration_validate_capabilities(state)) { + return -EINVAL; + } + return 0; } +static int get_capability(QEMUFile *f, void *pv, size_t size, + const VMStateField *field) +{ + MigrationCapability *capability = pv; + char capability_str[UINT8_MAX + 1]; + uint8_t len; + int i; + + len = qemu_get_byte(f); + qemu_get_buffer(f, (uint8_t *)capability_str, len); + capability_str[len] = '\0'; + for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) { + if (!strcmp(MigrationCapability_str(i), capability_str)) { + *capability = i; + return 0; + } + } + error_report("Received unknown capability %s", capability_str); + return -EINVAL; +} + +static int put_capability(QEMUFile *f, void *pv, size_t size, + const VMStateField *field, QJSON *vmdesc) +{ + MigrationCapability *capability = pv; + const char *capability_str = MigrationCapability_str(*capability); + size_t len = strlen(capability_str); + assert(len <= UINT8_MAX); + + qemu_put_byte(f, len); + qemu_put_buffer(f, (uint8_t *)capability_str, len); + return 0; +} + +static const VMStateInfo vmstate_info_capability = { + .name = "capability", + .get = get_capability, + .put = put_capability, +}; + /* The target-page-bits subsection is present only if the * target page size is not the same as the default (ie the * minimum page size for a variable-page-size guest CPU). @@ -324,6 +441,25 @@ static const VMStateDescription vmstate_target_page_bits = { } }; +static bool vmstate_capabilites_needed(void *opaque) +{ + return get_validatable_capabilities_count() > 0; +} + +static const VMStateDescription vmstate_capabilites = { + .name = "configuration/capabilities", + .version_id = 1, + .minimum_version_id = 1, + .needed = vmstate_capabilites_needed, + .fields = (VMStateField[]) { + VMSTATE_UINT32_V(caps_count, SaveState, 1), + VMSTATE_VARRAY_UINT32_ALLOC(capabilities, SaveState, caps_count, 1, + vmstate_info_capability, + MigrationCapability), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_configuration = { .name = "configuration", .version_id = 1, @@ -337,6 +473,7 @@ static const VMStateDescription vmstate_configuration = { }, .subsections = (const VMStateDescription*[]) { &vmstate_target_page_bits, + &vmstate_capabilites, NULL } }; @@ -951,6 +1088,7 @@ void qemu_savevm_state_header(QEMUFile *f) void qemu_savevm_state_setup(QEMUFile *f) { SaveStateEntry *se; + Error *local_err = NULL; int ret; trace_savevm_state_setup(); @@ -972,6 +1110,10 @@ void qemu_savevm_state_setup(QEMUFile *f) break; } } + + if (precopy_notify(PRECOPY_NOTIFY_SETUP, &local_err)) { + error_report_err(local_err); + } } int qemu_savevm_state_resume_prepare(MigrationState *s) @@ -1114,6 +1256,11 @@ int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only, SaveStateEntry *se; int ret; bool in_postcopy = migration_in_postcopy(); + Error *local_err = NULL; + + if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) { + error_report_err(local_err); + } trace_savevm_state_complete_precopy(); @@ -1246,6 +1393,11 @@ void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size, void qemu_savevm_state_cleanup(void) { SaveStateEntry *se; + Error *local_err = NULL; + + if (precopy_notify(PRECOPY_NOTIFY_CLEANUP, &local_err)) { + error_report_err(local_err); + } trace_savevm_state_cleanup(); QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { diff --git a/migration/socket.c b/migration/socket.c index f4c8174400..239527fb1f 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -15,6 +15,7 @@ */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "qemu-common.h" #include "qemu/error-report.h" @@ -177,6 +178,7 @@ static void socket_start_incoming_migration(SocketAddress *saddr, Error **errp) { QIONetListener *listener = qio_net_listener_new(); + size_t i; qio_net_listener_set_name(listener, "migration-socket-listener"); @@ -189,6 +191,15 @@ static void socket_start_incoming_migration(SocketAddress *saddr, socket_accept_incoming_migration, NULL, NULL, g_main_context_get_thread_default()); + + for (i = 0; i < listener->nsioc; i++) { + SocketAddress *address = + qio_channel_socket_get_local_address(listener->sioc[i], errp); + if (!address) { + return; + } + migrate_add_address(address); + } } void tcp_start_incoming_migration(const char *host_port, Error **errp) diff --git a/qapi/migration.json b/qapi/migration.json index 1fd7bbea9b..5684733754 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -6,6 +6,7 @@ ## { 'include': 'common.json' } +{ 'include': 'sockets.json' } ## # @MigrationStats: @@ -199,6 +200,8 @@ # @compression: migration compression statistics, only returned if compression # feature is on and status is 'active' or 'completed' (Since 3.1) # +# @socket-address: Only used for tcp, to know what the real port is (Since 4.0) +# # Since: 0.14.0 ## { 'struct': 'MigrationInfo', @@ -213,7 +216,8 @@ '*error-desc': 'str', '*postcopy-blocktime' : 'uint32', '*postcopy-vcpu-blocktime': ['uint32'], - '*compression': 'CompressionStats'} } + '*compression': 'CompressionStats', + '*socket-address': ['SocketAddress'] } } ## # @query-migrate: @@ -409,13 +413,16 @@ # devices (and thus take locks) immediately at the end of migration. # (since 3.0) # +# @x-ignore-shared: If enabled, QEMU will not migrate shared memory (since 4.0) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks', 'compress', 'events', 'postcopy-ram', 'x-colo', 'release-ram', 'block', 'return-path', 'pause-before-switchover', 'x-multifd', - 'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate' ] } + 'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate', + 'x-ignore-shared' ] } ## # @MigrationCapabilityStatus: @@ -1386,7 +1393,7 @@ # Example: # # -> { "execute": "query-colo-status" } -# <- { "return": { "mode": "primary", "active": true, "reason": "request" } } +# <- { "return": { "mode": "primary", "reason": "request" } } # # Since: 3.1 ## diff --git a/stubs/ram-block.c b/stubs/ram-block.c index cfa5d8678f..73c0a3ee08 100644 --- a/stubs/ram-block.c +++ b/stubs/ram-block.c @@ -2,6 +2,21 @@ #include "exec/ramlist.h" #include "exec/cpu-common.h" +void *qemu_ram_get_host_addr(RAMBlock *rb) +{ + return 0; +} + +ram_addr_t qemu_ram_get_offset(RAMBlock *rb) +{ + return 0; +} + +ram_addr_t qemu_ram_get_used_length(RAMBlock *rb) +{ + return 0; +} + void ram_block_notifier_add(RAMBlockNotifier *n) { } diff --git a/tests/migration-test.c b/tests/migration-test.c index 8352612364..e3617ceaca 100644 --- a/tests/migration-test.c +++ b/tests/migration-test.c @@ -20,6 +20,9 @@ #include "qemu/sockets.h" #include "chardev/char.h" #include "sysemu/sysemu.h" +#include "qapi/qapi-visit-sockets.h" +#include "qapi/qobject-input-visitor.h" +#include "qapi/qobject-output-visitor.h" #include "migration/migration-test.h" @@ -215,10 +218,10 @@ static gchar *migrate_query_status(QTestState *who) * events suddenly appearing confuse the qmp()/hmp() responses. */ -static uint64_t get_migration_pass(QTestState *who) +static int64_t read_ram_property_int(QTestState *who, const char *property) { QDict *rsp_return, *rsp_ram; - uint64_t result; + int64_t result; rsp_return = migrate_query(who); if (!qdict_haskey(rsp_return, "ram")) { @@ -226,12 +229,17 @@ static uint64_t get_migration_pass(QTestState *who) result = 0; } else { rsp_ram = qdict_get_qdict(rsp_return, "ram"); - result = qdict_get_try_int(rsp_ram, "dirty-sync-count", 0); + result = qdict_get_try_int(rsp_ram, property, 0); } qobject_unref(rsp_return); return result; } +static uint64_t get_migration_pass(QTestState *who) +{ + return read_ram_property_int(who, "dirty-sync-count"); +} + static void read_blocktime(QTestState *who) { QDict *rsp_return; @@ -332,15 +340,75 @@ static void cleanup(const char *filename) g_free(path); } +static char *get_shmem_opts(const char *mem_size, const char *shmem_path) +{ + return g_strdup_printf("-object memory-backend-file,id=mem0,size=%s" + ",mem-path=%s,share=on -numa node,memdev=mem0", + mem_size, shmem_path); +} + +static char *SocketAddress_to_str(SocketAddress *addr) +{ + switch (addr->type) { + case SOCKET_ADDRESS_TYPE_INET: + return g_strdup_printf("tcp:%s:%s", + addr->u.inet.host, + addr->u.inet.port); + case SOCKET_ADDRESS_TYPE_UNIX: + return g_strdup_printf("unix:%s", + addr->u.q_unix.path); + case SOCKET_ADDRESS_TYPE_FD: + return g_strdup_printf("fd:%s", addr->u.fd.str); + case SOCKET_ADDRESS_TYPE_VSOCK: + return g_strdup_printf("tcp:%s:%s", + addr->u.vsock.cid, + addr->u.vsock.port); + default: + return g_strdup("unknown address type"); + } +} + +static char *migrate_get_socket_address(QTestState *who, const char *parameter) +{ + QDict *rsp; + char *result; + Error *local_err = NULL; + SocketAddressList *addrs; + Visitor *iv = NULL; + QObject *object; + + rsp = migrate_query(who); + object = qdict_get(rsp, parameter); + + iv = qobject_input_visitor_new(object); + visit_type_SocketAddressList(iv, NULL, &addrs, &local_err); + + /* we are only using a single address */ + result = g_strdup_printf("%s", SocketAddress_to_str(addrs->value)); + + qapi_free_SocketAddressList(addrs); + qobject_unref(rsp); + return result; +} + +static long long migrate_get_parameter(QTestState *who, const char *parameter) +{ + QDict *rsp; + long long result; + + rsp = wait_command(who, "{ 'execute': 'query-migrate-parameters' }"); + result = qdict_get_int(rsp, parameter); + qobject_unref(rsp); + return result; +} + static void migrate_check_parameter(QTestState *who, const char *parameter, long long value) { - QDict *rsp_return; + long long result; - rsp_return = wait_command(who, - "{ 'execute': 'query-migrate-parameters' }"); - g_assert_cmpint(qdict_get_int(rsp_return, parameter), ==, value); - qobject_unref(rsp_return); + result = migrate_get_parameter(who, parameter); + g_assert_cmpint(result, ==, value); } static void migrate_set_parameter(QTestState *who, const char *parameter, @@ -430,73 +498,95 @@ static void migrate_postcopy_start(QTestState *from, QTestState *to) } static int test_migrate_start(QTestState **from, QTestState **to, - const char *uri, bool hide_stderr) + const char *uri, bool hide_stderr, + bool use_shmem) { gchar *cmd_src, *cmd_dst; - char *bootpath = g_strdup_printf("%s/bootsect", tmpfs); + char *bootpath = NULL; + char *extra_opts = NULL; + char *shmem_path = NULL; const char *arch = qtest_get_arch(); const char *accel = "kvm:tcg"; - got_stop = false; + if (use_shmem) { + if (!g_file_test("/dev/shm", G_FILE_TEST_IS_DIR)) { + g_test_skip("/dev/shm is not supported"); + return -1; + } + shmem_path = g_strdup_printf("/dev/shm/qemu-%d", getpid()); + } + got_stop = false; + bootpath = g_strdup_printf("%s/bootsect", tmpfs); if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) { init_bootfile(bootpath, x86_bootsect); + extra_opts = use_shmem ? get_shmem_opts("150M", shmem_path) : NULL; cmd_src = g_strdup_printf("-machine accel=%s -m 150M" " -name source,debug-threads=on" " -serial file:%s/src_serial" - " -drive file=%s,format=raw", - accel, tmpfs, bootpath); + " -drive file=%s,format=raw %s", + accel, tmpfs, bootpath, + extra_opts ? extra_opts : ""); cmd_dst = g_strdup_printf("-machine accel=%s -m 150M" " -name target,debug-threads=on" " -serial file:%s/dest_serial" " -drive file=%s,format=raw" - " -incoming %s", - accel, tmpfs, bootpath, uri); + " -incoming %s %s", + accel, tmpfs, bootpath, uri, + extra_opts ? extra_opts : ""); start_address = X86_TEST_MEM_START; end_address = X86_TEST_MEM_END; } else if (g_str_equal(arch, "s390x")) { init_bootfile_s390x(bootpath); + extra_opts = use_shmem ? get_shmem_opts("128M", shmem_path) : NULL; cmd_src = g_strdup_printf("-machine accel=%s -m 128M" " -name source,debug-threads=on" - " -serial file:%s/src_serial -bios %s", - accel, tmpfs, bootpath); + " -serial file:%s/src_serial -bios %s %s", + accel, tmpfs, bootpath, + extra_opts ? extra_opts : ""); cmd_dst = g_strdup_printf("-machine accel=%s -m 128M" " -name target,debug-threads=on" " -serial file:%s/dest_serial -bios %s" - " -incoming %s", - accel, tmpfs, bootpath, uri); + " -incoming %s %s", + accel, tmpfs, bootpath, uri, + extra_opts ? extra_opts : ""); start_address = S390_TEST_MEM_START; end_address = S390_TEST_MEM_END; } else if (strcmp(arch, "ppc64") == 0) { + extra_opts = use_shmem ? get_shmem_opts("256M", shmem_path) : NULL; cmd_src = g_strdup_printf("-machine accel=%s -m 256M -nodefaults" " -name source,debug-threads=on" " -serial file:%s/src_serial" " -prom-env 'use-nvramrc?=true' -prom-env " "'nvramrc=hex .\" _\" begin %x %x " "do i c@ 1 + i c! 1000 +loop .\" B\" 0 " - "until'", accel, tmpfs, end_address, - start_address); + "until' %s", accel, tmpfs, end_address, + start_address, extra_opts ? extra_opts : ""); cmd_dst = g_strdup_printf("-machine accel=%s -m 256M" " -name target,debug-threads=on" " -serial file:%s/dest_serial" - " -incoming %s", - accel, tmpfs, uri); + " -incoming %s %s", + accel, tmpfs, uri, + extra_opts ? extra_opts : ""); start_address = PPC_TEST_MEM_START; end_address = PPC_TEST_MEM_END; } else if (strcmp(arch, "aarch64") == 0) { init_bootfile(bootpath, aarch64_kernel); + extra_opts = use_shmem ? get_shmem_opts("150M", shmem_path) : NULL; cmd_src = g_strdup_printf("-machine virt,accel=%s,gic-version=max " "-name vmsource,debug-threads=on -cpu max " "-m 150M -serial file:%s/src_serial " - "-kernel %s ", - accel, tmpfs, bootpath); + "-kernel %s %s", + accel, tmpfs, bootpath, + extra_opts ? extra_opts : ""); cmd_dst = g_strdup_printf("-machine virt,accel=%s,gic-version=max " "-name vmdest,debug-threads=on -cpu max " "-m 150M -serial file:%s/dest_serial " "-kernel %s " - "-incoming %s ", - accel, tmpfs, bootpath, uri); + "-incoming %s %s", + accel, tmpfs, bootpath, uri, + extra_opts ? extra_opts : ""); start_address = ARM_TEST_MEM_START; end_address = ARM_TEST_MEM_END; @@ -507,6 +597,7 @@ static int test_migrate_start(QTestState **from, QTestState **to, } g_free(bootpath); + g_free(extra_opts); if (hide_stderr) { gchar *tmp; @@ -524,6 +615,16 @@ static int test_migrate_start(QTestState **from, QTestState **to, *to = qtest_init(cmd_dst); g_free(cmd_dst); + + /* + * Remove shmem file immediately to avoid memory leak in test failed case. + * It's valid becase QEMU has already opened this file + */ + if (use_shmem) { + unlink(shmem_path); + g_free(shmem_path); + } + return 0; } @@ -584,6 +685,17 @@ static void deprecated_set_speed(QTestState *who, long long value) migrate_check_parameter(who, "max-bandwidth", value); } +static void deprecated_set_cache_size(QTestState *who, long long value) +{ + QDict *rsp; + + rsp = qtest_qmp(who, "{ 'execute': 'migrate-set-cache-size'," + "'arguments': { 'value': %lld } }", value); + g_assert(qdict_haskey(rsp, "return")); + qobject_unref(rsp); + migrate_check_parameter(who, "xbzrle-cache-size", value); +} + static void test_deprecated(void) { QTestState *from; @@ -592,6 +704,7 @@ static void test_deprecated(void) deprecated_set_downtime(from, 0.12345); deprecated_set_speed(from, 12345); + deprecated_set_cache_size(from, 4096); qtest_quit(from); } @@ -603,7 +716,7 @@ static int migrate_postcopy_prepare(QTestState **from_ptr, char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs); QTestState *from, *to; - if (test_migrate_start(&from, &to, uri, hide_error)) { + if (test_migrate_start(&from, &to, uri, hide_error, false)) { return -1; } @@ -720,7 +833,7 @@ static void test_baddest(void) char *status; bool failed; - if (test_migrate_start(&from, &to, "tcp:0:0", true)) { + if (test_migrate_start(&from, &to, "tcp:0:0", true, false)) { return; } migrate(from, "tcp:0:0", "{}"); @@ -745,7 +858,7 @@ static void test_precopy_unix(void) char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs); QTestState *from, *to; - if (test_migrate_start(&from, &to, uri, false)) { + if (test_migrate_start(&from, &to, uri, false, false)) { return; } @@ -781,6 +894,138 @@ static void test_precopy_unix(void) g_free(uri); } +#if 0 +/* Currently upset on aarch64 TCG */ +static void test_ignore_shared(void) +{ + char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs); + QTestState *from, *to; + + if (test_migrate_start(&from, &to, uri, false, true)) { + return; + } + + migrate_set_capability(from, "x-ignore-shared", true); + migrate_set_capability(to, "x-ignore-shared", true); + + /* Wait for the first serial output from the source */ + wait_for_serial("src_serial"); + + migrate(from, uri, "{}"); + + wait_for_migration_pass(from); + + if (!got_stop) { + qtest_qmp_eventwait(from, "STOP"); + } + + qtest_qmp_eventwait(to, "RESUME"); + + wait_for_serial("dest_serial"); + wait_for_migration_complete(from); + + /* Check whether shared RAM has been really skipped */ + g_assert_cmpint(read_ram_property_int(from, "transferred"), <, 1024 * 1024); + + test_migrate_end(from, to, true); + g_free(uri); +} +#endif + +static void test_xbzrle(const char *uri) +{ + QTestState *from, *to; + + if (test_migrate_start(&from, &to, uri, false, false)) { + return; + } + + /* + * We want to pick a speed slow enough that the test completes + * quickly, but that it doesn't complete precopy even on a slow + * machine, so also set the downtime. + */ + /* 1 ms should make it not converge*/ + migrate_set_parameter(from, "downtime-limit", 1); + /* 1GB/s */ + migrate_set_parameter(from, "max-bandwidth", 1000000000); + + migrate_set_parameter(from, "xbzrle-cache-size", 33554432); + + migrate_set_capability(from, "xbzrle", "true"); + migrate_set_capability(to, "xbzrle", "true"); + /* Wait for the first serial output from the source */ + wait_for_serial("src_serial"); + + migrate(from, uri, "{}"); + + wait_for_migration_pass(from); + + /* 300ms should converge */ + migrate_set_parameter(from, "downtime-limit", 300); + + if (!got_stop) { + qtest_qmp_eventwait(from, "STOP"); + } + qtest_qmp_eventwait(to, "RESUME"); + + wait_for_serial("dest_serial"); + wait_for_migration_complete(from); + + test_migrate_end(from, to, true); +} + +static void test_xbzrle_unix(void) +{ + char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs); + + test_xbzrle(uri); + g_free(uri); +} + +static void test_precopy_tcp(void) +{ + char *uri; + QTestState *from, *to; + + if (test_migrate_start(&from, &to, "tcp:127.0.0.1:0", false, false)) { + return; + } + + /* + * We want to pick a speed slow enough that the test completes + * quickly, but that it doesn't complete precopy even on a slow + * machine, so also set the downtime. + */ + /* 1 ms should make it not converge*/ + migrate_set_parameter(from, "downtime-limit", 1); + /* 1GB/s */ + migrate_set_parameter(from, "max-bandwidth", 1000000000); + + /* Wait for the first serial output from the source */ + wait_for_serial("src_serial"); + + uri = migrate_get_socket_address(to, "socket-address"); + + migrate(from, uri, "{}"); + + wait_for_migration_pass(from); + + /* 300ms should converge */ + migrate_set_parameter(from, "downtime-limit", 300); + + if (!got_stop) { + qtest_qmp_eventwait(from, "STOP"); + } + qtest_qmp_eventwait(to, "RESUME"); + + wait_for_serial("dest_serial"); + wait_for_migration_complete(from); + + test_migrate_end(from, to, true); + g_free(uri); +} + int main(int argc, char **argv) { char template[] = "/tmp/migration-test-XXXXXX"; @@ -832,6 +1077,9 @@ int main(int argc, char **argv) qtest_add_func("/migration/deprecated", test_deprecated); qtest_add_func("/migration/bad_dest", test_baddest); qtest_add_func("/migration/precopy/unix", test_precopy_unix); + qtest_add_func("/migration/precopy/tcp", test_precopy_tcp); + /* qtest_add_func("/migration/ignore_shared", test_ignore_shared); */ + qtest_add_func("/migration/xbzrle/unix", test_xbzrle_unix); ret = g_test_run(); diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c index 342d4a2285..2367fe8f7f 100644 --- a/util/vfio-helpers.c +++ b/util/vfio-helpers.c @@ -391,10 +391,10 @@ static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, } } -static int qemu_vfio_init_ramblock(const char *block_name, void *host_addr, - ram_addr_t offset, ram_addr_t length, - void *opaque) +static int qemu_vfio_init_ramblock(RAMBlock *rb, void *opaque) { + void *host_addr = qemu_ram_get_host_addr(rb); + ram_addr_t length = qemu_ram_get_used_length(rb); int ret; QEMUVFIOState *s = opaque; diff --git a/vl.c b/vl.c index fd0d51320d..4c5cc0d8ad 100644 --- a/vl.c +++ b/vl.c @@ -3039,6 +3039,7 @@ int main(int argc, char **argv, char **envp) module_call_init(MODULE_INIT_OPTS); runstate_init(); + precopy_infrastructure_init(); postcopy_infrastructure_init(); monitor_init_globals(); @@ -4579,6 +4580,12 @@ int main(int argc, char **argv, char **envp) gdbserver_cleanup(); + /* + * cleaning up the migration object cancels any existing migration + * try to do this early so that it also stops using devices. + */ + migration_shutdown(); + /* No more vcpu or device emulation activity beyond this point */ vm_shutdown(); @@ -4594,7 +4601,6 @@ int main(int argc, char **argv, char **envp) monitor_cleanup(); qemu_chr_cleanup(); user_creatable_cleanup(); - migration_object_finalize(); /* TODO: unref root container, check all devices are ok */ return 0;