177f9b1ee4
Having large virtio-mem devices that only expose little memory to a VM
is currently a problem: we map the whole sparse memory region into the
guest using a single memslot, resulting in one gigantic memslot in KVM.
KVM allocates metadata for the whole memslot, which can result in
significant memory waste.

Assuming we have a 1 TiB virtio-mem device and only expose little
(e.g., 1 GiB) memory, we would create a single 1 TiB memslot and KVM
has to allocate metadata for that 1 TiB memslot: on x86, this implies
allocating a significant amount of memory for metadata:

(1) RMAP: 8 bytes per 4 KiB, 8 bytes per 2 MiB, 8 bytes per 1 GiB
    -> For 1 TiB: 2147483648 + 4194304 + 8192 = ~2 GiB (0.2 %)

    With the TDP MMU (cat /sys/module/kvm/parameters/tdp_mmu) this gets
    allocated lazily, only when required for nested VMs.
(2) gfn_track: 2 bytes per 4 KiB
    -> For 1 TiB: 536870912 = ~512 MiB (0.05 %)
(3) lpage_info: 4 bytes per 2 MiB, 4 bytes per 1 GiB
    -> For 1 TiB: 2097152 + 4096 = ~2 MiB (0.0002 %)
(4) 2x dirty bitmaps for tracking: 2x 1 bit per 4 KiB page
    -> For 1 TiB: 536870912 bits = 64 MiB (0.006 %)

So we primarily care about (1) and (2). The bad thing is that the
memory consumption *doubles* once SMM is enabled, because we create the
memslot once for !SMM and once for SMM.

Having a 1 TiB memslot without the TDP MMU consumes around:
* With SMM: 5 GiB
* Without SMM: 2.5 GiB

Having a 1 TiB memslot with the TDP MMU consumes around:
* With SMM: 1 GiB
* Without SMM: 512 MiB

... and that's really something we want to optimize, to be able to just
start a VM with small boot memory (e.g., 4 GiB) and a virtio-mem device
that can grow very large (e.g., 1 TiB). Consequently, using multiple
memslots and only mapping the memslots we really need can significantly
reduce memory waste and speed up memslot-related operations.

Let's expose the sparse RAM memory region using multiple memslots,
mapping only the memslots we currently need into our device memory
region container. The feature can be enabled using "dynamic-memslots=on"
and requires "unplugged-inaccessible=on", which is nowadays the default.

Once enabled, we'll auto-detect the number of memslots to use based on
the memslot limit provided by the core, using at most 1 memslot per
gigabyte. Note that our global limit of memslots across all memory
devices is currently set to 256: even with multiple large virtio-mem
devices, we'd still have a sane limit on the number of memslots used.

The default is to not dynamically map memslots for now
("dynamic-memslots=off"). The optimization must be enabled manually,
because some vhost setups (e.g., hotplug of vhost-user devices) might
be problematic until we support more memslots, especially in vhost-user
backends.

Note that "dynamic-memslots=on" is just a hint that multiple memslots
*may* be used for internal optimizations, not that multiple memslots
*must* be used. The actual number of memslots that are used is an
internal detail: for example, once memslot metadata is no longer an
issue, we could simply stop optimizing for that. Migration source and
destination can differ on the setting of "dynamic-memslots".

Message-ID: <20230926185738.277351-17-david@redhat.com>
Reviewed-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
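To make the auto-detection rule above concrete, here is a minimal C
sketch of how the memslot count and per-memslot size could be derived
(at most 1 memslot per GiB, capped by the core-provided limit). The
helper name and the rounding details are illustrative assumptions, not
the code added by this commit:

    #include <stdint.h>

    #define GiB (1024ULL * 1024 * 1024)

    /* Hypothetical sketch: derive a memslot layout for a device region. */
    static void sketch_decide_memslots(uint64_t region_size, unsigned int limit,
                                       uint16_t *nb_memslots,
                                       uint64_t *memslot_size)
    {
        /* At most 1 memslot per GiB of device memory region ... */
        uint64_t slots = (region_size + GiB - 1) / GiB;

        /* ... but never more than the limit provided by the core. */
        if (slots > limit) {
            slots = limit;
        }
        /* Always use at least one memslot. */
        if (slots == 0) {
            slots = 1;
        }

        *nb_memslots = (uint16_t)slots;
        /* Equal-sized memslots, rounded up; the last one may be smaller. */
        *memslot_size = (region_size + slots - 1) / slots;
    }

For example, an 8 GiB device region yields 8 memslots of 1 GiB each,
while a 1 TiB region with a limit of 256 yields 256 memslots of 4 GiB
each; only the memslots that currently need to be accessible get mapped.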
/*
 * Virtio MEM device
 *
 * Copyright (C) 2020 Red Hat, Inc.
 *
 * Authors:
 *  David Hildenbrand <david@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 * See the COPYING file in the top-level directory.
 */

#ifndef HW_VIRTIO_MEM_H
#define HW_VIRTIO_MEM_H

#include "standard-headers/linux/virtio_mem.h"
#include "hw/virtio/virtio.h"
#include "qapi/qapi-types-misc.h"
#include "sysemu/hostmem.h"
#include "qom/object.h"

#define TYPE_VIRTIO_MEM "virtio-mem"

OBJECT_DECLARE_TYPE(VirtIOMEM, VirtIOMEMClass,
                    VIRTIO_MEM)

#define VIRTIO_MEM_MEMDEV_PROP "memdev"
#define VIRTIO_MEM_NODE_PROP "node"
#define VIRTIO_MEM_SIZE_PROP "size"
#define VIRTIO_MEM_REQUESTED_SIZE_PROP "requested-size"
#define VIRTIO_MEM_BLOCK_SIZE_PROP "block-size"
#define VIRTIO_MEM_ADDR_PROP "memaddr"
#define VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP "unplugged-inaccessible"
#define VIRTIO_MEM_EARLY_MIGRATION_PROP "x-early-migration"
#define VIRTIO_MEM_PREALLOC_PROP "prealloc"
#define VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP "dynamic-memslots"

struct VirtIOMEM {
    VirtIODevice parent_obj;

    /* guest -> host request queue */
    VirtQueue *vq;

    /* bitmap used to track unplugged memory */
    int32_t bitmap_size;
    unsigned long *bitmap;

    /*
     * With "dynamic-memslots=on": Device memory region in which we dynamically
     * map the memslots.
     */
    MemoryRegion *mr;

    /*
     * With "dynamic-memslots=on": The individual memslots (aliases into the
     * memory backend).
     */
    MemoryRegion *memslots;

    /* With "dynamic-memslots=on": The total number of memslots. */
    uint16_t nb_memslots;

    /*
     * With "dynamic-memslots=on": Size of one memslot (the size of the
     * last one can differ).
     */
    uint64_t memslot_size;

    /* Assigned memory backend with the RAM memory region. */
    HostMemoryBackend *memdev;

    /* NUMA node */
    uint32_t node;

    /* assigned address of the region in guest physical memory */
    uint64_t addr;

    /* usable region size (<= region_size) */
    uint64_t usable_region_size;

    /* actual size (how much the guest plugged) */
    uint64_t size;

    /* requested size */
    uint64_t requested_size;

    /* block size and alignment */
    uint64_t block_size;

    /*
     * Whether we indicate VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE to the guest.
     * For !x86 targets this will always be "on" and consequently indicate
     * VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE.
     */
    OnOffAuto unplugged_inaccessible;

    /* whether to prealloc memory when plugging new blocks */
    bool prealloc;

    /*
     * Whether we migrate properties that are immutable while migration is
     * active early, before the state of other devices and, especially,
     * before migrating any RAM content.
     */
    bool early_migration;

    /*
     * Whether we dynamically map (multiple, if possible) memslots instead of
     * statically mapping the whole RAM memory region.
     */
    bool dynamic_memslots;

    /* notifiers to notify when "size" changes */
    NotifierList size_change_notifiers;

    /* listeners to notify on plug/unplug activity. */
    QLIST_HEAD(, RamDiscardListener) rdl_list;
};

struct VirtIOMEMClass {
    /* private */
    VirtIODevice parent;

    /* public */
    void (*fill_device_info)(const VirtIOMEM *vmem, VirtioMEMDeviceInfo *vi);
    MemoryRegion *(*get_memory_region)(VirtIOMEM *vmem, Error **errp);
    void (*decide_memslots)(VirtIOMEM *vmem, unsigned int limit);
    unsigned int (*get_memslots)(VirtIOMEM *vmem);
    void (*add_size_change_notifier)(VirtIOMEM *vmem, Notifier *notifier);
    void (*remove_size_change_notifier)(VirtIOMEM *vmem, Notifier *notifier);
    void (*unplug_request_check)(VirtIOMEM *vmem, Error **errp);
};

#endif
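As an illustration of the dynamic mapping that the fields and hooks
above support: with "dynamic-memslots=on", the aliases in "memslots"
are only inserted into the "mr" container while they need to be
accessible. The following is a simplified sketch built on QEMU's public
memory-region API ("exec/memory.h"), with a hypothetical helper name;
it is not the code from this commit:

    /*
     * Hypothetical sketch: map the memslot aliases covering
     * [offset, offset + size) into the device memory region container.
     * Assumes this header plus "exec/memory.h".
     */
    static void sketch_map_range(VirtIOMEM *vmem, uint64_t offset,
                                 uint64_t size)
    {
        uint16_t first = offset / vmem->memslot_size;
        uint16_t last = (offset + size - 1) / vmem->memslot_size;
        uint16_t idx;

        for (idx = first; idx <= last; idx++) {
            MemoryRegion *slot = &vmem->memslots[idx];

            if (!memory_region_is_mapped(slot)) {
                /* Each alias sits at a fixed offset inside the container. */
                memory_region_add_subregion(vmem->mr,
                                            (uint64_t)idx * vmem->memslot_size,
                                            slot);
            }
        }
    }

Unmapping a no-longer-needed memslot would use
memory_region_del_subregion() the same way; the real implementation in
virtio-mem.c is more involved, this only illustrates the
container/alias relationship.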