From b11cf32e07a2f7ff0d171b89497381a04c9d07e0 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Fri, 16 Dec 2022 14:22:31 +0800 Subject: [PATCH 1/4] virtio-mem: Fix the bitmap index of the section offset vmem->bitmap indexes the memory region of the virtio-mem backend at a granularity of block_size. To calculate the bitmap index of the target section offset, the offset should be divided by block_size instead of bitmap_size. Fixes: 2044969f0b ("virtio-mem: Implement RamDiscardManager interface") Signed-off-by: Chenyi Qiang Message-Id: <20221216062231.11181-1-chenyi.qiang@intel.com> Reviewed-by: David Hildenbrand Reviewed-by: Michael S. Tsirkin Cc: qemu-stable@nongnu.org Signed-off-by: David Hildenbrand --- hw/virtio/virtio-mem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c index d96bde1fab..5c22c4b876 100644 --- a/hw/virtio/virtio-mem.c +++ b/hw/virtio/virtio-mem.c @@ -235,7 +235,7 @@ static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem, uint64_t offset, size; int ret = 0; - first_bit = s->offset_within_region / vmem->bitmap_size; + first_bit = s->offset_within_region / vmem->block_size; first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, first_bit); while (first_bit < vmem->bitmap_size) { MemoryRegionSection tmp = *s; @@ -267,7 +267,7 @@ static int virtio_mem_for_each_unplugged_section(const VirtIOMEM *vmem, uint64_t offset, size; int ret = 0; - first_bit = s->offset_within_region / vmem->bitmap_size; + first_bit = s->offset_within_region / vmem->block_size; first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, first_bit); while (first_bit < vmem->bitmap_size) { MemoryRegionSection tmp = *s; From 29f1b328e3b767cba2661920a8470738469b9e36 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Wed, 28 Dec 2022 17:03:12 +0800 Subject: [PATCH 2/4] virtio-mem: Fix the iterator variable in a vmem->rdl_list loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit It should be the variable rdl2 to revert the already-notified listeners. Fixes: 2044969f0b ("virtio-mem: Implement RamDiscardManager interface") Signed-off-by: Chenyi Qiang Message-Id: <20221228090312.17276-1-chenyi.qiang@intel.com> Cc: qemu-stable@nongnu.org Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: David Hildenbrand --- hw/virtio/virtio-mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c index 5c22c4b876..2b0271442b 100644 --- a/hw/virtio/virtio-mem.c +++ b/hw/virtio/virtio-mem.c @@ -341,7 +341,7 @@ static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset, if (ret) { /* Notify all already-notified listeners. */ QLIST_FOREACH(rdl2, &vmem->rdl_list, next) { - MemoryRegionSection tmp = *rdl->section; + MemoryRegionSection tmp = *rdl2->section; if (rdl2 == rdl) { break; From 82ba778e1329f6fb89a23d728c680656698d275a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Wed, 28 Dec 2022 14:09:56 +0100 Subject: [PATCH 3/4] virtio-mem: Fix typo in function name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20221228130956.80515-1-philmd@linaro.org> Signed-off-by: David Hildenbrand --- hw/virtio/virtio-mem.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c index 2b0271442b..1ed1f5a4af 100644 --- a/hw/virtio/virtio-mem.c +++ b/hw/virtio/virtio-mem.c @@ -207,7 +207,7 @@ static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg, * * Returns false if the intersection is empty, otherwise returns true. 
*/ -static bool virito_mem_intersect_memory_section(MemoryRegionSection *s, +static bool virtio_mem_intersect_memory_section(MemoryRegionSection *s, uint64_t offset, uint64_t size) { uint64_t start = MAX(s->offset_within_region, offset); @@ -245,7 +245,7 @@ static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem, first_bit + 1) - 1; size = (last_bit - first_bit + 1) * vmem->block_size; - if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) { break; } ret = cb(&tmp, arg); @@ -277,7 +277,7 @@ static int virtio_mem_for_each_unplugged_section(const VirtIOMEM *vmem, first_bit + 1) - 1; size = (last_bit - first_bit + 1) * vmem->block_size; - if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) { break; } ret = cb(&tmp, arg); @@ -313,7 +313,7 @@ static void virtio_mem_notify_unplug(VirtIOMEM *vmem, uint64_t offset, QLIST_FOREACH(rdl, &vmem->rdl_list, next) { MemoryRegionSection tmp = *rdl->section; - if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) { continue; } rdl->notify_discard(rdl, &tmp); @@ -329,7 +329,7 @@ static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset, QLIST_FOREACH(rdl, &vmem->rdl_list, next) { MemoryRegionSection tmp = *rdl->section; - if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) { continue; } ret = rdl->notify_populate(rdl, &tmp); @@ -346,7 +346,7 @@ static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset, if (rdl2 == rdl) { break; } - if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) { continue; } rdl2->notify_discard(rdl2, &tmp); From 6bb613f0812d1364fc8fcf0846647446884d5148 Mon Sep 17 00:00:00 2001 From: Michal Privoznik Date: Thu, 
15 Dec 2022 10:55:03 +0100 Subject: [PATCH 4/4] hostmem: Honor multiple preferred nodes if possible If a memory-backend is configured with mode HOST_MEM_POLICY_PREFERRED then host_memory_backend_memory_complete() calls mbind() as: mbind(..., MPOL_PREFERRED, nodemask, ...); Here, 'nodemask' is a bitmap of host NUMA nodes and corresponds to the .host-nodes attribute. Therefore, there can be multiple nodes specified. However, the documentation to MPOL_PREFERRED says: MPOL_PREFERRED This mode sets the preferred node for allocation. ... If nodemask specifies more than one node ID, the first node in the mask will be selected as the preferred node. Therefore, only the first node is honored and the rest is silently ignored. Well, with recent changes to the kernel and numactl we can do better. The Linux kernel added in v5.15 via commit cfcaa66f8032 ("mm/hugetlb: add support for mempolicy MPOL_PREFERRED_MANY") support for MPOL_PREFERRED_MANY, which accepts multiple preferred NUMA nodes instead. Then, numa_has_preferred_many() API was introduced to numactl (v2.0.15~26) allowing applications to query kernel support. Wiring this all together, we can pass MPOL_PREFERRED_MANY to the mbind() call instead and stop ignoring multiple nodes, silently. Signed-off-by: Michal Privoznik Message-Id: Reviewed-by: David Hildenbrand Signed-off-by: David Hildenbrand --- backends/hostmem.c | 19 +++++++++++++++++-- meson.build | 5 +++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/backends/hostmem.c b/backends/hostmem.c index 8640294c10..747e7838c0 100644 --- a/backends/hostmem.c +++ b/backends/hostmem.c @@ -23,7 +23,12 @@ #ifdef CONFIG_NUMA #include <numaif.h> +#include <numa.h> QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_DEFAULT != MPOL_DEFAULT); +/* + * HOST_MEM_POLICY_PREFERRED may either translate to MPOL_PREFERRED or + * MPOL_PREFERRED_MANY, see comments further below. 
+ */ QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_PREFERRED != MPOL_PREFERRED); QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_BIND != MPOL_BIND); QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_INTERLEAVE != MPOL_INTERLEAVE); @@ -346,6 +351,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp) * before mbind(). note: MPOL_MF_STRICT is ignored on hugepages so * this doesn't catch hugepage case. */ unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE; + int mode = backend->policy; /* check for invalid host-nodes and policies and give more verbose * error messages than mbind(). */ @@ -369,9 +375,18 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp) BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long)); assert(maxnode <= MAX_NODES); +#ifdef HAVE_NUMA_HAS_PREFERRED_MANY + if (mode == MPOL_PREFERRED && numa_has_preferred_many() > 0) { + /* + * Replace with MPOL_PREFERRED_MANY otherwise the mbind() below + * silently picks the first node. + */ + mode = MPOL_PREFERRED_MANY; + } +#endif + if (maxnode && - mbind(ptr, sz, backend->policy, backend->host_nodes, maxnode + 1, - flags)) { + mbind(ptr, sz, mode, backend->host_nodes, maxnode + 1, flags)) { if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) { error_setg_errno(errp, errno, "cannot bind memory to host NUMA nodes"); diff --git a/meson.build b/meson.build index 4c6f8a674a..3f31db5963 100644 --- a/meson.build +++ b/meson.build @@ -1858,6 +1858,11 @@ config_host_data.set('CONFIG_LINUX_AIO', libaio.found()) config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found()) config_host_data.set('CONFIG_LIBPMEM', libpmem.found()) config_host_data.set('CONFIG_NUMA', numa.found()) +if numa.found() + config_host_data.set('HAVE_NUMA_HAS_PREFERRED_MANY', + cc.has_function('numa_has_preferred_many', + dependencies: numa)) +endif config_host_data.set('CONFIG_OPENGL', opengl.found()) config_host_data.set('CONFIG_PROFILER', get_option('profiler')) config_host_data.set('CONFIG_RBD', rbd.found())