From 0dc3f44aca18b1be8b425f3f4feb4b3e8d68de2e Mon Sep 17 00:00:00 2001
From: Mike Day
Date: Thu, 5 Sep 2013 14:41:35 -0400
Subject: [PATCH] Convert ram_list to RCU

Allow "unlocked" reads of the ram_list by using an RCU-enabled QLIST.

The ramlist mutex is kept.  call_rcu callbacks are run with the iothread
lock taken, but that may change in the future.  Writers still take the
ramlist mutex, but they no longer need to assume that the iothread lock
is taken.

Readers of the list, instead, no longer require either the iothread or
ramlist mutex, but they need to use rcu_read_lock() and
rcu_read_unlock().

One place in arch_init.c was downgrading from write side to read side
like this:

    qemu_mutex_lock_iothread()
    qemu_mutex_lock_ramlist()
    ...
    qemu_mutex_unlock_iothread()
    ...
    qemu_mutex_unlock_ramlist()

and the equivalent idiom is:

    qemu_mutex_lock_ramlist()
    rcu_read_lock()
    ...
    qemu_mutex_unlock_ramlist()
    ...
    rcu_read_unlock()

Reviewed-by: Fam Zheng
Signed-off-by: Mike Day
Signed-off-by: Paolo Bonzini
---
 arch_init.c            |  65 ++++++++++++++++++--------
 exec.c                 | 102 +++++++++++++++++++++++++++++------------
 include/exec/cpu-all.h |   6 +--
 3 files changed, 121 insertions(+), 52 deletions(-)
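
Notes (below the fold, not part of the commit message):

The reader-side rule in one place: a minimal sketch of a list walk under
the discipline this patch establishes, modeled on ram_bytes_total() in the
diff below.  It assumes QEMU's "qemu/rcu.h" and "qemu/rcu_queue.h" headers
and the ram_list/RAMBlock definitions, so it compiles only inside the QEMU
tree; count_ram_bytes() is a hypothetical helper, not part of the patch:

    #include "qemu/rcu.h"        /* rcu_read_lock(), rcu_read_unlock() */
    #include "qemu/rcu_queue.h"  /* QLIST_FOREACH_RCU() */

    /* Readers need no mutex, only an RCU critical section.  Any block
     * seen during the walk stays allocated until rcu_read_unlock(),
     * because writers defer freeing with call_rcu()/g_free_rcu(). */
    static uint64_t count_ram_bytes(void)
    {
        RAMBlock *block;
        uint64_t total = 0;

        rcu_read_lock();
        QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
            total += block->used_length;
        }
        rcu_read_unlock();
        return total;
    }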
diff --git a/arch_init.c b/arch_init.c
index 1ee2e35352..5fc6fc382c 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -52,6 +52,7 @@
 #include "exec/ram_addr.h"
 #include "hw/acpi/acpi.h"
 #include "qemu/host-utils.h"
+#include "qemu/rcu_queue.h"
 
 #ifdef DEBUG_ARCH_INIT
 #define DPRINTF(fmt, ...) \
@@ -523,9 +524,12 @@ static void migration_bitmap_sync(void)
     trace_migration_bitmap_sync_start();
     address_space_sync_dirty_bitmap(&address_space_memory);
 
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    rcu_read_lock();
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         migration_bitmap_sync_range(block->mr->ram_addr, block->used_length);
     }
+    rcu_read_unlock();
+
     trace_migration_bitmap_sync_end(migration_dirty_pages
                                     - num_dirty_pages_init);
     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
@@ -648,6 +652,8 @@ static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset,
 /*
  * ram_find_and_save_block: Finds a page to send and sends it to f
  *
+ * Called within an RCU critical section.
+ *
  * Returns:  The number of bytes written.
  *           0 means no dirty pages
  */
@@ -661,7 +667,7 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
     MemoryRegion *mr;
 
     if (!block)
-        block = QLIST_FIRST(&ram_list.blocks);
+        block = QLIST_FIRST_RCU(&ram_list.blocks);
 
     while (true) {
         mr = block->mr;
@@ -672,9 +678,9 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
         }
         if (offset >= block->used_length) {
             offset = 0;
-            block = QLIST_NEXT(block, next);
+            block = QLIST_NEXT_RCU(block, next);
             if (!block) {
-                block = QLIST_FIRST(&ram_list.blocks);
+                block = QLIST_FIRST_RCU(&ram_list.blocks);
                 complete_round = true;
                 ram_bulk_stage = false;
             }
@@ -728,10 +734,10 @@ uint64_t ram_bytes_total(void)
     RAMBlock *block;
     uint64_t total = 0;
 
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    rcu_read_lock();
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
         total += block->used_length;
-    }
-
+    rcu_read_unlock();
     return total;
 }
 
@@ -777,6 +783,13 @@ static void reset_ram_globals(void)
 
 #define MAX_WAIT 50 /* ms, half buffered_file limit */
 
+
+/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
+ * long-running RCU critical section.  When RCU reclaims in the code
+ * start to become numerous it will be necessary to reduce the
+ * granularity of these critical sections.
+ */
+
 static int ram_save_setup(QEMUFile *f, void *opaque)
 {
     RAMBlock *block;
@@ -820,6 +833,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     /* iothread lock needed for ram_list.dirty_memory[] */
     qemu_mutex_lock_iothread();
     qemu_mutex_lock_ramlist();
+    rcu_read_lock();
     bytes_transferred = 0;
     reset_ram_globals();
 
@@ -832,7 +846,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
      * gaps due to alignment or unplugs.
      */
    migration_dirty_pages = 0;
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         uint64_t block_pages;
 
         block_pages = block->used_length >> TARGET_PAGE_BITS;
@@ -841,17 +855,18 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 
     memory_global_dirty_log_start();
     migration_bitmap_sync();
+    qemu_mutex_unlock_ramlist();
     qemu_mutex_unlock_iothread();
 
     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
 
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         qemu_put_byte(f, strlen(block->idstr));
         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
         qemu_put_be64(f, block->used_length);
     }
 
-    qemu_mutex_unlock_ramlist();
+    rcu_read_unlock();
 
     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
@@ -868,12 +883,14 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
     int64_t t0;
     int total_sent = 0;
 
-    qemu_mutex_lock_ramlist();
-
+    rcu_read_lock();
     if (ram_list.version != last_version) {
         reset_ram_globals();
     }
 
+    /* Read version before ram_list.blocks */
+    smp_rmb();
+
     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
 
     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
@@ -904,8 +921,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
         }
         i++;
     }
-
-    qemu_mutex_unlock_ramlist();
+    rcu_read_unlock();
 
     /*
      * Must occur before EOS (or any QEMUFile operation)
@@ -933,7 +949,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
 /* Called with iothread lock */
 static int ram_save_complete(QEMUFile *f, void *opaque)
 {
-    qemu_mutex_lock_ramlist();
+    rcu_read_lock();
+
     migration_bitmap_sync();
 
     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
@@ -955,7 +972,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 
     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
     migration_end();
-    qemu_mutex_unlock_ramlist();
+    rcu_read_unlock();
 
     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
     return 0;
@@ -969,7 +986,9 @@ static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
 
     if (remaining_size < max_size) {
         qemu_mutex_lock_iothread();
+        rcu_read_lock();
         migration_bitmap_sync();
+        rcu_read_unlock();
         qemu_mutex_unlock_iothread();
         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
     }
@@ -1011,6 +1030,9 @@ static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
     return 0;
 }
 
+/* Must be called from within an RCU critical section.
+ * Returns a pointer from within the RCU-protected ram_list.
+ */
 static inline void *host_from_stream_offset(QEMUFile *f,
                                             ram_addr_t offset,
                                             int flags)
@@ -1032,7 +1054,7 @@ static inline void *host_from_stream_offset(QEMUFile *f,
     qemu_get_buffer(f, (uint8_t *)id, len);
     id[len] = 0;
 
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         if (!strncmp(id, block->idstr, sizeof(id)) &&
             block->max_length > offset) {
             return memory_region_get_ram_ptr(block->mr) + offset;
@@ -1065,6 +1087,12 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
         ret = -EINVAL;
     }
 
+    /* This RCU critical section can be very long running.
+     * When RCU reclaims in the code start to become numerous,
+     * it will be necessary to reduce the granularity of this
+     * critical section.
+     */
+    rcu_read_lock();
     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
         ram_addr_t addr, total_ram_bytes;
         void *host;
@@ -1089,7 +1117,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
                 id[len] = 0;
                 length = qemu_get_be64(f);
 
-                QLIST_FOREACH(block, &ram_list.blocks, next) {
+                QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
                     if (!strncmp(id, block->idstr, sizeof(id))) {
                         if (length != block->used_length) {
                             Error *local_err = NULL;
@@ -1163,6 +1191,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
         }
     }
 
+    rcu_read_unlock();
     DPRINTF("Completed load of VM with exit code %d seq iteration "
             "%" PRIu64 "\n", ret, seq_iter);
     return ret;
diff --git a/exec.c b/exec.c
index a7c2b92d0a..c85321a38b 100644
--- a/exec.c
+++ b/exec.c
@@ -44,7 +44,7 @@
 #include "trace.h"
 #endif
 #include "exec/cpu-all.h"
-
+#include "qemu/rcu_queue.h"
 #include "exec/cputlb.h"
 #include "translate-all.h"
 
@@ -58,6 +58,9 @@
 #if !defined(CONFIG_USER_ONLY)
 static bool in_migration;
 
+/* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
+ * are protected by the ramlist lock.
+ */
 RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
 
 static MemoryRegion *system_memory;
@@ -806,16 +809,16 @@ void cpu_abort(CPUState *cpu, const char *fmt, ...)
 }
 
 #if !defined(CONFIG_USER_ONLY)
+/* Called from RCU critical section */
 static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
 {
     RAMBlock *block;
 
-    /* The list is protected by the iothread lock here.  */
     block = atomic_rcu_read(&ram_list.mru_block);
     if (block && addr - block->offset < block->max_length) {
         goto found;
     }
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         if (addr - block->offset < block->max_length) {
             goto found;
         }
@@ -854,10 +857,12 @@ static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
     end = TARGET_PAGE_ALIGN(start + length);
     start &= TARGET_PAGE_MASK;
 
+    rcu_read_lock();
     block = qemu_get_ram_block(start);
     assert(block == qemu_get_ram_block(end - 1));
     start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
     cpu_tlb_reset_dirty_all(start1, length);
+    rcu_read_unlock();
 }
 
 /* Note: start and end must be within the same ram block.  */
@@ -1190,6 +1195,7 @@ error:
 }
 #endif
 
+/* Called with the ramlist lock held.  */
 static ram_addr_t find_ram_offset(ram_addr_t size)
 {
     RAMBlock *block, *next_block;
@@ -1197,16 +1203,16 @@ static ram_addr_t find_ram_offset(ram_addr_t size)
     assert(size != 0); /* it would hand out same offset multiple times */
 
-    if (QLIST_EMPTY(&ram_list.blocks)) {
+    if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
         return 0;
     }
 
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         ram_addr_t end, next = RAM_ADDR_MAX;
 
         end = block->offset + block->max_length;
 
-        QLIST_FOREACH(next_block, &ram_list.blocks, next) {
+        QLIST_FOREACH_RCU(next_block, &ram_list.blocks, next) {
             if (next_block->offset >= end) {
                 next = MIN(next, next_block->offset);
             }
         }
@@ -1231,9 +1237,11 @@ ram_addr_t last_ram_offset(void)
     RAMBlock *block;
     ram_addr_t last = 0;
 
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    rcu_read_lock();
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         last = MAX(last, block->offset + block->max_length);
     }
+    rcu_read_unlock();
     return last;
 }
 
@@ -1253,11 +1261,14 @@ static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
     }
 }
 
+/* Called within an RCU critical section, or while the ramlist lock
+ * is held.
+ */
 static RAMBlock *find_ram_block(ram_addr_t addr)
 {
     RAMBlock *block;
 
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         if (block->offset == addr) {
             return block;
         }
@@ -1271,6 +1282,7 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
 {
     RAMBlock *new_block, *block;
 
+    rcu_read_lock();
     new_block = find_ram_block(addr);
     assert(new_block);
     assert(!new_block->idstr[0]);
@@ -1284,15 +1296,14 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
     }
     pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
 
-    qemu_mutex_lock_ramlist();
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         if (block != new_block && !strcmp(block->idstr, new_block->idstr)) {
             fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
                     new_block->idstr);
             abort();
         }
     }
-    qemu_mutex_unlock_ramlist();
+    rcu_read_unlock();
 }
 
 /* Called with iothread lock held.  */
@@ -1305,10 +1316,12 @@ void qemu_ram_unset_idstr(ram_addr_t addr)
      * does not work anyway.
      */
 
+    rcu_read_lock();
     block = find_ram_block(addr);
     if (block) {
         memset(block->idstr, 0, sizeof(block->idstr));
     }
+    rcu_read_unlock();
 }
 
 static int memory_try_enable_merging(void *addr, size_t len)
 {
@@ -1372,7 +1385,6 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
 
     old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;
 
-    /* This assumes the iothread lock is taken here too.  */
     qemu_mutex_lock_ramlist();
     new_block->offset = find_ram_offset(new_block->max_length);
 
@@ -1398,21 +1410,23 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
      * QLIST (which has an RCU-friendly variant) does not have insertion at
      * tail, so save the last element in last_block.
      */
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         last_block = block;
         if (block->max_length < new_block->max_length) {
             break;
         }
     }
     if (block) {
-        QLIST_INSERT_BEFORE(block, new_block, next);
+        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
     } else if (last_block) {
-        QLIST_INSERT_AFTER(last_block, new_block, next);
+        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
     } else { /* list is empty */
-        QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next);
+        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
     }
     ram_list.mru_block = NULL;
 
+    /* Write list before version */
+    smp_wmb();
     ram_list.version++;
     qemu_mutex_unlock_ramlist();
 
@@ -1552,12 +1566,13 @@ void qemu_ram_free_from_ptr(ram_addr_t addr)
 {
     RAMBlock *block;
 
-    /* This assumes the iothread lock is taken here too.  */
     qemu_mutex_lock_ramlist();
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         if (addr == block->offset) {
-            QLIST_REMOVE(block, next);
+            QLIST_REMOVE_RCU(block, next);
             ram_list.mru_block = NULL;
+            /* Write list before version */
+            smp_wmb();
             ram_list.version++;
             g_free_rcu(block, rcu);
             break;
@@ -1583,17 +1598,17 @@ static void reclaim_ramblock(RAMBlock *block)
     g_free(block);
 }
 
-/* Called with the iothread lock held */
 void qemu_ram_free(ram_addr_t addr)
 {
     RAMBlock *block;
 
-    /* This assumes the iothread lock is taken here too.  */
     qemu_mutex_lock_ramlist();
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         if (addr == block->offset) {
-            QLIST_REMOVE(block, next);
+            QLIST_REMOVE_RCU(block, next);
             ram_list.mru_block = NULL;
+            /* Write list before version */
+            smp_wmb();
             ram_list.version++;
             call_rcu(block, reclaim_ramblock, rcu);
             break;
@@ -1610,7 +1625,7 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
     int flags;
     void *area, *vaddr;
 
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         offset = addr - block->offset;
         if (offset < block->max_length) {
             vaddr = ramblock_ptr(block, offset);
@@ -1657,8 +1672,10 @@ int qemu_get_ram_fd(ram_addr_t addr)
     RAMBlock *block;
     int fd;
 
+    rcu_read_lock();
     block = qemu_get_ram_block(addr);
     fd = block->fd;
+    rcu_read_unlock();
     return fd;
 }
 
@@ -1667,8 +1684,10 @@ void *qemu_get_ram_block_host_ptr(ram_addr_t addr)
     RAMBlock *block;
     void *ptr;
 
+    rcu_read_lock();
     block = qemu_get_ram_block(addr);
     ptr = ramblock_ptr(block, 0);
+    rcu_read_unlock();
     return ptr;
 }
 
@@ -1676,12 +1695,19 @@
  * This should not be used for general purpose DMA.  Use address_space_map
  * or address_space_rw instead. For local memory (e.g. video ram) that the
  * device owns, use memory_region_get_ram_ptr.
+ *
+ * By the time this function returns, the returned pointer is not protected
+ * by RCU anymore.  If the caller is not within an RCU critical section and
+ * does not hold the iothread lock, it must have other means of protecting the
+ * pointer, such as a reference to the region that includes the incoming
+ * ram_addr_t.
  */
 void *qemu_get_ram_ptr(ram_addr_t addr)
 {
     RAMBlock *block;
     void *ptr;
 
+    rcu_read_lock();
     block = qemu_get_ram_block(addr);
 
     if (xen_enabled() && block->host == NULL) {
@@ -1691,19 +1717,26 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
          */
         if (block->offset == 0) {
             ptr = xen_map_cache(addr, 0, 0);
-            goto done;
+            goto unlock;
         }
 
         block->host = xen_map_cache(block->offset, block->max_length, 1);
     }
 
     ptr = ramblock_ptr(block, addr - block->offset);
 
-done:
+unlock:
+    rcu_read_unlock();
     return ptr;
 }
 
 /* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr
  * but takes a size argument.
+ *
+ * By the time this function returns, the returned pointer is not protected
+ * by RCU anymore.  If the caller is not within an RCU critical section and
+ * does not hold the iothread lock, it must have other means of protecting the
+ * pointer, such as a reference to the region that includes the incoming
+ * ram_addr_t.
  */
 static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
 {
@@ -1715,11 +1748,13 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
         return xen_map_cache(addr, *size, 1);
     } else {
         RAMBlock *block;
-        QLIST_FOREACH(block, &ram_list.blocks, next) {
+        rcu_read_lock();
+        QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
             if (addr - block->offset < block->max_length) {
                 if (addr - block->offset + *size > block->max_length)
                     *size = block->max_length - addr + block->offset;
                 ptr = ramblock_ptr(block, addr - block->offset);
+                rcu_read_unlock();
                 return ptr;
             }
         }
@@ -1745,17 +1780,20 @@ MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
     MemoryRegion *mr;
 
     if (xen_enabled()) {
+        rcu_read_lock();
         *ram_addr = xen_ram_addr_from_mapcache(ptr);
         mr = qemu_get_ram_block(*ram_addr)->mr;
+        rcu_read_unlock();
         return mr;
     }
 
-    block = ram_list.mru_block;
+    rcu_read_lock();
+    block = atomic_rcu_read(&ram_list.mru_block);
     if (block && block->host && host - block->host < block->max_length) {
         goto found;
     }
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         /* This case append when the block is not mapped. */
         if (block->host == NULL) {
             continue;
         }
@@ -1765,11 +1803,13 @@ MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
         }
     }
 
+    rcu_read_unlock();
     return NULL;
 
 found:
     *ram_addr = block->offset + (host - block->host);
     mr = block->mr;
+    rcu_read_unlock();
     return mr;
 }
 
@@ -3027,8 +3067,10 @@ void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
 {
     RAMBlock *block;
 
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    rcu_read_lock();
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         func(block->host, block->offset, block->used_length, opaque);
     }
+    rcu_read_unlock();
 }
 #endif
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index 87b865800c..ac06c6721c 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -279,9 +279,7 @@ struct RAMBlock {
     uint32_t flags;
     /* Protected by iothread lock.  */
     char idstr[256];
-    /* Reads can take either the iothread or the ramlist lock.
-     * Writes must take both locks.
-     */
+    /* RCU-enabled, writes protected by the ramlist lock */
     QLIST_ENTRY(RAMBlock) next;
     int fd;
 };
@@ -298,7 +296,7 @@ typedef struct RAMList {
     /* Protected by the iothread lock.  */
     unsigned long *dirty_memory[DIRTY_MEMORY_NUM];
     RAMBlock *mru_block;
-    /* Protected by the ramlist lock.  */
+    /* RCU-enabled, writes protected by the ramlist lock. */
     QLIST_HEAD(, RAMBlock) blocks;
     uint32_t version;
 } RAMList;
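
Notes (not part of the commit message):

On the writer side the ramlist mutex still serializes all updates; what
changes is how an update is published.  The unlink uses QLIST_REMOVE_RCU(),
the list write is ordered before the version bump by smp_wmb() (pairing
with the smp_rmb() in ram_save_iterate() above), and the block is freed
only after a grace period via call_rcu(), so concurrent readers never see
freed memory.  A minimal sketch of that sequence, modeled on
qemu_ram_free() in the diff above; remove_block() is a hypothetical helper
and, like the reader sketch, it compiles only inside the QEMU tree:

    static void remove_block(RAMBlock *block)
    {
        qemu_mutex_lock_ramlist();
        QLIST_REMOVE_RCU(block, next);  /* readers may still be walking it */
        ram_list.mru_block = NULL;
        /* Write list before version */
        smp_wmb();
        ram_list.version++;
        /* Reclaim only after all pre-existing readers are done. */
        call_rcu(block, reclaim_ramblock, rcu);
        qemu_mutex_unlock_ramlist();
    }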