From 911a4d2215b05267b16925503218f49d607c6b29 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Wed, 8 Jun 2016 14:55:19 -0400 Subject: [PATCH 01/15] compiler.h: add QEMU_ALIGNED() to enforce struct alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Sergey Fedorov Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Message-Id: <1465412133-3029-2-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- include/qemu/compiler.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h index 8f1cc7ba67..b64f899870 100644 --- a/include/qemu/compiler.h +++ b/include/qemu/compiler.h @@ -41,6 +41,8 @@ # define QEMU_PACKED __attribute__((packed)) #endif +#define QEMU_ALIGNED(X) __attribute__((aligned(X))) + #ifndef glue #define xglue(x, y) x ## y #define glue(x, y) xglue(x, y) From ccdb3c1fc8865cf9142235b00e3a8fe3246f2a59 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Wed, 8 Jun 2016 14:55:20 -0400 Subject: [PATCH 02/15] seqlock: remove optional mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This option is unused; besides, it bloats the struct when not needed. Let's just let writers define their own locks elsewhere. Reviewed-by: Sergey Fedorov Reviewed-by: Alex Bennée Reviewed-by: Richard Henderson Signed-off-by: Emilio G. Cota Message-Id: <1465412133-3029-3-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- cpus.c | 2 +- include/qemu/seqlock.h | 10 +--------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/cpus.c b/cpus.c index 326742f445..1c92362449 100644 --- a/cpus.c +++ b/cpus.c @@ -621,7 +621,7 @@ int cpu_throttle_get_percentage(void) void cpu_ticks_init(void) { - seqlock_init(&timers_state.vm_clock_seqlock, NULL); + seqlock_init(&timers_state.vm_clock_seqlock); vmstate_register(NULL, 0, &vmstate_timers, &timers_state); throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT, cpu_throttle_timer_tick, NULL); diff --git a/include/qemu/seqlock.h b/include/qemu/seqlock.h index 70b01fd60d..e6734823ec 100644 --- a/include/qemu/seqlock.h +++ b/include/qemu/seqlock.h @@ -19,22 +19,17 @@ typedef struct QemuSeqLock QemuSeqLock; struct QemuSeqLock { - QemuMutex *mutex; unsigned sequence; }; -static inline void seqlock_init(QemuSeqLock *sl, QemuMutex *mutex) +static inline void seqlock_init(QemuSeqLock *sl) { - sl->mutex = mutex; sl->sequence = 0; } /* Lock out other writers and update the count. */ static inline void seqlock_write_lock(QemuSeqLock *sl) { - if (sl->mutex) { - qemu_mutex_lock(sl->mutex); - } ++sl->sequence; /* Write sequence before updating other fields. */ @@ -47,9 +42,6 @@ static inline void seqlock_write_unlock(QemuSeqLock *sl) smp_wmb(); ++sl->sequence; - if (sl->mutex) { - qemu_mutex_unlock(sl->mutex); - } } static inline unsigned seqlock_read_begin(QemuSeqLock *sl) From 03719e44b6d6d559c506808e5b6c340feff867b8 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Wed, 8 Jun 2016 14:55:21 -0400 Subject: [PATCH 03/15] seqlock: rename write_lock/unlock to write_begin/end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is a more appropriate name, now that the mutex embedded in the seqlock is gone. Reviewed-by: Sergey Fedorov Reviewed-by: Alex Bennée Reviewed-by: Richard Henderson Signed-off-by: Emilio G. 
Cota Message-Id: <1465412133-3029-4-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- cpus.c | 28 ++++++++++++++-------------- include/qemu/seqlock.h | 4 ++-- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/cpus.c b/cpus.c index 1c92362449..84c3520d44 100644 --- a/cpus.c +++ b/cpus.c @@ -249,13 +249,13 @@ int64_t cpu_get_clock(void) void cpu_enable_ticks(void) { /* Here, the really thing protected by seqlock is cpu_clock_offset. */ - seqlock_write_lock(&timers_state.vm_clock_seqlock); + seqlock_write_begin(&timers_state.vm_clock_seqlock); if (!timers_state.cpu_ticks_enabled) { timers_state.cpu_ticks_offset -= cpu_get_host_ticks(); timers_state.cpu_clock_offset -= get_clock(); timers_state.cpu_ticks_enabled = 1; } - seqlock_write_unlock(&timers_state.vm_clock_seqlock); + seqlock_write_end(&timers_state.vm_clock_seqlock); } /* disable cpu_get_ticks() : the clock is stopped. You must not call @@ -265,13 +265,13 @@ void cpu_enable_ticks(void) void cpu_disable_ticks(void) { /* Here, the really thing protected by seqlock is cpu_clock_offset. */ - seqlock_write_lock(&timers_state.vm_clock_seqlock); + seqlock_write_begin(&timers_state.vm_clock_seqlock); if (timers_state.cpu_ticks_enabled) { timers_state.cpu_ticks_offset += cpu_get_host_ticks(); timers_state.cpu_clock_offset = cpu_get_clock_locked(); timers_state.cpu_ticks_enabled = 0; } - seqlock_write_unlock(&timers_state.vm_clock_seqlock); + seqlock_write_end(&timers_state.vm_clock_seqlock); } /* Correlation between real and virtual time is always going to be @@ -294,7 +294,7 @@ static void icount_adjust(void) return; } - seqlock_write_lock(&timers_state.vm_clock_seqlock); + seqlock_write_begin(&timers_state.vm_clock_seqlock); cur_time = cpu_get_clock_locked(); cur_icount = cpu_get_icount_locked(); @@ -315,7 +315,7 @@ static void icount_adjust(void) last_delta = delta; timers_state.qemu_icount_bias = cur_icount - (timers_state.qemu_icount << icount_time_shift); - seqlock_write_unlock(&timers_state.vm_clock_seqlock); + seqlock_write_end(&timers_state.vm_clock_seqlock); } static void icount_adjust_rt(void *opaque) @@ -355,7 +355,7 @@ static void icount_warp_rt(void) return; } - seqlock_write_lock(&timers_state.vm_clock_seqlock); + seqlock_write_begin(&timers_state.vm_clock_seqlock); if (runstate_is_running()) { int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT, cpu_get_clock_locked()); @@ -374,7 +374,7 @@ static void icount_warp_rt(void) timers_state.qemu_icount_bias += warp_delta; } vm_clock_warp_start = -1; - seqlock_write_unlock(&timers_state.vm_clock_seqlock); + seqlock_write_end(&timers_state.vm_clock_seqlock); if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) { qemu_clock_notify(QEMU_CLOCK_VIRTUAL); @@ -399,9 +399,9 @@ void qtest_clock_warp(int64_t dest) int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL); int64_t warp = qemu_soonest_timeout(dest - clock, deadline); - seqlock_write_lock(&timers_state.vm_clock_seqlock); + seqlock_write_begin(&timers_state.vm_clock_seqlock); timers_state.qemu_icount_bias += warp; - seqlock_write_unlock(&timers_state.vm_clock_seqlock); + seqlock_write_end(&timers_state.vm_clock_seqlock); qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL); timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]); @@ -468,9 +468,9 @@ void qemu_start_warp_timer(void) * It is useful when we want a deterministic execution time, * isolated from host latencies. 
*/ - seqlock_write_lock(&timers_state.vm_clock_seqlock); + seqlock_write_begin(&timers_state.vm_clock_seqlock); timers_state.qemu_icount_bias += deadline; - seqlock_write_unlock(&timers_state.vm_clock_seqlock); + seqlock_write_end(&timers_state.vm_clock_seqlock); qemu_clock_notify(QEMU_CLOCK_VIRTUAL); } else { /* @@ -481,11 +481,11 @@ void qemu_start_warp_timer(void) * you will not be sending network packets continuously instead of * every 100ms. */ - seqlock_write_lock(&timers_state.vm_clock_seqlock); + seqlock_write_begin(&timers_state.vm_clock_seqlock); if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) { vm_clock_warp_start = clock; } - seqlock_write_unlock(&timers_state.vm_clock_seqlock); + seqlock_write_end(&timers_state.vm_clock_seqlock); timer_mod_anticipate(icount_warp_timer, clock + deadline); } } else if (deadline == 0) { diff --git a/include/qemu/seqlock.h b/include/qemu/seqlock.h index e6734823ec..4dfc055353 100644 --- a/include/qemu/seqlock.h +++ b/include/qemu/seqlock.h @@ -28,7 +28,7 @@ static inline void seqlock_init(QemuSeqLock *sl) } /* Lock out other writers and update the count. */ -static inline void seqlock_write_lock(QemuSeqLock *sl) +static inline void seqlock_write_begin(QemuSeqLock *sl) { ++sl->sequence; @@ -36,7 +36,7 @@ static inline void seqlock_write_lock(QemuSeqLock *sl) smp_wmb(); } -static inline void seqlock_write_unlock(QemuSeqLock *sl) +static inline void seqlock_write_end(QemuSeqLock *sl) { /* Write other fields before finalizing sequence. */ smp_wmb(); From 462cda505f304c3915e5e30429cee22418f5a6ea Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Wed, 8 Jun 2016 14:55:22 -0400 Subject: [PATCH 04/15] include/processor.h: define cpu_relax() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Taken from the linux kernel. Reviewed-by: Sergey Fedorov Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Message-Id: <1465412133-3029-5-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- include/qemu/processor.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 include/qemu/processor.h diff --git a/include/qemu/processor.h b/include/qemu/processor.h new file mode 100644 index 0000000000..8b2570283a --- /dev/null +++ b/include/qemu/processor.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2016, Emilio G. Cota + * + * License: GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ +#ifndef QEMU_PROCESSOR_H +#define QEMU_PROCESSOR_H + +#include "qemu/atomic.h" + +#if defined(__i386__) || defined(__x86_64__) +# define cpu_relax() asm volatile("rep; nop" ::: "memory") + +#elif defined(__ia64__) +# define cpu_relax() asm volatile("hint @pause" ::: "memory") + +#elif defined(__aarch64__) +# define cpu_relax() asm volatile("yield" ::: "memory") + +#elif defined(__powerpc64__) +/* set Hardware Multi-Threading (HMT) priority to low; then back to medium */ +# define cpu_relax() asm volatile("or 1, 1, 1;" \ + "or 2, 2, 2;" ::: "memory") + +#else +# define cpu_relax() barrier() +#endif + +#endif /* QEMU_PROCESSOR_H */ From ac9a9eba1e43c5354905c0c7f8e17852ed396ac2 Mon Sep 17 00:00:00 2001 From: Guillaume Delbergue Date: Wed, 8 Jun 2016 14:55:23 -0400 Subject: [PATCH 05/15] qemu-thread: add simple test-and-set spinlock Reviewed-by: Sergey Fedorov Signed-off-by: Guillaume Delbergue [Rewritten. 
- Paolo] Signed-off-by: Paolo Bonzini [Emilio's additions: use TAS instead of atomic_xchg; emit acquire/release barriers; return bool from trylock; call cpu_relax() while spinning; optimize for uncontended locks by acquiring the lock with TAS instead of TATAS; add qemu_spin_locked().] Signed-off-by: Emilio G. Cota Message-Id: <1465412133-3029-6-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- include/qemu/thread.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/include/qemu/thread.h b/include/qemu/thread.h index bdae6dfdbe..c5d71cf8fc 100644 --- a/include/qemu/thread.h +++ b/include/qemu/thread.h @@ -1,6 +1,8 @@ #ifndef __QEMU_THREAD_H #define __QEMU_THREAD_H 1 +#include "qemu/processor.h" +#include "qemu/atomic.h" typedef struct QemuMutex QemuMutex; typedef struct QemuCond QemuCond; @@ -60,4 +62,37 @@ struct Notifier; void qemu_thread_atexit_add(struct Notifier *notifier); void qemu_thread_atexit_remove(struct Notifier *notifier); +typedef struct QemuSpin { + int value; +} QemuSpin; + +static inline void qemu_spin_init(QemuSpin *spin) +{ + __sync_lock_release(&spin->value); +} + +static inline void qemu_spin_lock(QemuSpin *spin) +{ + while (unlikely(__sync_lock_test_and_set(&spin->value, true))) { + while (atomic_read(&spin->value)) { + cpu_relax(); + } + } +} + +static inline bool qemu_spin_trylock(QemuSpin *spin) +{ + return __sync_lock_test_and_set(&spin->value, true); +} + +static inline bool qemu_spin_locked(QemuSpin *spin) +{ + return atomic_read(&spin->value); +} + +static inline void qemu_spin_unlock(QemuSpin *spin) +{ + __sync_lock_release(&spin->value); +} + #endif From dc8b295d05ec35a8c032f9abca421772347ba5d4 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Wed, 8 Jun 2016 14:55:24 -0400 Subject: [PATCH 06/15] exec: add tb_hash_func5, derived from xxhash This will be used by upcoming changes for hashing the tb hash. Add this into a separate file to include the copyright notice from xxhash. Reviewed-by: Sergey Fedorov Reviewed-by: Richard Henderson Signed-off-by: Emilio G. Cota Message-Id: <1465412133-3029-7-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- include/exec/tb-hash-xx.h | 94 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 include/exec/tb-hash-xx.h diff --git a/include/exec/tb-hash-xx.h b/include/exec/tb-hash-xx.h new file mode 100644 index 0000000000..9f3fc05636 --- /dev/null +++ b/include/exec/tb-hash-xx.h @@ -0,0 +1,94 @@ +/* + * xxHash - Fast Hash algorithm + * Copyright (C) 2012-2016, Yann Collet + * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - xxHash source repository : https://github.com/Cyan4973/xxHash + */ +#ifndef EXEC_TB_HASH_XX +#define EXEC_TB_HASH_XX + +#include + +#define PRIME32_1 2654435761U +#define PRIME32_2 2246822519U +#define PRIME32_3 3266489917U +#define PRIME32_4 668265263U +#define PRIME32_5 374761393U + +#define TB_HASH_XX_SEED 1 + +/* + * xxhash32, customized for input variables that are not guaranteed to be + * contiguous in memory. + */ +static inline +uint32_t tb_hash_func5(uint64_t a0, uint64_t b0, uint32_t e) +{ + uint32_t v1 = TB_HASH_XX_SEED + PRIME32_1 + PRIME32_2; + uint32_t v2 = TB_HASH_XX_SEED + PRIME32_2; + uint32_t v3 = TB_HASH_XX_SEED + 0; + uint32_t v4 = TB_HASH_XX_SEED - PRIME32_1; + uint32_t a = a0 >> 32; + uint32_t b = a0; + uint32_t c = b0 >> 32; + uint32_t d = b0; + uint32_t h32; + + v1 += a * PRIME32_2; + v1 = rol32(v1, 13); + v1 *= PRIME32_1; + + v2 += b * PRIME32_2; + v2 = rol32(v2, 13); + v2 *= PRIME32_1; + + v3 += c * PRIME32_2; + v3 = rol32(v3, 13); + v3 *= PRIME32_1; + + v4 += d * PRIME32_2; + v4 = rol32(v4, 13); + v4 *= PRIME32_1; + + h32 = rol32(v1, 1) + rol32(v2, 7) + rol32(v3, 12) + rol32(v4, 18); + h32 += 20; + + h32 += e * PRIME32_3; + h32 = rol32(h32, 17) * PRIME32_4; + + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} + +#endif /* EXEC_TB_HASH_XX */ From 42bd32287f3a18d823f2258b813824a39ed7c6d9 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Wed, 8 Jun 2016 14:55:25 -0400 Subject: [PATCH 07/15] tb hash: hash phys_pc, pc, and flags with xxhash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For some workloads such as arm bootup, tb_phys_hash is performance-critical. The is due to the high frequency of accesses to the hash table, originated by (frequent) TLB flushes that wipe out the cpu-private tb_jmp_cache's. More info: https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg05098.html To dig further into this I modified an arm image booting debian jessie to immediately shut down after boot. Analysis revealed that quite a bit of time is unnecessarily spent in tb_phys_hash: the cause is poor hashing that results in very uneven loading of chains in the hash table's buckets; the longest observed chain had ~550 elements. The appended addresses this with two changes: 1) Use xxhash as the hash table's hash function. xxhash is a fast, high-quality hashing function. 2) Feed the hashing function with not just tb_phys, but also pc and flags. This improves performance over using just tb_phys for hashing, since that resulted in some hash buckets having many TB's, while others getting very few; with these changes, the longest observed chain on a single hash bucket is brought down from ~550 to ~40. Tests show that the other element checked for in tb_find_physical, cs_base, is always a match when tb_phys+pc+flags are a match, so hashing cs_base is wasteful. 
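A note from this editor on the hash function above: tb_hash_func5() uses rol32(), which is assumed to come from QEMU's bitops header rather than being defined in tb-hash-xx.h itself, and the constant 20 added to h32 corresponds to the 20 bytes of input (two 64-bit values plus one 32-bit value), matching xxh32's length step. For readers following the algorithm outside the QEMU tree, a minimal stand-in rotate helper (a sketch, not part of this patch) would be:

    /* rotate-left by @shift bits; stand-in for the helper assumed from QEMU's bitops */
    static inline uint32_t rol32(uint32_t word, unsigned int shift)
    {
        return (word << shift) | (word >> ((32 - shift) & 31));
    }
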
It could be that this is an ARM-only thing, though. UPDATE: On Tue, Apr 05, 2016 at 08:41:43 -0700, Richard Henderson wrote: > The cs_base field is only used by i386 (in 16-bit modes), and sparc (for a TB > consisting of only a delay slot). > It may well still turn out to be reasonable to ignore cs_base for hashing. BTW, after this change the hash table should not be called "tb_hash_phys" anymore; this is addressed later in this series. This change gives consistent bootup time improvements. I tested two host machines: - Intel Xeon E5-2690: 11.6% less time - Intel i7-4790K: 19.2% less time Increasing the number of hash buckets yields further improvements. However, using a larger, fixed number of buckets can degrade performance for other workloads that do not translate as many blocks (600K+ for debian-jessie arm bootup). This is dealt with later in this series. Reviewed-by: Sergey Fedorov Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Message-Id: <1465412133-3029-8-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- cpu-exec.c | 4 ++-- include/exec/tb-hash.h | 8 ++++++-- translate-all.c | 10 +++++----- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/cpu-exec.c b/cpu-exec.c index f7c642f4a9..b9e294c0e6 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -232,13 +232,13 @@ static TranslationBlock *tb_find_physical(CPUState *cpu, { CPUArchState *env = (CPUArchState *)cpu->env_ptr; TranslationBlock *tb, **tb_hash_head, **ptb1; - unsigned int h; + uint32_t h; tb_page_addr_t phys_pc, phys_page1; /* find translated block using physical mappings */ phys_pc = get_page_addr_code(env, pc); phys_page1 = phys_pc & TARGET_PAGE_MASK; - h = tb_phys_hash_func(phys_pc); + h = tb_hash_func(phys_pc, pc, flags); /* Start at head of the hash entry */ ptb1 = tb_hash_head = &tcg_ctx.tb_ctx.tb_phys_hash[h]; diff --git a/include/exec/tb-hash.h b/include/exec/tb-hash.h index 0f4e8a08af..88ccfd1730 100644 --- a/include/exec/tb-hash.h +++ b/include/exec/tb-hash.h @@ -20,6 +20,9 @@ #ifndef EXEC_TB_HASH #define EXEC_TB_HASH +#include "exec/exec-all.h" +#include "exec/tb-hash-xx.h" + /* Only the bottom TB_JMP_PAGE_BITS of the jump cache hash bits vary for addresses on the same page. The top bits are the same. This allows TLB invalidation to quickly clear a subset of the hash table. 
*/ @@ -43,9 +46,10 @@ static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc) | (tmp & TB_JMP_ADDR_MASK)); } -static inline unsigned int tb_phys_hash_func(tb_page_addr_t pc) +static inline +uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, uint32_t flags) { - return (pc >> 2) & (CODE_GEN_PHYS_HASH_SIZE - 1); + return tb_hash_func5(phys_pc, pc, flags) & (CODE_GEN_PHYS_HASH_SIZE - 1); } #endif diff --git a/translate-all.c b/translate-all.c index 118e7d3c84..d75737c3aa 100644 --- a/translate-all.c +++ b/translate-all.c @@ -992,12 +992,12 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr) { CPUState *cpu; PageDesc *p; - unsigned int h; + uint32_t h; tb_page_addr_t phys_pc; /* remove the TB from the hash list */ phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK); - h = tb_phys_hash_func(phys_pc); + h = tb_hash_func(phys_pc, tb->pc, tb->flags); tb_hash_remove(&tcg_ctx.tb_ctx.tb_phys_hash[h], tb); /* remove the TB from the page list */ @@ -1127,11 +1127,11 @@ static inline void tb_alloc_page(TranslationBlock *tb, static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc, tb_page_addr_t phys_page2) { - unsigned int h; + uint32_t h; TranslationBlock **ptb; - /* add in the physical hash table */ - h = tb_phys_hash_func(phys_pc); + /* add in the hash table */ + h = tb_hash_func(phys_pc, tb->pc, tb->flags); ptb = &tcg_ctx.tb_ctx.tb_phys_hash[h]; tb->phys_hash_next = *ptb; *ptb = tb; From bf3afd5f419a2054bf03d963bdd223fbb27b72d2 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Wed, 8 Jun 2016 14:55:26 -0400 Subject: [PATCH 08/15] qdist: add module to represent frequency distributions of data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sometimes it is useful to have a quick histogram to represent a certain distribution -- for example, when investigating a performance regression in a hash table due to inadequate hashing. The appended allows us to easily represent a distribution using Unicode characters. Further, the data structure keeping track of the distribution is so simple that obtaining its values for off-line processing is trivial. Example, taking the last 10 commits to QEMU: Characters in commit title Count ----------------------------------- 39 1 48 1 53 1 54 2 57 1 61 1 67 1 78 1 80 1 qdist_init(&dist); qdist_inc(&dist, 39); [...] qdist_inc(&dist, 80); char *str = qdist_pr(&dist, 9, QDIST_PR_LABELS); // -> [39.0,43.6)▂▂ █▂ ▂ ▄[75.4,80.0] g_free(str); char *str = qdist_pr(&dist, 4, QDIST_PR_LABELS); // -> [39.0,49.2)▁█▁▁[69.8,80.0] g_free(str); Reviewed-by: Richard Henderson Signed-off-by: Emilio G. Cota Message-Id: <1465412133-3029-9-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- include/qemu/qdist.h | 63 +++++++ util/Makefile.objs | 1 + util/qdist.c | 395 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 459 insertions(+) create mode 100644 include/qemu/qdist.h create mode 100644 util/qdist.c diff --git a/include/qemu/qdist.h b/include/qemu/qdist.h new file mode 100644 index 0000000000..f30050c2d1 --- /dev/null +++ b/include/qemu/qdist.h @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2016, Emilio G. Cota + * + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#ifndef QEMU_QDIST_H +#define QEMU_QDIST_H + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qemu/bitops.h" + +/* + * Samples with the same 'x value' end up in the same qdist_entry, + * e.g. 
inc(0.1) and inc(0.1) end up as {x=0.1, count=2}. + * + * Binning happens only at print time, so that we retain the flexibility to + * choose the binning. This might not be ideal for workloads that do not care + * much about precision and insert many samples all with different x values; + * in that case, pre-binning (e.g. entering both 0.115 and 0.097 as 0.1) + * should be considered. + */ +struct qdist_entry { + double x; + unsigned long count; +}; + +struct qdist { + struct qdist_entry *entries; + size_t n; + size_t size; +}; + +#define QDIST_PR_BORDER BIT(0) +#define QDIST_PR_LABELS BIT(1) +/* the remaining options only work if PR_LABELS is set */ +#define QDIST_PR_NODECIMAL BIT(2) +#define QDIST_PR_PERCENT BIT(3) +#define QDIST_PR_100X BIT(4) +#define QDIST_PR_NOBINRANGE BIT(5) + +void qdist_init(struct qdist *dist); +void qdist_destroy(struct qdist *dist); + +void qdist_add(struct qdist *dist, double x, long count); +void qdist_inc(struct qdist *dist, double x); +double qdist_xmin(const struct qdist *dist); +double qdist_xmax(const struct qdist *dist); +double qdist_avg(const struct qdist *dist); +unsigned long qdist_sample_count(const struct qdist *dist); +size_t qdist_unique_entries(const struct qdist *dist); + +/* callers must free the returned string with g_free() */ +char *qdist_pr_plain(const struct qdist *dist, size_t n_groups); + +/* callers must free the returned string with g_free() */ +char *qdist_pr(const struct qdist *dist, size_t n_groups, uint32_t opt); + +/* Only qdist code and test code should ever call this function */ +void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n); + +#endif /* QEMU_QDIST_H */ diff --git a/util/Makefile.objs b/util/Makefile.objs index a8a777ec40..702435e839 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -32,3 +32,4 @@ util-obj-y += buffer.o util-obj-y += timed-average.o util-obj-y += base64.o util-obj-y += log.o +util-obj-y += qdist.o diff --git a/util/qdist.c b/util/qdist.c new file mode 100644 index 0000000000..4ea2e34fc2 --- /dev/null +++ b/util/qdist.c @@ -0,0 +1,395 @@ +/* + * qdist.c - QEMU helpers for handling frequency distributions of data. + * + * Copyright (C) 2016, Emilio G. Cota + * + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ +#include "qemu/qdist.h" + +#include +#ifndef NAN +#define NAN (0.0 / 0.0) +#endif + +void qdist_init(struct qdist *dist) +{ + dist->entries = g_malloc(sizeof(*dist->entries)); + dist->size = 1; + dist->n = 0; +} + +void qdist_destroy(struct qdist *dist) +{ + g_free(dist->entries); +} + +static inline int qdist_cmp_double(double a, double b) +{ + if (a > b) { + return 1; + } else if (a < b) { + return -1; + } + return 0; +} + +static int qdist_cmp(const void *ap, const void *bp) +{ + const struct qdist_entry *a = ap; + const struct qdist_entry *b = bp; + + return qdist_cmp_double(a->x, b->x); +} + +void qdist_add(struct qdist *dist, double x, long count) +{ + struct qdist_entry *entry = NULL; + + if (dist->n) { + struct qdist_entry e; + + e.x = x; + entry = bsearch(&e, dist->entries, dist->n, sizeof(e), qdist_cmp); + } + + if (entry) { + entry->count += count; + return; + } + + if (unlikely(dist->n == dist->size)) { + dist->size *= 2; + dist->entries = g_realloc(dist->entries, + sizeof(*dist->entries) * (dist->size)); + } + dist->n++; + entry = &dist->entries[dist->n - 1]; + entry->x = x; + entry->count = count; + qsort(dist->entries, dist->n, sizeof(*entry), qdist_cmp); +} + +void qdist_inc(struct qdist *dist, double x) +{ + qdist_add(dist, x, 1); +} + +/* + * Unicode for block elements. See: + * https://en.wikipedia.org/wiki/Block_Elements + */ +static const gunichar qdist_blocks[] = { + 0x2581, + 0x2582, + 0x2583, + 0x2584, + 0x2585, + 0x2586, + 0x2587, + 0x2588 +}; + +#define QDIST_NR_BLOCK_CODES ARRAY_SIZE(qdist_blocks) + +/* + * Print a distribution into a string. + * + * This function assumes that appropriate binning has been done on the input; + * see qdist_bin__internal() and qdist_pr_plain(). + * + * Callers must free the returned string with g_free(). + */ +static char *qdist_pr_internal(const struct qdist *dist) +{ + double min, max; + GString *s = g_string_new(""); + size_t i; + + /* if only one entry, its printout will be either full or empty */ + if (dist->n == 1) { + if (dist->entries[0].count) { + g_string_append_unichar(s, qdist_blocks[QDIST_NR_BLOCK_CODES - 1]); + } else { + g_string_append_c(s, ' '); + } + goto out; + } + + /* get min and max counts */ + min = dist->entries[0].count; + max = min; + for (i = 0; i < dist->n; i++) { + struct qdist_entry *e = &dist->entries[i]; + + if (e->count < min) { + min = e->count; + } + if (e->count > max) { + max = e->count; + } + } + + for (i = 0; i < dist->n; i++) { + struct qdist_entry *e = &dist->entries[i]; + int index; + + /* make an exception with 0; instead of using block[0], print a space */ + if (e->count) { + /* divide first to avoid loss of precision when e->count == max */ + index = (e->count - min) / (max - min) * (QDIST_NR_BLOCK_CODES - 1); + g_string_append_unichar(s, qdist_blocks[index]); + } else { + g_string_append_c(s, ' '); + } + } + out: + return g_string_free(s, FALSE); +} + +/* + * Bin the distribution in @from into @n bins of consecutive, non-overlapping + * intervals, copying the result to @to. + * + * This function is internal to qdist: only this file and test code should + * ever call it. + * + * Note: calling this function on an already-binned qdist is a bug. + * + * If @n == 0 or @from->n == 1, use @from->n. 
+ */ +void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n) +{ + double xmin, xmax; + double step; + size_t i, j; + + qdist_init(to); + + if (from->n == 0) { + return; + } + if (n == 0 || from->n == 1) { + n = from->n; + } + + /* set equally-sized bins between @from's left and right */ + xmin = qdist_xmin(from); + xmax = qdist_xmax(from); + step = (xmax - xmin) / n; + + if (n == from->n) { + /* if @from's entries are equally spaced, no need to re-bin */ + for (i = 0; i < from->n; i++) { + if (from->entries[i].x != xmin + i * step) { + goto rebin; + } + } + /* they're equally spaced, so copy the dist and bail out */ + to->entries = g_new(struct qdist_entry, from->n); + to->n = from->n; + memcpy(to->entries, from->entries, sizeof(*to->entries) * to->n); + return; + } + + rebin: + j = 0; + for (i = 0; i < n; i++) { + double x; + double left, right; + + left = xmin + i * step; + right = xmin + (i + 1) * step; + + /* Add x, even if it might not get any counts later */ + x = left; + qdist_add(to, x, 0); + + /* + * To avoid double-counting we capture [left, right) ranges, except for + * the righmost bin, which captures a [left, right] range. + */ + while (j < from->n && (from->entries[j].x < right || i == n - 1)) { + struct qdist_entry *o = &from->entries[j]; + + qdist_add(to, x, o->count); + j++; + } + } +} + +/* + * Print @dist into a string, after re-binning it into @n bins of consecutive, + * non-overlapping intervals. + * + * If @n == 0, use @orig->n. + * + * Callers must free the returned string with g_free(). + */ +char *qdist_pr_plain(const struct qdist *dist, size_t n) +{ + struct qdist binned; + char *ret; + + if (dist->n == 0) { + return NULL; + } + qdist_bin__internal(&binned, dist, n); + ret = qdist_pr_internal(&binned); + qdist_destroy(&binned); + return ret; +} + +static char *qdist_pr_label(const struct qdist *dist, size_t n_bins, + uint32_t opt, bool is_left) +{ + const char *percent; + const char *lparen; + const char *rparen; + GString *s; + double x1, x2, step; + double x; + double n; + int dec; + + s = g_string_new(""); + if (!(opt & QDIST_PR_LABELS)) { + goto out; + } + + dec = opt & QDIST_PR_NODECIMAL ? 0 : 1; + percent = opt & QDIST_PR_PERCENT ? "%" : ""; + + n = n_bins ? n_bins : dist->n; + x = is_left ? qdist_xmin(dist) : qdist_xmax(dist); + step = (qdist_xmax(dist) - qdist_xmin(dist)) / n; + + if (opt & QDIST_PR_100X) { + x *= 100.0; + step *= 100.0; + } + if (opt & QDIST_PR_NOBINRANGE) { + lparen = rparen = ""; + x1 = x; + x2 = x; /* unnecessary, but a dumb compiler might not figure it out */ + } else { + lparen = "["; + rparen = is_left ? ")" : "]"; + if (is_left) { + x1 = x; + x2 = x + step; + } else { + x1 = x - step; + x2 = x; + } + } + g_string_append_printf(s, "%s%.*f", lparen, dec, x1); + if (!(opt & QDIST_PR_NOBINRANGE)) { + g_string_append_printf(s, ",%.*f%s", dec, x2, rparen); + } + g_string_append(s, percent); + out: + return g_string_free(s, FALSE); +} + +/* + * Print the distribution's histogram into a string. + * + * See also: qdist_pr_plain(). + * + * Callers must free the returned string with g_free(). + */ +char *qdist_pr(const struct qdist *dist, size_t n_bins, uint32_t opt) +{ + const char *border = opt & QDIST_PR_BORDER ? 
"|" : ""; + char *llabel, *rlabel; + char *hgram; + GString *s; + + if (dist->n == 0) { + return NULL; + } + + s = g_string_new(""); + + llabel = qdist_pr_label(dist, n_bins, opt, true); + rlabel = qdist_pr_label(dist, n_bins, opt, false); + hgram = qdist_pr_plain(dist, n_bins); + g_string_append_printf(s, "%s%s%s%s%s", + llabel, border, hgram, border, rlabel); + g_free(llabel); + g_free(rlabel); + g_free(hgram); + + return g_string_free(s, FALSE); +} + +static inline double qdist_x(const struct qdist *dist, int index) +{ + if (dist->n == 0) { + return NAN; + } + return dist->entries[index].x; +} + +double qdist_xmin(const struct qdist *dist) +{ + return qdist_x(dist, 0); +} + +double qdist_xmax(const struct qdist *dist) +{ + return qdist_x(dist, dist->n - 1); +} + +size_t qdist_unique_entries(const struct qdist *dist) +{ + return dist->n; +} + +unsigned long qdist_sample_count(const struct qdist *dist) +{ + unsigned long count = 0; + size_t i; + + for (i = 0; i < dist->n; i++) { + struct qdist_entry *e = &dist->entries[i]; + + count += e->count; + } + return count; +} + +static double qdist_pairwise_avg(const struct qdist *dist, size_t index, + size_t n, unsigned long count) +{ + /* amortize the recursion by using a base case > 2 */ + if (n <= 8) { + size_t i; + double ret = 0; + + for (i = 0; i < n; i++) { + struct qdist_entry *e = &dist->entries[index + i]; + + ret += e->x * e->count / count; + } + return ret; + } else { + size_t n2 = n / 2; + + return qdist_pairwise_avg(dist, index, n2, count) + + qdist_pairwise_avg(dist, index + n2, n - n2, count); + } +} + +double qdist_avg(const struct qdist *dist) +{ + unsigned long count; + + count = qdist_sample_count(dist); + if (!count) { + return NAN; + } + return qdist_pairwise_avg(dist, 0, dist->n, count); +} From ff9249b7337df52ff36cd8d35da496f6d9d009b1 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Wed, 8 Jun 2016 14:55:27 -0400 Subject: [PATCH 09/15] qdist: add test program Acked-by: Sergey Fedorov Reviewed-by: Richard Henderson Signed-off-by: Emilio G. 
Cota Message-Id: <1465412133-3029-10-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- tests/.gitignore | 1 + tests/Makefile.include | 6 +- tests/test-qdist.c | 384 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 390 insertions(+), 1 deletion(-) create mode 100644 tests/test-qdist.c diff --git a/tests/.gitignore b/tests/.gitignore index a06a8ba26f..7c0d156b74 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -48,6 +48,7 @@ test-qapi-types.[ch] test-qapi-visit.[ch] test-qdev-global-props test-qemu-opts +test-qdist test-qga test-qmp-commands test-qmp-commands.h diff --git a/tests/Makefile.include b/tests/Makefile.include index a3e20e39ec..294894558e 100644 --- a/tests/Makefile.include +++ b/tests/Makefile.include @@ -70,6 +70,8 @@ check-unit-y += tests/rcutorture$(EXESUF) gcov-files-rcutorture-y = util/rcu.c check-unit-y += tests/test-rcu-list$(EXESUF) gcov-files-test-rcu-list-y = util/rcu.c +check-unit-y += tests/test-qdist$(EXESUF) +gcov-files-test-qdist-y = util/qdist.c check-unit-y += tests/test-bitops$(EXESUF) check-unit-$(CONFIG_HAS_GLIB_SUBPROCESS_TESTS) += tests/test-qdev-global-props$(EXESUF) check-unit-y += tests/check-qom-interface$(EXESUF) @@ -394,7 +396,8 @@ test-obj-y = tests/check-qint.o tests/check-qstring.o tests/check-qdict.o \ tests/test-qmp-commands.o tests/test-visitor-serialization.o \ tests/test-x86-cpuid.o tests/test-mul64.o tests/test-int128.o \ tests/test-opts-visitor.o tests/test-qmp-event.o \ - tests/rcutorture.o tests/test-rcu-list.o + tests/rcutorture.o tests/test-rcu-list.o \ + tests/test-qdist.o $(test-obj-y): QEMU_INCLUDES += -Itests QEMU_CFLAGS += -I$(SRC_PATH)/tests @@ -433,6 +436,7 @@ tests/test-cutils$(EXESUF): tests/test-cutils.o util/cutils.o tests/test-int128$(EXESUF): tests/test-int128.o tests/rcutorture$(EXESUF): tests/rcutorture.o $(test-util-obj-y) tests/test-rcu-list$(EXESUF): tests/test-rcu-list.o $(test-util-obj-y) +tests/test-qdist$(EXESUF): tests/test-qdist.o $(test-util-obj-y) tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \ hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\ diff --git a/tests/test-qdist.c b/tests/test-qdist.c new file mode 100644 index 0000000000..a67f26057e --- /dev/null +++ b/tests/test-qdist.c @@ -0,0 +1,384 @@ +/* + * Copyright (C) 2016, Emilio G. Cota + * + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ +#include "qemu/osdep.h" +#include +#include "qemu/qdist.h" + +#include + +struct entry_desc { + double x; + unsigned long count; + + /* 0 prints a space, 1-8 prints from qdist_blocks[] */ + int fill_code; +}; + +/* See: https://en.wikipedia.org/wiki/Block_Elements */ +static const gunichar qdist_blocks[] = { + 0x2581, + 0x2582, + 0x2583, + 0x2584, + 0x2585, + 0x2586, + 0x2587, + 0x2588 +}; + +#define QDIST_NR_BLOCK_CODES ARRAY_SIZE(qdist_blocks) + +static char *pr_hist(const struct entry_desc *darr, size_t n) +{ + GString *s = g_string_new(""); + size_t i; + + for (i = 0; i < n; i++) { + int fill = darr[i].fill_code; + + if (fill) { + assert(fill <= QDIST_NR_BLOCK_CODES); + g_string_append_unichar(s, qdist_blocks[fill - 1]); + } else { + g_string_append_c(s, ' '); + } + } + return g_string_free(s, FALSE); +} + +static void +histogram_check(const struct qdist *dist, const struct entry_desc *darr, + size_t n, size_t n_bins) +{ + char *pr = qdist_pr_plain(dist, n_bins); + char *str = pr_hist(darr, n); + + g_assert_cmpstr(pr, ==, str); + g_free(pr); + g_free(str); +} + +static void histogram_check_single_full(const struct qdist *dist, size_t n_bins) +{ + struct entry_desc desc = { .fill_code = 8 }; + + histogram_check(dist, &desc, 1, n_bins); +} + +static void +entries_check(const struct qdist *dist, const struct entry_desc *darr, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) { + struct qdist_entry *e = &dist->entries[i]; + + g_assert_cmpuint(e->count, ==, darr[i].count); + } +} + +static void +entries_insert(struct qdist *dist, const struct entry_desc *darr, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) { + qdist_add(dist, darr[i].x, darr[i].count); + } +} + +static void do_test_bin(const struct entry_desc *a, size_t n_a, + const struct entry_desc *b, size_t n_b) +{ + struct qdist qda; + struct qdist qdb; + + qdist_init(&qda); + + entries_insert(&qda, a, n_a); + qdist_inc(&qda, a[0].x); + qdist_add(&qda, a[0].x, -1); + + g_assert_cmpuint(qdist_unique_entries(&qda), ==, n_a); + g_assert_cmpfloat(qdist_xmin(&qda), ==, a[0].x); + g_assert_cmpfloat(qdist_xmax(&qda), ==, a[n_a - 1].x); + histogram_check(&qda, a, n_a, 0); + histogram_check(&qda, a, n_a, n_a); + + qdist_bin__internal(&qdb, &qda, n_b); + g_assert_cmpuint(qdb.n, ==, n_b); + entries_check(&qdb, b, n_b); + g_assert_cmpuint(qdist_sample_count(&qda), ==, qdist_sample_count(&qdb)); + /* + * No histogram_check() for $qdb, since we'd rebin it and that is a bug. + * Instead, regenerate it from $qda. 
+ */ + histogram_check(&qda, b, n_b, n_b); + + qdist_destroy(&qdb); + qdist_destroy(&qda); +} + +static void do_test_pr(uint32_t opt) +{ + static const struct entry_desc desc[] = { + [0] = { 1, 900, 8 }, + [1] = { 2, 1, 1 }, + [2] = { 3, 2, 1 } + }; + static const char border[] = "|"; + const char *llabel = NULL; + const char *rlabel = NULL; + struct qdist dist; + GString *s; + char *str; + char *pr; + size_t n; + + n = ARRAY_SIZE(desc); + qdist_init(&dist); + + entries_insert(&dist, desc, n); + histogram_check(&dist, desc, n, 0); + + s = g_string_new(""); + + if (opt & QDIST_PR_LABELS) { + unsigned int lopts = opt & (QDIST_PR_NODECIMAL | + QDIST_PR_PERCENT | + QDIST_PR_100X | + QDIST_PR_NOBINRANGE); + + if (lopts == 0) { + llabel = "[1.0,1.7)"; + rlabel = "[2.3,3.0]"; + } else if (lopts == QDIST_PR_NODECIMAL) { + llabel = "[1,2)"; + rlabel = "[2,3]"; + } else if (lopts == (QDIST_PR_PERCENT | QDIST_PR_NODECIMAL)) { + llabel = "[1,2)%"; + rlabel = "[2,3]%"; + } else if (lopts == QDIST_PR_100X) { + llabel = "[100.0,166.7)"; + rlabel = "[233.3,300.0]"; + } else if (lopts == (QDIST_PR_NOBINRANGE | QDIST_PR_NODECIMAL)) { + llabel = "1"; + rlabel = "3"; + } else { + g_assert_cmpstr("BUG", ==, "This is not meant to be exhaustive"); + } + } + + if (llabel) { + g_string_append(s, llabel); + } + if (opt & QDIST_PR_BORDER) { + g_string_append(s, border); + } + + str = pr_hist(desc, n); + g_string_append(s, str); + g_free(str); + + if (opt & QDIST_PR_BORDER) { + g_string_append(s, border); + } + if (rlabel) { + g_string_append(s, rlabel); + } + + str = g_string_free(s, FALSE); + pr = qdist_pr(&dist, n, opt); + g_assert_cmpstr(pr, ==, str); + g_free(pr); + g_free(str); + + qdist_destroy(&dist); +} + +static inline void do_test_pr_label(uint32_t opt) +{ + opt |= QDIST_PR_LABELS; + do_test_pr(opt); +} + +static void test_pr(void) +{ + do_test_pr(0); + + do_test_pr(QDIST_PR_BORDER); + + /* 100X should be ignored because we're not setting LABELS */ + do_test_pr(QDIST_PR_100X); + + do_test_pr_label(0); + do_test_pr_label(QDIST_PR_NODECIMAL); + do_test_pr_label(QDIST_PR_PERCENT | QDIST_PR_NODECIMAL); + do_test_pr_label(QDIST_PR_100X); + do_test_pr_label(QDIST_PR_NOBINRANGE | QDIST_PR_NODECIMAL); +} + +static void test_bin_shrink(void) +{ + static const struct entry_desc a[] = { + [0] = { 0.0, 42922, 7 }, + [1] = { 0.25, 47834, 8 }, + [2] = { 0.50, 26628, 0 }, + [3] = { 0.625, 597, 4 }, + [4] = { 0.75, 10298, 1 }, + [5] = { 0.875, 22, 2 }, + [6] = { 1.0, 2771, 1 } + }; + static const struct entry_desc b[] = { + [0] = { 0.0, 42922, 7 }, + [1] = { 0.25, 47834, 8 }, + [2] = { 0.50, 27225, 3 }, + [3] = { 0.75, 13091, 1 } + }; + + return do_test_bin(a, ARRAY_SIZE(a), b, ARRAY_SIZE(b)); +} + +static void test_bin_expand(void) +{ + static const struct entry_desc a[] = { + [0] = { 0.0, 11713, 5 }, + [1] = { 0.25, 20294, 0 }, + [2] = { 0.50, 17266, 8 }, + [3] = { 0.625, 1506, 0 }, + [4] = { 0.75, 10355, 6 }, + [5] = { 0.833, 2, 1 }, + [6] = { 0.875, 99, 4 }, + [7] = { 1.0, 4301, 2 } + }; + static const struct entry_desc b[] = { + [0] = { 0.0, 11713, 5 }, + [1] = { 0.0, 0, 0 }, + [2] = { 0.0, 20294, 8 }, + [3] = { 0.0, 0, 0 }, + [4] = { 0.0, 0, 0 }, + [5] = { 0.0, 17266, 6 }, + [6] = { 0.0, 1506, 1 }, + [7] = { 0.0, 10355, 4 }, + [8] = { 0.0, 101, 1 }, + [9] = { 0.0, 4301, 2 } + }; + + return do_test_bin(a, ARRAY_SIZE(a), b, ARRAY_SIZE(b)); +} + +static void test_bin_precision(void) +{ + static const struct entry_desc a[] = { + [0] = { 0, 213549, 8 }, + [1] = { 1, 70, 1 }, + }; + static const struct entry_desc b[] = { 
+ [0] = { 0, 213549, 8 }, + [1] = { 0, 70, 1 }, + }; + + return do_test_bin(a, ARRAY_SIZE(a), b, ARRAY_SIZE(b)); +} + +static void test_bin_simple(void) +{ + static const struct entry_desc a[] = { + [0] = { 10, 101, 8 }, + [1] = { 11, 0, 0 }, + [2] = { 12, 2, 1 } + }; + static const struct entry_desc b[] = { + [0] = { 0, 101, 8 }, + [1] = { 0, 0, 0 }, + [2] = { 0, 0, 0 }, + [3] = { 0, 0, 0 }, + [4] = { 0, 2, 1 } + }; + + return do_test_bin(a, ARRAY_SIZE(a), b, ARRAY_SIZE(b)); +} + +static void test_single_full(void) +{ + struct qdist dist; + + qdist_init(&dist); + + qdist_add(&dist, 3, 102); + g_assert_cmpfloat(qdist_avg(&dist), ==, 3); + g_assert_cmpfloat(qdist_xmin(&dist), ==, 3); + g_assert_cmpfloat(qdist_xmax(&dist), ==, 3); + + histogram_check_single_full(&dist, 0); + histogram_check_single_full(&dist, 1); + histogram_check_single_full(&dist, 10); + + qdist_destroy(&dist); +} + +static void test_single_empty(void) +{ + struct qdist dist; + char *pr; + + qdist_init(&dist); + + qdist_add(&dist, 3, 0); + g_assert_cmpuint(qdist_sample_count(&dist), ==, 0); + g_assert(isnan(qdist_avg(&dist))); + g_assert_cmpfloat(qdist_xmin(&dist), ==, 3); + g_assert_cmpfloat(qdist_xmax(&dist), ==, 3); + + pr = qdist_pr_plain(&dist, 0); + g_assert_cmpstr(pr, ==, " "); + g_free(pr); + + pr = qdist_pr_plain(&dist, 1); + g_assert_cmpstr(pr, ==, " "); + g_free(pr); + + pr = qdist_pr_plain(&dist, 2); + g_assert_cmpstr(pr, ==, " "); + g_free(pr); + + qdist_destroy(&dist); +} + +static void test_none(void) +{ + struct qdist dist; + char *pr; + + qdist_init(&dist); + + g_assert(isnan(qdist_avg(&dist))); + g_assert(isnan(qdist_xmin(&dist))); + g_assert(isnan(qdist_xmax(&dist))); + + pr = qdist_pr_plain(&dist, 0); + g_assert(pr == NULL); + + pr = qdist_pr_plain(&dist, 2); + g_assert(pr == NULL); + + qdist_destroy(&dist); +} + +int main(int argc, char *argv[]) +{ + g_test_init(&argc, &argv, NULL); + g_test_add_func("/qdist/none", test_none); + g_test_add_func("/qdist/single/empty", test_single_empty); + g_test_add_func("/qdist/single/full", test_single_full); + g_test_add_func("/qdist/binning/simple", test_bin_simple); + g_test_add_func("/qdist/binning/precision", test_bin_precision); + g_test_add_func("/qdist/binning/expand", test_bin_expand); + g_test_add_func("/qdist/binning/shrink", test_bin_shrink); + g_test_add_func("/qdist/pr", test_pr); + return g_test_run(); +} From 2e11264aafe476c7a53accde4a23cfc2395a02fd Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Wed, 8 Jun 2016 14:55:28 -0400 Subject: [PATCH 10/15] qht: QEMU's fast, resizable and scalable Hash Table This is a fast, scalable chained hash table with optional auto-resizing, allowing reads that are concurrent with reads, and reads/writes that are concurrent with writes to separate buckets. A hash table with these features will be necessary for the scalability of the ongoing MTTCG work; before those changes arrive we can already benefit from the single-threaded speedup that qht also provides. Signed-off-by: Emilio G. Cota Message-Id: <1465412133-3029-11-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- include/qemu/qht.h | 183 ++++++++++ util/Makefile.objs | 1 + util/qht.c | 833 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1017 insertions(+) create mode 100644 include/qemu/qht.h create mode 100644 util/qht.c diff --git a/include/qemu/qht.h b/include/qemu/qht.h new file mode 100644 index 0000000000..aec60aa534 --- /dev/null +++ b/include/qemu/qht.h @@ -0,0 +1,183 @@ +/* + * Copyright (C) 2016, Emilio G. 
Cota + * + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#ifndef QEMU_QHT_H +#define QEMU_QHT_H + +#include "qemu/osdep.h" +#include "qemu/seqlock.h" +#include "qemu/thread.h" +#include "qemu/qdist.h" + +struct qht { + struct qht_map *map; + QemuMutex lock; /* serializes setters of ht->map */ + unsigned int mode; +}; + +/** + * struct qht_stats - Statistics of a QHT + * @head_buckets: number of head buckets + * @used_head_buckets: number of non-empty head buckets + * @entries: total number of entries + * @chain: frequency distribution representing the number of buckets in each + * chain, excluding empty chains. + * @occupancy: frequency distribution representing chain occupancy rate. + * Valid range: from 0.0 (empty) to 1.0 (full occupancy). + * + * An entry is a pointer-hash pair. + * Each bucket can host several entries. + * Chains are chains of buckets, whose first link is always a head bucket. + */ +struct qht_stats { + size_t head_buckets; + size_t used_head_buckets; + size_t entries; + struct qdist chain; + struct qdist occupancy; +}; + +typedef bool (*qht_lookup_func_t)(const void *obj, const void *userp); +typedef void (*qht_iter_func_t)(struct qht *ht, void *p, uint32_t h, void *up); + +#define QHT_MODE_AUTO_RESIZE 0x1 /* auto-resize when heavily loaded */ + +/** + * qht_init - Initialize a QHT + * @ht: QHT to be initialized + * @n_elems: number of entries the hash table should be optimized for. + * @mode: bitmask with OR'ed QHT_MODE_* + */ +void qht_init(struct qht *ht, size_t n_elems, unsigned int mode); + +/** + * qht_destroy - destroy a previously initialized QHT + * @ht: QHT to be destroyed + * + * Call only when there are no readers/writers left. + */ +void qht_destroy(struct qht *ht); + +/** + * qht_insert - Insert a pointer into the hash table + * @ht: QHT to insert to + * @p: pointer to be inserted + * @hash: hash corresponding to @p + * + * Attempting to insert a NULL @p is a bug. + * Inserting the same pointer @p with different @hash values is a bug. + * + * Returns true on sucess. + * Returns false if the @p-@hash pair already exists in the hash table. + */ +bool qht_insert(struct qht *ht, void *p, uint32_t hash); + +/** + * qht_lookup - Look up a pointer in a QHT + * @ht: QHT to be looked up + * @func: function to compare existing pointers against @userp + * @userp: pointer to pass to @func + * @hash: hash of the pointer to be looked up + * + * Needs to be called under an RCU read-critical section. + * + * The user-provided @func compares pointers in QHT against @userp. + * If the function returns true, a match has been found. + * + * Returns the corresponding pointer when a match is found. + * Returns NULL otherwise. + */ +void *qht_lookup(struct qht *ht, qht_lookup_func_t func, const void *userp, + uint32_t hash); + +/** + * qht_remove - remove a pointer from the hash table + * @ht: QHT to remove from + * @p: pointer to be removed + * @hash: hash corresponding to @p + * + * Attempting to remove a NULL @p is a bug. + * + * Just-removed @p pointers cannot be immediately freed; they need to remain + * valid until the end of the RCU grace period in which qht_remove() is called. + * This guarantees that concurrent lookups will always compare against valid + * data. + * + * Returns true on success. + * Returns false if the @p-@hash pair was not found. 
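To make the API above concrete, here is a brief usage sketch from this editor (not part of the patch); the entry type, compare callback, table sizing and hash choice are illustrative only:

    #include "qemu/qht.h"
    #include "exec/tb-hash-xx.h"

    struct my_entry {
        uint32_t key;
        int val;
    };

    /* qht_lookup_func_t: return true when @obj matches the lookup key @userp */
    static bool my_entry_cmp(const void *obj, const void *userp)
    {
        const struct my_entry *e = obj;
        const uint32_t *key = userp;

        return e->key == *key;
    }

    static void qht_usage_sketch(void)
    {
        struct qht ht;
        struct my_entry *e = g_new0(struct my_entry, 1);
        uint32_t key = 42;
        uint32_t hash = tb_hash_func5(key, 0, 0);

        qht_init(&ht, 1 << 10, QHT_MODE_AUTO_RESIZE);

        e->key = key;
        qht_insert(&ht, e, hash);        /* returns false if the pair already exists */

        /* lookups must run inside an RCU read-critical section */
        assert(qht_lookup(&ht, my_entry_cmp, &key, hash) == e);

        qht_remove(&ht, e, hash);
        /* @e must stay valid until the current RCU grace period ends */

        qht_destroy(&ht);
    }

Writers targeting different buckets proceed concurrently; lookups do not take the bucket lock, detecting concurrent writes via the per-bucket seqlock and retrying, as described in the qht.c comments below.
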
+ */ +bool qht_remove(struct qht *ht, const void *p, uint32_t hash); + +/** + * qht_reset - reset a QHT + * @ht: QHT to be reset + * + * All entries in the hash table are reset. No resizing is performed. + * + * If concurrent readers may exist, the objects pointed to by the hash table + * must remain valid for the existing RCU grace period -- see qht_remove(). + * See also: qht_reset_size() + */ +void qht_reset(struct qht *ht); + +/** + * qht_reset_size - reset and resize a QHT + * @ht: QHT to be reset and resized + * @n_elems: number of entries the resized hash table should be optimized for. + * + * Returns true if the resize was necessary and therefore performed. + * Returns false otherwise. + * + * If concurrent readers may exist, the objects pointed to by the hash table + * must remain valid for the existing RCU grace period -- see qht_remove(). + * See also: qht_reset(), qht_resize(). + */ +bool qht_reset_size(struct qht *ht, size_t n_elems); + +/** + * qht_resize - resize a QHT + * @ht: QHT to be resized + * @n_elems: number of entries the resized hash table should be optimized for + * + * Returns true on success. + * Returns false if the resize was not necessary and therefore not performed. + * See also: qht_reset_size(). + */ +bool qht_resize(struct qht *ht, size_t n_elems); + +/** + * qht_iter - Iterate over a QHT + * @ht: QHT to be iterated over + * @func: function to be called for each entry in QHT + * @userp: additional pointer to be passed to @func + * + * Each time it is called, user-provided @func is passed a pointer-hash pair, + * plus @userp. + */ +void qht_iter(struct qht *ht, qht_iter_func_t func, void *userp); + +/** + * qht_statistics_init - Gather statistics from a QHT + * @ht: QHT to gather statistics from + * @stats: pointer to a struct qht_stats to be filled in + * + * Does NOT need to be called under an RCU read-critical section, + * since it does not dereference any pointers stored in the hash table. + * + * When done with @stats, pass the struct to qht_statistics_destroy(). + * Failing to do this will leak memory. + */ +void qht_statistics_init(struct qht *ht, struct qht_stats *stats); + +/** + * qht_statistics_destroy - Destroy a struct qht_stats + * @stats: stuct qht_stats to be destroyed + * + * See also: qht_statistics_init(). + */ +void qht_statistics_destroy(struct qht_stats *stats); + +#endif /* QEMU_QHT_H */ diff --git a/util/Makefile.objs b/util/Makefile.objs index 702435e839..45f8794864 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -33,3 +33,4 @@ util-obj-y += timed-average.o util-obj-y += base64.o util-obj-y += log.o util-obj-y += qdist.o +util-obj-y += qht.o diff --git a/util/qht.c b/util/qht.c new file mode 100644 index 0000000000..6f749098f4 --- /dev/null +++ b/util/qht.c @@ -0,0 +1,833 @@ +/* + * qht.c - QEMU Hash Table, designed to scale for read-mostly workloads. + * + * Copyright (C) 2016, Emilio G. Cota + * + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + * Assumptions: + * - NULL cannot be inserted/removed as a pointer value. + * - Trying to insert an already-existing hash-pointer pair is OK. However, + * it is not OK to insert into the same hash table different hash-pointer + * pairs that have the same pointer value, but not the hashes. + * - Lookups are performed under an RCU read-critical section; removals + * must wait for a grace period to elapse before freeing removed objects. + * + * Features: + * - Reads (i.e. lookups and iterators) can be concurrent with other reads. 
+ * Lookups that are concurrent with writes to the same bucket will retry + * via a seqlock; iterators acquire all bucket locks and therefore can be + * concurrent with lookups and are serialized wrt writers. + * - Writes (i.e. insertions/removals) can be concurrent with writes to + * different buckets; writes to the same bucket are serialized through a lock. + * - Optional auto-resizing: the hash table resizes up if the load surpasses + * a certain threshold. Resizing is done concurrently with readers; writes + * are serialized with the resize operation. + * + * The key structure is the bucket, which is cacheline-sized. Buckets + * contain a few hash values and pointers; the u32 hash values are stored in + * full so that resizing is fast. Having this structure instead of directly + * chaining items has two advantages: + * - Failed lookups fail fast, and touch a minimum number of cache lines. + * - Resizing the hash table with concurrent lookups is easy. + * + * There are two types of buckets: + * 1. "head" buckets are the ones allocated in the array of buckets in qht_map. + * 2. all "non-head" buckets (i.e. all others) are members of a chain that + * starts from a head bucket. + * Note that the seqlock and spinlock of a head bucket applies to all buckets + * chained to it; these two fields are unused in non-head buckets. + * + * On removals, we move the last valid item in the chain to the position of the + * just-removed entry. This makes lookups slightly faster, since the moment an + * invalid entry is found, the (failed) lookup is over. + * + * Resizing is done by taking all bucket spinlocks (so that no other writers can + * race with us) and then copying all entries into a new hash map. Then, the + * ht->map pointer is set, and the old map is freed once no RCU readers can see + * it anymore. + * + * Writers check for concurrent resizes by comparing ht->map before and after + * acquiring their bucket lock. If they don't match, a resize has occured + * while the bucket spinlock was being acquired. + * + * Related Work: + * - Idea of cacheline-sized buckets with full hashes taken from: + * David, Guerraoui & Trigonakis, "Asynchronized Concurrency: + * The Secret to Scaling Concurrent Search Data Structures", ASPLOS'15. + * - Why not RCU-based hash tables? They would allow us to get rid of the + * seqlock, but resizing would take forever since RCU read critical + * sections in QEMU take quite a long time. + * More info on relativistic hash tables: + * + Triplett, McKenney & Walpole, "Resizable, Scalable, Concurrent Hash + * Tables via Relativistic Programming", USENIX ATC'11. + * + Corbet, "Relativistic hash tables, part 1: Algorithms", @ lwn.net, 2014. + * https://lwn.net/Articles/612021/ + */ +#include "qemu/qht.h" +#include "qemu/atomic.h" +#include "qemu/rcu.h" + +//#define QHT_DEBUG + +/* + * We want to avoid false sharing of cache lines. Most systems have 64-byte + * cache lines so we go with it for simplicity. + * + * Note that systems with smaller cache lines will be fine (the struct is + * almost 64-bytes); systems with larger cache lines might suffer from + * some false sharing. + */ +#define QHT_BUCKET_ALIGN 64 + +/* define these to keep sizeof(qht_bucket) within QHT_BUCKET_ALIGN */ +#if HOST_LONG_BITS == 32 +#define QHT_BUCKET_ENTRIES 6 +#else /* 64-bit */ +#define QHT_BUCKET_ENTRIES 4 +#endif + +/* + * Note: reading partially-updated pointers in @pointers could lead to + * segfaults. 
We thus access them with atomic_read/set; this guarantees + * that the compiler makes all those accesses atomic. We also need the + * volatile-like behavior in atomic_read, since otherwise the compiler + * might refetch the pointer. + * atomic_read's are of course not necessary when the bucket lock is held. + * + * If both ht->lock and b->lock are grabbed, ht->lock should always + * be grabbed first. + */ +struct qht_bucket { + QemuSpin lock; + QemuSeqLock sequence; + uint32_t hashes[QHT_BUCKET_ENTRIES]; + void *pointers[QHT_BUCKET_ENTRIES]; + struct qht_bucket *next; +} QEMU_ALIGNED(QHT_BUCKET_ALIGN); + +QEMU_BUILD_BUG_ON(sizeof(struct qht_bucket) > QHT_BUCKET_ALIGN); + +/** + * struct qht_map - structure to track an array of buckets + * @rcu: used by RCU. Keep it as the top field in the struct to help valgrind + * find the whole struct. + * @buckets: array of head buckets. It is constant once the map is created. + * @n_buckets: number of head buckets. It is constant once the map is created. + * @n_added_buckets: number of added (i.e. "non-head") buckets + * @n_added_buckets_threshold: threshold to trigger an upward resize once the + * number of added buckets surpasses it. + * + * Buckets are tracked in what we call a "map", i.e. this structure. + */ +struct qht_map { + struct rcu_head rcu; + struct qht_bucket *buckets; + size_t n_buckets; + size_t n_added_buckets; + size_t n_added_buckets_threshold; +}; + +/* trigger a resize when n_added_buckets > n_buckets / div */ +#define QHT_NR_ADDED_BUCKETS_THRESHOLD_DIV 8 + +static void qht_do_resize(struct qht *ht, struct qht_map *new); +static void qht_grow_maybe(struct qht *ht); + +#ifdef QHT_DEBUG + +#define qht_debug_assert(X) do { assert(X); } while (0) + +static void qht_bucket_debug__locked(struct qht_bucket *b) +{ + bool seen_empty = false; + bool corrupt = false; + int i; + + do { + for (i = 0; i < QHT_BUCKET_ENTRIES; i++) { + if (b->pointers[i] == NULL) { + seen_empty = true; + continue; + } + if (seen_empty) { + fprintf(stderr, "%s: b: %p, pos: %i, hash: 0x%x, p: %p\n", + __func__, b, i, b->hashes[i], b->pointers[i]); + corrupt = true; + } + } + b = b->next; + } while (b); + qht_debug_assert(!corrupt); +} + +static void qht_map_debug__all_locked(struct qht_map *map) +{ + int i; + + for (i = 0; i < map->n_buckets; i++) { + qht_bucket_debug__locked(&map->buckets[i]); + } +} +#else + +#define qht_debug_assert(X) do { (void)(X); } while (0) + +static inline void qht_bucket_debug__locked(struct qht_bucket *b) +{ } + +static inline void qht_map_debug__all_locked(struct qht_map *map) +{ } +#endif /* QHT_DEBUG */ + +static inline size_t qht_elems_to_buckets(size_t n_elems) +{ + return pow2ceil(n_elems / QHT_BUCKET_ENTRIES); +} + +static inline void qht_head_init(struct qht_bucket *b) +{ + memset(b, 0, sizeof(*b)); + qemu_spin_init(&b->lock); + seqlock_init(&b->sequence); +} + +static inline +struct qht_bucket *qht_map_to_bucket(struct qht_map *map, uint32_t hash) +{ + return &map->buckets[hash & (map->n_buckets - 1)]; +} + +/* acquire all bucket locks from a map */ +static void qht_map_lock_buckets(struct qht_map *map) +{ + size_t i; + + for (i = 0; i < map->n_buckets; i++) { + struct qht_bucket *b = &map->buckets[i]; + + qemu_spin_lock(&b->lock); + } +} + +static void qht_map_unlock_buckets(struct qht_map *map) +{ + size_t i; + + for (i = 0; i < map->n_buckets; i++) { + struct qht_bucket *b = &map->buckets[i]; + + qemu_spin_unlock(&b->lock); + } +} + +/* + * Call with at least a bucket lock held. 
+ * @map should be the value read before acquiring the lock (or locks). + */ +static inline bool qht_map_is_stale__locked(struct qht *ht, struct qht_map *map) +{ + return map != ht->map; +} + +/* + * Grab all bucket locks, and set @pmap after making sure the map isn't stale. + * + * Pairs with qht_map_unlock_buckets(), hence the pass-by-reference. + * + * Note: callers cannot have ht->lock held. + */ +static inline +void qht_map_lock_buckets__no_stale(struct qht *ht, struct qht_map **pmap) +{ + struct qht_map *map; + + map = atomic_rcu_read(&ht->map); + qht_map_lock_buckets(map); + if (likely(!qht_map_is_stale__locked(ht, map))) { + *pmap = map; + return; + } + qht_map_unlock_buckets(map); + + /* we raced with a resize; acquire ht->lock to see the updated ht->map */ + qemu_mutex_lock(&ht->lock); + map = ht->map; + qht_map_lock_buckets(map); + qemu_mutex_unlock(&ht->lock); + *pmap = map; + return; +} + +/* + * Get a head bucket and lock it, making sure its parent map is not stale. + * @pmap is filled with a pointer to the bucket's parent map. + * + * Unlock with qemu_spin_unlock(&b->lock). + * + * Note: callers cannot have ht->lock held. + */ +static inline +struct qht_bucket *qht_bucket_lock__no_stale(struct qht *ht, uint32_t hash, + struct qht_map **pmap) +{ + struct qht_bucket *b; + struct qht_map *map; + + map = atomic_rcu_read(&ht->map); + b = qht_map_to_bucket(map, hash); + + qemu_spin_lock(&b->lock); + if (likely(!qht_map_is_stale__locked(ht, map))) { + *pmap = map; + return b; + } + qemu_spin_unlock(&b->lock); + + /* we raced with a resize; acquire ht->lock to see the updated ht->map */ + qemu_mutex_lock(&ht->lock); + map = ht->map; + b = qht_map_to_bucket(map, hash); + qemu_spin_lock(&b->lock); + qemu_mutex_unlock(&ht->lock); + *pmap = map; + return b; +} + +static inline bool qht_map_needs_resize(struct qht_map *map) +{ + return atomic_read(&map->n_added_buckets) > map->n_added_buckets_threshold; +} + +static inline void qht_chain_destroy(struct qht_bucket *head) +{ + struct qht_bucket *curr = head->next; + struct qht_bucket *prev; + + while (curr) { + prev = curr; + curr = curr->next; + qemu_vfree(prev); + } +} + +/* pass only an orphan map */ +static void qht_map_destroy(struct qht_map *map) +{ + size_t i; + + for (i = 0; i < map->n_buckets; i++) { + qht_chain_destroy(&map->buckets[i]); + } + qemu_vfree(map->buckets); + g_free(map); +} + +static struct qht_map *qht_map_create(size_t n_buckets) +{ + struct qht_map *map; + size_t i; + + map = g_malloc(sizeof(*map)); + map->n_buckets = n_buckets; + + map->n_added_buckets = 0; + map->n_added_buckets_threshold = n_buckets / + QHT_NR_ADDED_BUCKETS_THRESHOLD_DIV; + + /* let tiny hash tables to at least add one non-head bucket */ + if (unlikely(map->n_added_buckets_threshold == 0)) { + map->n_added_buckets_threshold = 1; + } + + map->buckets = qemu_memalign(QHT_BUCKET_ALIGN, + sizeof(*map->buckets) * n_buckets); + for (i = 0; i < n_buckets; i++) { + qht_head_init(&map->buckets[i]); + } + return map; +} + +void qht_init(struct qht *ht, size_t n_elems, unsigned int mode) +{ + struct qht_map *map; + size_t n_buckets = qht_elems_to_buckets(n_elems); + + ht->mode = mode; + qemu_mutex_init(&ht->lock); + map = qht_map_create(n_buckets); + atomic_rcu_set(&ht->map, map); +} + +/* call only when there are no readers/writers left */ +void qht_destroy(struct qht *ht) +{ + qht_map_destroy(ht->map); + memset(ht, 0, sizeof(*ht)); +} + +static void qht_bucket_reset__locked(struct qht_bucket *head) +{ + struct qht_bucket *b = head; + int i; + + 
seqlock_write_begin(&head->sequence); + do { + for (i = 0; i < QHT_BUCKET_ENTRIES; i++) { + if (b->pointers[i] == NULL) { + goto done; + } + b->hashes[i] = 0; + atomic_set(&b->pointers[i], NULL); + } + b = b->next; + } while (b); + done: + seqlock_write_end(&head->sequence); +} + +/* call with all bucket locks held */ +static void qht_map_reset__all_locked(struct qht_map *map) +{ + size_t i; + + for (i = 0; i < map->n_buckets; i++) { + qht_bucket_reset__locked(&map->buckets[i]); + } + qht_map_debug__all_locked(map); +} + +void qht_reset(struct qht *ht) +{ + struct qht_map *map; + + qht_map_lock_buckets__no_stale(ht, &map); + qht_map_reset__all_locked(map); + qht_map_unlock_buckets(map); +} + +bool qht_reset_size(struct qht *ht, size_t n_elems) +{ + struct qht_map *new; + struct qht_map *map; + size_t n_buckets; + bool resize = false; + + n_buckets = qht_elems_to_buckets(n_elems); + + qemu_mutex_lock(&ht->lock); + map = ht->map; + if (n_buckets != map->n_buckets) { + new = qht_map_create(n_buckets); + resize = true; + } + + qht_map_lock_buckets(map); + qht_map_reset__all_locked(map); + if (resize) { + qht_do_resize(ht, new); + } + qht_map_unlock_buckets(map); + qemu_mutex_unlock(&ht->lock); + + return resize; +} + +static inline +void *qht_do_lookup(struct qht_bucket *head, qht_lookup_func_t func, + const void *userp, uint32_t hash) +{ + struct qht_bucket *b = head; + int i; + + do { + for (i = 0; i < QHT_BUCKET_ENTRIES; i++) { + if (b->hashes[i] == hash) { + void *p = atomic_read(&b->pointers[i]); + + if (likely(p) && likely(func(p, userp))) { + return p; + } + } + } + b = atomic_rcu_read(&b->next); + } while (b); + + return NULL; +} + +static __attribute__((noinline)) +void *qht_lookup__slowpath(struct qht_bucket *b, qht_lookup_func_t func, + const void *userp, uint32_t hash) +{ + unsigned int version; + void *ret; + + do { + version = seqlock_read_begin(&b->sequence); + ret = qht_do_lookup(b, func, userp, hash); + } while (seqlock_read_retry(&b->sequence, version)); + return ret; +} + +void *qht_lookup(struct qht *ht, qht_lookup_func_t func, const void *userp, + uint32_t hash) +{ + struct qht_bucket *b; + struct qht_map *map; + unsigned int version; + void *ret; + + map = atomic_rcu_read(&ht->map); + b = qht_map_to_bucket(map, hash); + + version = seqlock_read_begin(&b->sequence); + ret = qht_do_lookup(b, func, userp, hash); + if (likely(!seqlock_read_retry(&b->sequence, version))) { + return ret; + } + /* + * Removing the do/while from the fastpath gives a 4% perf. increase when + * running a 100%-lookup microbenchmark. 
+ */ + return qht_lookup__slowpath(b, func, userp, hash); +} + +/* call with head->lock held */ +static bool qht_insert__locked(struct qht *ht, struct qht_map *map, + struct qht_bucket *head, void *p, uint32_t hash, + bool *needs_resize) +{ + struct qht_bucket *b = head; + struct qht_bucket *prev = NULL; + struct qht_bucket *new = NULL; + int i; + + do { + for (i = 0; i < QHT_BUCKET_ENTRIES; i++) { + if (b->pointers[i]) { + if (unlikely(b->pointers[i] == p)) { + return false; + } + } else { + goto found; + } + } + prev = b; + b = b->next; + } while (b); + + b = qemu_memalign(QHT_BUCKET_ALIGN, sizeof(*b)); + memset(b, 0, sizeof(*b)); + new = b; + i = 0; + atomic_inc(&map->n_added_buckets); + if (unlikely(qht_map_needs_resize(map)) && needs_resize) { + *needs_resize = true; + } + + found: + /* found an empty key: acquire the seqlock and write */ + seqlock_write_begin(&head->sequence); + if (new) { + atomic_rcu_set(&prev->next, b); + } + b->hashes[i] = hash; + atomic_set(&b->pointers[i], p); + seqlock_write_end(&head->sequence); + return true; +} + +static __attribute__((noinline)) void qht_grow_maybe(struct qht *ht) +{ + struct qht_map *map; + + /* + * If the lock is taken it probably means there's an ongoing resize, + * so bail out. + */ + if (qemu_mutex_trylock(&ht->lock)) { + return; + } + map = ht->map; + /* another thread might have just performed the resize we were after */ + if (qht_map_needs_resize(map)) { + struct qht_map *new = qht_map_create(map->n_buckets * 2); + + qht_map_lock_buckets(map); + qht_do_resize(ht, new); + qht_map_unlock_buckets(map); + } + qemu_mutex_unlock(&ht->lock); +} + +bool qht_insert(struct qht *ht, void *p, uint32_t hash) +{ + struct qht_bucket *b; + struct qht_map *map; + bool needs_resize = false; + bool ret; + + /* NULL pointers are not supported */ + qht_debug_assert(p); + + b = qht_bucket_lock__no_stale(ht, hash, &map); + ret = qht_insert__locked(ht, map, b, p, hash, &needs_resize); + qht_bucket_debug__locked(b); + qemu_spin_unlock(&b->lock); + + if (unlikely(needs_resize) && ht->mode & QHT_MODE_AUTO_RESIZE) { + qht_grow_maybe(ht); + } + return ret; +} + +static inline bool qht_entry_is_last(struct qht_bucket *b, int pos) +{ + if (pos == QHT_BUCKET_ENTRIES - 1) { + if (b->next == NULL) { + return true; + } + return b->next->pointers[0] == NULL; + } + return b->pointers[pos + 1] == NULL; +} + +static void +qht_entry_move(struct qht_bucket *to, int i, struct qht_bucket *from, int j) +{ + qht_debug_assert(!(to == from && i == j)); + qht_debug_assert(to->pointers[i]); + qht_debug_assert(from->pointers[j]); + + to->hashes[i] = from->hashes[j]; + atomic_set(&to->pointers[i], from->pointers[j]); + + from->hashes[j] = 0; + atomic_set(&from->pointers[j], NULL); +} + +/* + * Find the last valid entry in @head, and swap it with @orig[pos], which has + * just been invalidated. 
+ */ +static inline void qht_bucket_remove_entry(struct qht_bucket *orig, int pos) +{ + struct qht_bucket *b = orig; + struct qht_bucket *prev = NULL; + int i; + + if (qht_entry_is_last(orig, pos)) { + orig->hashes[pos] = 0; + atomic_set(&orig->pointers[pos], NULL); + return; + } + do { + for (i = 0; i < QHT_BUCKET_ENTRIES; i++) { + if (b->pointers[i]) { + continue; + } + if (i > 0) { + return qht_entry_move(orig, pos, b, i - 1); + } + qht_debug_assert(prev); + return qht_entry_move(orig, pos, prev, QHT_BUCKET_ENTRIES - 1); + } + prev = b; + b = b->next; + } while (b); + /* no free entries other than orig[pos], so swap it with the last one */ + qht_entry_move(orig, pos, prev, QHT_BUCKET_ENTRIES - 1); +} + +/* call with b->lock held */ +static inline +bool qht_remove__locked(struct qht_map *map, struct qht_bucket *head, + const void *p, uint32_t hash) +{ + struct qht_bucket *b = head; + int i; + + do { + for (i = 0; i < QHT_BUCKET_ENTRIES; i++) { + void *q = b->pointers[i]; + + if (unlikely(q == NULL)) { + return false; + } + if (q == p) { + qht_debug_assert(b->hashes[i] == hash); + seqlock_write_begin(&head->sequence); + qht_bucket_remove_entry(b, i); + seqlock_write_end(&head->sequence); + return true; + } + } + b = b->next; + } while (b); + return false; +} + +bool qht_remove(struct qht *ht, const void *p, uint32_t hash) +{ + struct qht_bucket *b; + struct qht_map *map; + bool ret; + + /* NULL pointers are not supported */ + qht_debug_assert(p); + + b = qht_bucket_lock__no_stale(ht, hash, &map); + ret = qht_remove__locked(map, b, p, hash); + qht_bucket_debug__locked(b); + qemu_spin_unlock(&b->lock); + return ret; +} + +static inline void qht_bucket_iter(struct qht *ht, struct qht_bucket *b, + qht_iter_func_t func, void *userp) +{ + int i; + + do { + for (i = 0; i < QHT_BUCKET_ENTRIES; i++) { + if (b->pointers[i] == NULL) { + return; + } + func(ht, b->pointers[i], b->hashes[i], userp); + } + b = b->next; + } while (b); +} + +/* call with all of the map's locks held */ +static inline void qht_map_iter__all_locked(struct qht *ht, struct qht_map *map, + qht_iter_func_t func, void *userp) +{ + size_t i; + + for (i = 0; i < map->n_buckets; i++) { + qht_bucket_iter(ht, &map->buckets[i], func, userp); + } +} + +void qht_iter(struct qht *ht, qht_iter_func_t func, void *userp) +{ + struct qht_map *map; + + map = atomic_rcu_read(&ht->map); + qht_map_lock_buckets(map); + /* Note: ht here is merely for carrying ht->mode; ht->map won't be read */ + qht_map_iter__all_locked(ht, map, func, userp); + qht_map_unlock_buckets(map); +} + +static void qht_map_copy(struct qht *ht, void *p, uint32_t hash, void *userp) +{ + struct qht_map *new = userp; + struct qht_bucket *b = qht_map_to_bucket(new, hash); + + /* no need to acquire b->lock because no thread has seen this map yet */ + qht_insert__locked(ht, new, b, p, hash, NULL); +} + +/* + * Call with ht->lock and all bucket locks held. + * + * Creating the @new map here would add unnecessary delay while all the locks + * are held--holding up the bucket locks is particularly bad, since no writes + * can occur while these are held. Thus, we let callers create the new map, + * hopefully without the bucket locks held. 
+ */ +static void qht_do_resize(struct qht *ht, struct qht_map *new) +{ + struct qht_map *old; + + old = ht->map; + g_assert_cmpuint(new->n_buckets, !=, old->n_buckets); + + qht_map_iter__all_locked(ht, old, qht_map_copy, new); + qht_map_debug__all_locked(new); + + atomic_rcu_set(&ht->map, new); + call_rcu(old, qht_map_destroy, rcu); +} + +bool qht_resize(struct qht *ht, size_t n_elems) +{ + size_t n_buckets = qht_elems_to_buckets(n_elems); + size_t ret = false; + + qemu_mutex_lock(&ht->lock); + if (n_buckets != ht->map->n_buckets) { + struct qht_map *new; + struct qht_map *old = ht->map; + + new = qht_map_create(n_buckets); + qht_map_lock_buckets(old); + qht_do_resize(ht, new); + qht_map_unlock_buckets(old); + ret = true; + } + qemu_mutex_unlock(&ht->lock); + + return ret; +} + +/* pass @stats to qht_statistics_destroy() when done */ +void qht_statistics_init(struct qht *ht, struct qht_stats *stats) +{ + struct qht_map *map; + int i; + + map = atomic_rcu_read(&ht->map); + + stats->head_buckets = map->n_buckets; + stats->used_head_buckets = 0; + stats->entries = 0; + qdist_init(&stats->chain); + qdist_init(&stats->occupancy); + + for (i = 0; i < map->n_buckets; i++) { + struct qht_bucket *head = &map->buckets[i]; + struct qht_bucket *b; + unsigned int version; + size_t buckets; + size_t entries; + int j; + + do { + version = seqlock_read_begin(&head->sequence); + buckets = 0; + entries = 0; + b = head; + do { + for (j = 0; j < QHT_BUCKET_ENTRIES; j++) { + if (atomic_read(&b->pointers[j]) == NULL) { + break; + } + entries++; + } + buckets++; + b = atomic_rcu_read(&b->next); + } while (b); + } while (seqlock_read_retry(&head->sequence, version)); + + if (entries) { + qdist_inc(&stats->chain, buckets); + qdist_inc(&stats->occupancy, + (double)entries / QHT_BUCKET_ENTRIES / buckets); + stats->used_head_buckets++; + stats->entries += entries; + } else { + qdist_inc(&stats->occupancy, 0); + } + } +} + +void qht_statistics_destroy(struct qht_stats *stats) +{ + qdist_destroy(&stats->occupancy); + qdist_destroy(&stats->chain); +} From 1a95404fbd34f4ed66fa2450869fb3b384b0a97b Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Wed, 8 Jun 2016 14:55:29 -0400 Subject: [PATCH 11/15] qht: add test program MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Acked-by: Sergey Fedorov Reviewed-by: Alex Bennée Reviewed-by: Richard Henderson Signed-off-by: Emilio G. 
Cota Message-Id: <1465412133-3029-12-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- tests/.gitignore | 1 + tests/Makefile.include | 6 +- tests/test-qht.c | 159 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 tests/test-qht.c diff --git a/tests/.gitignore b/tests/.gitignore index 7c0d156b74..ffde5d2b83 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -50,6 +50,7 @@ test-qdev-global-props test-qemu-opts test-qdist test-qga +test-qht test-qmp-commands test-qmp-commands.h test-qmp-event diff --git a/tests/Makefile.include b/tests/Makefile.include index 294894558e..429d384842 100644 --- a/tests/Makefile.include +++ b/tests/Makefile.include @@ -72,6 +72,8 @@ check-unit-y += tests/test-rcu-list$(EXESUF) gcov-files-test-rcu-list-y = util/rcu.c check-unit-y += tests/test-qdist$(EXESUF) gcov-files-test-qdist-y = util/qdist.c +check-unit-y += tests/test-qht$(EXESUF) +gcov-files-test-qht-y = util/qht.c check-unit-y += tests/test-bitops$(EXESUF) check-unit-$(CONFIG_HAS_GLIB_SUBPROCESS_TESTS) += tests/test-qdev-global-props$(EXESUF) check-unit-y += tests/check-qom-interface$(EXESUF) @@ -397,7 +399,8 @@ test-obj-y = tests/check-qint.o tests/check-qstring.o tests/check-qdict.o \ tests/test-x86-cpuid.o tests/test-mul64.o tests/test-int128.o \ tests/test-opts-visitor.o tests/test-qmp-event.o \ tests/rcutorture.o tests/test-rcu-list.o \ - tests/test-qdist.o + tests/test-qdist.o \ + tests/test-qht.o $(test-obj-y): QEMU_INCLUDES += -Itests QEMU_CFLAGS += -I$(SRC_PATH)/tests @@ -437,6 +440,7 @@ tests/test-int128$(EXESUF): tests/test-int128.o tests/rcutorture$(EXESUF): tests/rcutorture.o $(test-util-obj-y) tests/test-rcu-list$(EXESUF): tests/test-rcu-list.o $(test-util-obj-y) tests/test-qdist$(EXESUF): tests/test-qdist.o $(test-util-obj-y) +tests/test-qht$(EXESUF): tests/test-qht.o $(test-util-obj-y) tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \ hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\ diff --git a/tests/test-qht.c b/tests/test-qht.c new file mode 100644 index 0000000000..c8eb9305ed --- /dev/null +++ b/tests/test-qht.c @@ -0,0 +1,159 @@ +/* + * Copyright (C) 2016, Emilio G. Cota + * + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */
+#include "qemu/osdep.h"
+#include <glib.h>
+#include "qemu/qht.h"
+
+#define N 5000
+
+static struct qht ht;
+static int32_t arr[N * 2];
+
+static bool is_equal(const void *obj, const void *userp)
+{
+    const int32_t *a = obj;
+    const int32_t *b = userp;
+
+    return *a == *b;
+}
+
+static void insert(int a, int b)
+{
+    int i;
+
+    for (i = a; i < b; i++) {
+        uint32_t hash;
+
+        arr[i] = i;
+        hash = i;
+
+        qht_insert(&ht, &arr[i], hash);
+    }
+}
+
+static void rm(int init, int end)
+{
+    int i;
+
+    for (i = init; i < end; i++) {
+        uint32_t hash;
+
+        hash = arr[i];
+        g_assert_true(qht_remove(&ht, &arr[i], hash));
+    }
+}
+
+static void check(int a, int b, bool expected)
+{
+    struct qht_stats stats;
+    int i;
+
+    for (i = a; i < b; i++) {
+        void *p;
+        uint32_t hash;
+        int32_t val;
+
+        val = i;
+        hash = i;
+        p = qht_lookup(&ht, is_equal, &val, hash);
+        g_assert_true(!!p == expected);
+    }
+    qht_statistics_init(&ht, &stats);
+    if (stats.used_head_buckets) {
+        g_assert_cmpfloat(qdist_avg(&stats.chain), >=, 1.0);
+    }
+    g_assert_cmpuint(stats.head_buckets, >, 0);
+    qht_statistics_destroy(&stats);
+}
+
+static void count_func(struct qht *ht, void *p, uint32_t hash, void *userp)
+{
+    unsigned int *curr = userp;
+
+    (*curr)++;
+}
+
+static void check_n(size_t expected)
+{
+    struct qht_stats stats;
+
+    qht_statistics_init(&ht, &stats);
+    g_assert_cmpuint(stats.entries, ==, expected);
+    qht_statistics_destroy(&stats);
+}
+
+static void iter_check(unsigned int count)
+{
+    unsigned int curr = 0;
+
+    qht_iter(&ht, count_func, &curr);
+    g_assert_cmpuint(curr, ==, count);
+}
+
+static void qht_do_test(unsigned int mode, size_t init_entries)
+{
+    qht_init(&ht, 0, mode);
+
+    insert(0, N);
+    check(0, N, true);
+    check_n(N);
+    check(-N, -1, false);
+    iter_check(N);
+
+    rm(101, 102);
+    check_n(N - 1);
+    insert(N, N * 2);
+    check_n(N + N - 1);
+    rm(N, N * 2);
+    check_n(N - 1);
+    insert(101, 102);
+    check_n(N);
+
+    rm(10, 200);
+    check_n(N - 190);
+    insert(150, 200);
+    check_n(N - 190 + 50);
+    insert(10, 150);
+    check_n(N);
+
+    rm(1, 2);
+    check_n(N - 1);
+    qht_reset_size(&ht, 0);
+    check_n(0);
+    check(0, N, false);
+
+    qht_destroy(&ht);
+}
+
+static void qht_test(unsigned int mode)
+{
+    qht_do_test(mode, 0);
+    qht_do_test(mode, 1);
+    qht_do_test(mode, 2);
+    qht_do_test(mode, 8);
+    qht_do_test(mode, 16);
+    qht_do_test(mode, 8192);
+    qht_do_test(mode, 16384);
+}
+
+static void test_default(void)
+{
+    qht_test(0);
+}
+
+static void test_resize(void)
+{
+    qht_test(QHT_MODE_AUTO_RESIZE);
+}
+
+int main(int argc, char *argv[])
+{
+    g_test_init(&argc, &argv, NULL);
+    g_test_add_func("/qht/mode/default", test_default);
+    g_test_add_func("/qht/mode/resize", test_resize);
+    return g_test_run();
+}

From 1a95404fbd34f4ed66fa2450869fb3b384b0a97b Mon Sep 17 00:00:00 2001
From: "Emilio G. Cota"
Date: Wed, 8 Jun 2016 14:55:30 -0400
Subject: [PATCH 12/15] qht: add qht-bench, a performance benchmark

This serves as a performance benchmark as well as a stress test for
QHT. We can tweak quite a number of things, including the number of
resize threads and how frequently resizes are triggered.

A performance comparison of QHT vs CLHT[1] and ck_hs[2] using this same
benchmark program can be found here: http://imgur.com/a/0Bms4

The tests are run on a 64-core AMD Opteron 6376, pinning threads to cores
favoring same-socket cores.
For each run, qht-bench is invoked with: $ tests/qht-bench -d $duration -n $n -u $u -g $range , where $duration is in seconds, $n is the number of threads, $u is the update rate (0.0 to 100.0), and $range is the number of keys. Note that ck_hs's performance drops significantly as writes go up, since it requires an external lock (I used a ck_spinlock) around every write. Also, note that CLHT instead of using a seqlock, relies on an allocator that does not ever return the same address during the same read-critical section. This gives it a slight performance advantage over QHT on read-heavy workloads, since the seqlock writes aren't there. [1] CLHT: https://github.com/LPD-EPFL/CLHT https://infoscience.epfl.ch/record/207109/files/ascy_asplos15.pdf [2] ck_hs: http://concurrencykit.org/ http://backtrace.io/blog/blog/2015/03/13/workload-specialization/ A few of those plots are shown in text here, since that site might not be online forever. Throughput is on Mops/s on the Y axis. 200K keys, 0 % updates 450 ++--+------+------+-------+-------+-------+-------+------+-------+--++ | + + + + + + + + +N+ | 400 ++ ---+E+ ++ | +++---- | 350 ++ 9 ++------+------++ --+E+ -+H+ ++ | | +H+- | -+N+---- ---- +++ | 300 ++ 8 ++ +E+ ++ -----+E+ --+H+ ++ | | +++ | -+N+-----+H+-- | 250 ++ 7 ++------+------++ +++-----+E+---- ++ 200 ++ 1 -+E+-----+H+ ++ | ---- qht +-E--+ | 150 ++ -+E+ clht +-H--+ ++ | ---- ck +-N--+ | 100 ++ +E+ ++ | ---- | 50 ++ -+E+ ++ | +E+E+ + + + + + + + + | 0 ++--E------+------+-------+-------+-------+-------+------+-------+--++ 1 8 16 24 32 40 48 56 64 Number of threads 200K keys, 1 % updates 350 ++--+------+------+-------+-------+-------+-------+------+-------+--++ | + + + + + + + + -+E+ | 300 ++ -----+H+ ++ | +E+-- | | 9 ++------+------++ +++---- | 250 ++ | +E+ -- | -+E+ ++ | 8 ++ -- ++ ---- | 200 ++ | +++- | +++ ---+E+ ++ | 7 ++------N------++ -+E+-- qht +-E--+ | | 1 +++---- clht +-H--+ | 150 ++ -+E+ ck +-N--+ ++ | ---- | 100 ++ +E+ ++ | ---- | | -+E+ | 50 ++ +H+-+N+----+N+-----+N+------ ++ | +E+E+ + + + +N+-----+N+-----+N+----+N+-----+N+ | 0 ++--E------+------+-------+-------+-------+-------+------+-------+--++ 1 8 16 24 32 40 48 56 64 Number of threads 200K keys, 20 % updates 300 ++--+------+------+-------+-------+-------+-------+------+-------+--++ | + + + + + + + + + | | -+H+ | 250 ++ ---- ++ | 9 ++------+------++ --+H+ ---+E+ | | 8 ++ +H+-- ++ -+H+----+E+-- | 200 ++ | +E+ --| -----+E+-- +++ ++ | 7 ++ + ---- ++ ---+H+---- +++ qht +-E--+ | 150 ++ 6 ++------N------++ -+H+-----+E+ clht +-H--+ ++ | 1 -----+E+-- ck +-N--+ | | -+H+---- | 100 ++ -----+E+ ++ | +E+-- | | ----+++ | 50 ++ -+E+ ++ | +E+ +++ | | +E+N+-+N+-----+ + + + + + + | 0 ++--E------+------N-------N-------N-------N-------N------N-------N--++ 1 8 16 24 32 40 48 56 64 Number of threads 200K keys, 100 % updates qht +-E--+ clht +-H--+ 160 ++--+------+------+-------+-------+-------+-------+---ck-+-N-----+--++ | + + + + + + + + ----H | 140 ++ +H+-- -+E+ ++ | +++---- ---- | 120 ++ 8 ++------+------++ -+H+ +E+ ++ | 7 ++ +H+---- ++ ---- +++---- | 100 ++ | +E+ | +++ ---+H+ -+E+ ++ | 6 ++ +++ ++ -+H+-- +++---- | 80 ++ 5 ++------N----------+E+-----+E+ ++ | 1 -+H+---- +++ | | -----+E+ | 60 ++ +H+---- +++ ++ | ----+E+ | 40 ++ +H+---- ++ | --+E+ | 20 ++ +E+ ++ | +EE+ + + + + + + + + | 0 ++--+N-N---N------N-------N-------N-------N-------N------N-------N--++ 1 8 16 24 32 40 48 56 64 Number of threads Signed-off-by: Emilio G. 
Cota Message-Id: <1465412133-3029-13-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- tests/.gitignore | 1 + tests/Makefile.include | 3 +- tests/qht-bench.c | 488 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 491 insertions(+), 1 deletion(-) create mode 100644 tests/qht-bench.c diff --git a/tests/.gitignore b/tests/.gitignore index ffde5d2b83..d19023e41d 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -7,6 +7,7 @@ check-qnull check-qstring check-qom-interface check-qom-proplist +qht-bench rcutorture test-aio test-base64 diff --git a/tests/Makefile.include b/tests/Makefile.include index 429d384842..7b00301c9c 100644 --- a/tests/Makefile.include +++ b/tests/Makefile.include @@ -400,7 +400,7 @@ test-obj-y = tests/check-qint.o tests/check-qstring.o tests/check-qdict.o \ tests/test-opts-visitor.o tests/test-qmp-event.o \ tests/rcutorture.o tests/test-rcu-list.o \ tests/test-qdist.o \ - tests/test-qht.o + tests/test-qht.o tests/qht-bench.o $(test-obj-y): QEMU_INCLUDES += -Itests QEMU_CFLAGS += -I$(SRC_PATH)/tests @@ -441,6 +441,7 @@ tests/rcutorture$(EXESUF): tests/rcutorture.o $(test-util-obj-y) tests/test-rcu-list$(EXESUF): tests/test-rcu-list.o $(test-util-obj-y) tests/test-qdist$(EXESUF): tests/test-qdist.o $(test-util-obj-y) tests/test-qht$(EXESUF): tests/test-qht.o $(test-util-obj-y) +tests/qht-bench$(EXESUF): tests/qht-bench.o $(test-util-obj-y) tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \ hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\ diff --git a/tests/qht-bench.c b/tests/qht-bench.c new file mode 100644 index 0000000000..ad8efbca95 --- /dev/null +++ b/tests/qht-bench.c @@ -0,0 +1,488 @@ +/* + * Copyright (C) 2016, Emilio G. Cota + * + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */
+#include "qemu/osdep.h"
+#include <glib.h>
+#include "qemu/processor.h"
+#include "qemu/atomic.h"
+#include "qemu/qht.h"
+#include "qemu/rcu.h"
+#include "exec/tb-hash-xx.h"
+
+struct thread_stats {
+    size_t rd;
+    size_t not_rd;
+    size_t in;
+    size_t not_in;
+    size_t rm;
+    size_t not_rm;
+    size_t rz;
+    size_t not_rz;
+};
+
+struct thread_info {
+    void (*func)(struct thread_info *);
+    struct thread_stats stats;
+    uint64_t r;
+    bool write_op; /* writes alternate between insertions and removals */
+    bool resize_down;
+} QEMU_ALIGNED(64); /* avoid false sharing among threads */
+
+static struct qht ht;
+static QemuThread *rw_threads;
+
+#define DEFAULT_RANGE (4096)
+#define DEFAULT_QHT_N_ELEMS DEFAULT_RANGE
+
+static unsigned int duration = 1;
+static unsigned int n_rw_threads = 1;
+static unsigned long lookup_range = DEFAULT_RANGE;
+static unsigned long update_range = DEFAULT_RANGE;
+static size_t init_range = DEFAULT_RANGE;
+static size_t init_size = DEFAULT_RANGE;
+static size_t n_ready_threads;
+static long populate_offset;
+static long *keys;
+
+static size_t resize_min;
+static size_t resize_max;
+static struct thread_info *rz_info;
+static unsigned long resize_delay = 1000;
+static double resize_rate; /* 0.0 to 1.0 */
+static unsigned int n_rz_threads = 1;
+static QemuThread *rz_threads;
+
+static double update_rate; /* 0.0 to 1.0 */
+static uint64_t update_threshold;
+static uint64_t resize_threshold;
+
+static size_t qht_n_elems = DEFAULT_QHT_N_ELEMS;
+static int qht_mode;
+
+static bool test_start;
+static bool test_stop;
+
+static struct thread_info *rw_info;
+
+static const char commands_string[] =
+    " -d = duration, in seconds\n"
+    " -n = number of threads\n"
+    "\n"
+    " -o = offset at which keys start\n"
+    "\n"
+    " -g = set -s,-k,-K,-l,-r to the same value\n"
+    " -s = initial size hint\n"
+    " -k = initial number of keys\n"
+    " -K = initial range of keys (will be rounded up to pow2)\n"
+    " -l = lookup range of keys (will be rounded up to pow2)\n"
+    " -r = update range of keys (will be rounded up to pow2)\n"
+    "\n"
+    " -u = update rate (0.0 to 100.0), 50/50 split of insertions/removals\n"
+    "\n"
+    " -R = enable auto-resize\n"
+    " -S = resize rate (0.0 to 100.0)\n"
+    " -D = delay (in us) between potential resizes\n"
+    " -N = number of resize threads";
+
+static void usage_complete(int argc, char *argv[])
+{
+    fprintf(stderr, "Usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "options:\n%s\n", commands_string);
+    exit(-1);
+}
+
+static bool is_equal(const void *obj, const void *userp)
+{
+    const long *a = obj;
+    const long *b = userp;
+
+    return *a == *b;
+}
+
+static inline uint32_t h(unsigned long v)
+{
+    return tb_hash_func5(v, 0, 0);
+}
+
+/*
+ * From: https://en.wikipedia.org/wiki/Xorshift
+ * This is faster than rand_r(), and gives us a wider range (RAND_MAX is only
+ * guaranteed to be >= INT_MAX).
+ */
+static uint64_t xorshift64star(uint64_t x)
+{
+    x ^= x >> 12; /* a */
+    x ^= x << 25; /* b */
+    x ^= x >> 27; /* c */
+    return x * UINT64_C(2685821657736338717);
+}
+
+static void do_rz(struct thread_info *info)
+{
+    struct thread_stats *stats = &info->stats;
+
+    if (info->r < resize_threshold) {
+        size_t size = info->resize_down ?
resize_min : resize_max; + bool resized; + + resized = qht_resize(&ht, size); + info->resize_down = !info->resize_down; + + if (resized) { + stats->rz++; + } else { + stats->not_rz++; + } + } + g_usleep(resize_delay); +} + +static void do_rw(struct thread_info *info) +{ + struct thread_stats *stats = &info->stats; + uint32_t hash; + long *p; + + if (info->r >= update_threshold) { + bool read; + + p = &keys[info->r & (lookup_range - 1)]; + hash = h(*p); + read = qht_lookup(&ht, is_equal, p, hash); + if (read) { + stats->rd++; + } else { + stats->not_rd++; + } + } else { + p = &keys[info->r & (update_range - 1)]; + hash = h(*p); + if (info->write_op) { + bool written = false; + + if (qht_lookup(&ht, is_equal, p, hash) == NULL) { + written = qht_insert(&ht, p, hash); + } + if (written) { + stats->in++; + } else { + stats->not_in++; + } + } else { + bool removed = false; + + if (qht_lookup(&ht, is_equal, p, hash)) { + removed = qht_remove(&ht, p, hash); + } + if (removed) { + stats->rm++; + } else { + stats->not_rm++; + } + } + info->write_op = !info->write_op; + } +} + +static void *thread_func(void *p) +{ + struct thread_info *info = p; + + rcu_register_thread(); + + atomic_inc(&n_ready_threads); + while (!atomic_mb_read(&test_start)) { + cpu_relax(); + } + + rcu_read_lock(); + while (!atomic_read(&test_stop)) { + info->r = xorshift64star(info->r); + info->func(info); + } + rcu_read_unlock(); + + rcu_unregister_thread(); + return NULL; +} + +/* sets everything except info->func */ +static void prepare_thread_info(struct thread_info *info, int i) +{ + /* seed for the RNG; each thread should have a different one */ + info->r = (i + 1) ^ time(NULL); + /* the first update will be a write */ + info->write_op = true; + /* the first resize will be down */ + info->resize_down = true; + + memset(&info->stats, 0, sizeof(info->stats)); +} + +static void +th_create_n(QemuThread **threads, struct thread_info **infos, const char *name, + void (*func)(struct thread_info *), int offset, int n) +{ + struct thread_info *info; + QemuThread *th; + int i; + + th = g_malloc(sizeof(*th) * n); + *threads = th; + + info = qemu_memalign(64, sizeof(*info) * n); + *infos = info; + + for (i = 0; i < n; i++) { + prepare_thread_info(&info[i], offset + i); + info[i].func = func; + qemu_thread_create(&th[i], name, thread_func, &info[i], + QEMU_THREAD_JOINABLE); + } +} + +static void create_threads(void) +{ + th_create_n(&rw_threads, &rw_info, "rw", do_rw, 0, n_rw_threads); + th_create_n(&rz_threads, &rz_info, "rz", do_rz, n_rw_threads, n_rz_threads); +} + +static void pr_params(void) +{ + printf("Parameters:\n"); + printf(" duration: %d s\n", duration); + printf(" # of threads: %u\n", n_rw_threads); + printf(" initial # of keys: %zu\n", init_size); + printf(" initial size hint: %zu\n", qht_n_elems); + printf(" auto-resize: %s\n", + qht_mode & QHT_MODE_AUTO_RESIZE ? 
"on" : "off"); + if (resize_rate) { + printf(" resize_rate: %f%%\n", resize_rate * 100.0); + printf(" resize range: %zu-%zu\n", resize_min, resize_max); + printf(" # resize threads %u\n", n_rz_threads); + } + printf(" update rate: %f%%\n", update_rate * 100.0); + printf(" offset: %ld\n", populate_offset); + printf(" initial key range: %zu\n", init_range); + printf(" lookup range: %lu\n", lookup_range); + printf(" update range: %lu\n", update_range); +} + +static void do_threshold(double rate, uint64_t *threshold) +{ + if (rate == 1.0) { + *threshold = UINT64_MAX; + } else { + *threshold = rate * UINT64_MAX; + } +} + +static void htable_init(void) +{ + unsigned long n = MAX(init_range, update_range); + uint64_t r = time(NULL); + size_t retries = 0; + size_t i; + + /* avoid allocating memory later by allocating all the keys now */ + keys = g_malloc(sizeof(*keys) * n); + for (i = 0; i < n; i++) { + keys[i] = populate_offset + i; + } + + /* some sanity checks */ + g_assert_cmpuint(lookup_range, <=, n); + + /* compute thresholds */ + do_threshold(update_rate, &update_threshold); + do_threshold(resize_rate, &resize_threshold); + + if (resize_rate) { + resize_min = n / 2; + resize_max = n; + assert(resize_min < resize_max); + } else { + n_rz_threads = 0; + } + + /* initialize the hash table */ + qht_init(&ht, qht_n_elems, qht_mode); + assert(init_size <= init_range); + + pr_params(); + + fprintf(stderr, "Initialization: populating %zu items...", init_size); + for (i = 0; i < init_size; i++) { + for (;;) { + uint32_t hash; + long *p; + + r = xorshift64star(r); + p = &keys[r & (init_range - 1)]; + hash = h(*p); + if (qht_insert(&ht, p, hash)) { + break; + } + retries++; + } + } + fprintf(stderr, " populated after %zu retries\n", retries); +} + +static void add_stats(struct thread_stats *s, struct thread_info *info, int n) +{ + int i; + + for (i = 0; i < n; i++) { + struct thread_stats *stats = &info[i].stats; + + s->rd += stats->rd; + s->not_rd += stats->not_rd; + + s->in += stats->in; + s->not_in += stats->not_in; + + s->rm += stats->rm; + s->not_rm += stats->not_rm; + + s->rz += stats->rz; + s->not_rz += stats->not_rz; + } +} + +static void pr_stats(void) +{ + struct thread_stats s = {}; + double tx; + + add_stats(&s, rw_info, n_rw_threads); + add_stats(&s, rz_info, n_rz_threads); + + printf("Results:\n"); + + if (resize_rate) { + printf(" Resizes: %zu (%.2f%% of %zu)\n", + s.rz, (double)s.rz / (s.rz + s.not_rz) * 100, s.rz + s.not_rz); + } + + printf(" Read: %.2f M (%.2f%% of %.2fM)\n", + (double)s.rd / 1e6, + (double)s.rd / (s.rd + s.not_rd) * 100, + (double)(s.rd + s.not_rd) / 1e6); + printf(" Inserted: %.2f M (%.2f%% of %.2fM)\n", + (double)s.in / 1e6, + (double)s.in / (s.in + s.not_in) * 100, + (double)(s.in + s.not_in) / 1e6); + printf(" Removed: %.2f M (%.2f%% of %.2fM)\n", + (double)s.rm / 1e6, + (double)s.rm / (s.rm + s.not_rm) * 100, + (double)(s.rm + s.not_rm) / 1e6); + + tx = (s.rd + s.not_rd + s.in + s.not_in + s.rm + s.not_rm) / 1e6 / duration; + printf(" Throughput: %.2f MT/s\n", tx); + printf(" Throughput/thread: %.2f MT/s/thread\n", tx / n_rw_threads); +} + +static void run_test(void) +{ + unsigned int remaining; + int i; + + while (atomic_read(&n_ready_threads) != n_rw_threads + n_rz_threads) { + cpu_relax(); + } + atomic_mb_set(&test_start, true); + do { + remaining = sleep(duration); + } while (remaining); + atomic_mb_set(&test_stop, true); + + for (i = 0; i < n_rw_threads; i++) { + qemu_thread_join(&rw_threads[i]); + } + for (i = 0; i < n_rz_threads; i++) { + 
qemu_thread_join(&rz_threads[i]); + } +} + +static void parse_args(int argc, char *argv[]) +{ + int c; + + for (;;) { + c = getopt(argc, argv, "d:D:g:k:K:l:hn:N:o:r:Rs:S:u:"); + if (c < 0) { + break; + } + switch (c) { + case 'd': + duration = atoi(optarg); + break; + case 'D': + resize_delay = atol(optarg); + break; + case 'g': + init_range = pow2ceil(atol(optarg)); + lookup_range = pow2ceil(atol(optarg)); + update_range = pow2ceil(atol(optarg)); + qht_n_elems = atol(optarg); + init_size = atol(optarg); + break; + case 'h': + usage_complete(argc, argv); + exit(0); + case 'k': + init_size = atol(optarg); + break; + case 'K': + init_range = pow2ceil(atol(optarg)); + break; + case 'l': + lookup_range = pow2ceil(atol(optarg)); + break; + case 'n': + n_rw_threads = atoi(optarg); + break; + case 'N': + n_rz_threads = atoi(optarg); + break; + case 'o': + populate_offset = atol(optarg); + break; + case 'r': + update_range = pow2ceil(atol(optarg)); + break; + case 'R': + qht_mode |= QHT_MODE_AUTO_RESIZE; + break; + case 's': + qht_n_elems = atol(optarg); + break; + case 'S': + resize_rate = atof(optarg) / 100.0; + if (resize_rate > 1.0) { + resize_rate = 1.0; + } + break; + case 'u': + update_rate = atof(optarg) / 100.0; + if (update_rate > 1.0) { + update_rate = 1.0; + } + break; + } + } +} + +int main(int argc, char *argv[]) +{ + parse_args(argc, argv); + htable_init(); + create_threads(); + run_test(); + pr_stats(); + return 0; +} From 896a9ee96720c5cc7e00e63943435ccccec54d23 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Wed, 8 Jun 2016 14:55:31 -0400 Subject: [PATCH 13/15] qht: add test-qht-par to invoke qht-bench from 'check' target Signed-off-by: Emilio G. Cota Message-Id: <1465412133-3029-14-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- tests/.gitignore | 1 + tests/Makefile.include | 5 +++- tests/test-qht-par.c | 56 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 tests/test-qht-par.c diff --git a/tests/.gitignore b/tests/.gitignore index d19023e41d..840ea396ab 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -52,6 +52,7 @@ test-qemu-opts test-qdist test-qga test-qht +test-qht-par test-qmp-commands test-qmp-commands.h test-qmp-event diff --git a/tests/Makefile.include b/tests/Makefile.include index 7b00301c9c..7d63d16c3f 100644 --- a/tests/Makefile.include +++ b/tests/Makefile.include @@ -74,6 +74,8 @@ check-unit-y += tests/test-qdist$(EXESUF) gcov-files-test-qdist-y = util/qdist.c check-unit-y += tests/test-qht$(EXESUF) gcov-files-test-qht-y = util/qht.c +check-unit-y += tests/test-qht-par$(EXESUF) +gcov-files-test-qht-par-y = util/qht.c check-unit-y += tests/test-bitops$(EXESUF) check-unit-$(CONFIG_HAS_GLIB_SUBPROCESS_TESTS) += tests/test-qdev-global-props$(EXESUF) check-unit-y += tests/check-qom-interface$(EXESUF) @@ -400,7 +402,7 @@ test-obj-y = tests/check-qint.o tests/check-qstring.o tests/check-qdict.o \ tests/test-opts-visitor.o tests/test-qmp-event.o \ tests/rcutorture.o tests/test-rcu-list.o \ tests/test-qdist.o \ - tests/test-qht.o tests/qht-bench.o + tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o $(test-obj-y): QEMU_INCLUDES += -Itests QEMU_CFLAGS += -I$(SRC_PATH)/tests @@ -441,6 +443,7 @@ tests/rcutorture$(EXESUF): tests/rcutorture.o $(test-util-obj-y) tests/test-rcu-list$(EXESUF): tests/test-rcu-list.o $(test-util-obj-y) tests/test-qdist$(EXESUF): tests/test-qdist.o $(test-util-obj-y) tests/test-qht$(EXESUF): tests/test-qht.o $(test-util-obj-y) 
+tests/test-qht-par$(EXESUF): tests/test-qht-par.o tests/qht-bench$(EXESUF) $(test-util-obj-y)
 tests/qht-bench$(EXESUF): tests/qht-bench.o $(test-util-obj-y)

 tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
 	hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\
diff --git a/tests/test-qht-par.c b/tests/test-qht-par.c
new file mode 100644
index 0000000000..f09e004ec6
--- /dev/null
+++ b/tests/test-qht-par.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2016, Emilio G. Cota
+ *
+ * License: GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include <glib.h>
+
+#define TEST_QHT_STRING "tests/qht-bench 1>/dev/null 2>&1 -R -S0.1 -D10000 -N1 "
+
+static void test_qht(int n_threads, int update_rate, int duration)
+{
+    char *str;
+    int rc;
+
+    str = g_strdup_printf(TEST_QHT_STRING "-n %d -u %d -d %d",
+                          n_threads, update_rate, duration);
+    rc = system(str);
+    g_free(str);
+    g_assert_cmpint(rc, ==, 0);
+}
+
+static void test_2th0u1s(void)
+{
+    test_qht(2, 0, 1);
+}
+
+static void test_2th20u1s(void)
+{
+    test_qht(2, 20, 1);
+}
+
+static void test_2th0u5s(void)
+{
+    test_qht(2, 0, 5);
+}
+
+static void test_2th20u5s(void)
+{
+    test_qht(2, 20, 5);
+}
+
+int main(int argc, char *argv[])
+{
+    g_test_init(&argc, &argv, NULL);
+
+    if (g_test_quick()) {
+        g_test_add_func("/qht/parallel/2threads-0%updates-1s", test_2th0u1s);
+        g_test_add_func("/qht/parallel/2threads-20%updates-1s", test_2th20u1s);
+    } else {
+        g_test_add_func("/qht/parallel/2threads-0%updates-5s", test_2th0u5s);
+        g_test_add_func("/qht/parallel/2threads-20%updates-5s", test_2th20u5s);
+    }
+    return g_test_run();
+}

From 909eaac9bbc2ed4f3a82ce38e905b87d478a3e00 Mon Sep 17 00:00:00 2001
From: "Emilio G. Cota"
Date: Wed, 8 Jun 2016 14:55:32 -0400
Subject: [PATCH 14/15] tb hash: track translated blocks with qht
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Having a fixed-size hash table for keeping track of all translation blocks
is suboptimal: some workloads are just too big or too small to get maximum
performance from the hash table. The MRU promotion policy helps improve
performance when the hash table is a little undersized, but it cannot make
up for severely undersized hash tables.

Furthermore, frequent MRU promotions result in writes that are a scalability
bottleneck. For scalability, lookups should only perform reads, not writes.
This is not a big deal for now, but it will become one once MTTCG matures.

The appended fixes these issues by using qht as the implementation of the
TB hash table. This solution is superior to other alternatives considered,
namely:

- master: implementation in QEMU before this patchset
- xxhash: before this patch, i.e. fixed buckets + xxhash hashing + MRU.
- xxhash-rcu: fixed buckets + xxhash + RCU list + MRU.
  MRU is implemented here by adding an intermediate struct that contains
  the u32 hash and a pointer to the TB; this allows us, on an MRU promotion,
  to copy said struct (that is not at the head), and put this new copy at
  the head. After a grace period, the original non-head struct can be
  eliminated, and after another grace period, freed.
- qht-fixed-nomru: fixed buckets + xxhash + qht without auto-resize +
  no MRU for lookups; MRU for inserts.

The appended solution is the following:
- qht-dyn-nomru: dynamic number of buckets + xxhash + qht w/ auto-resize +
  no MRU for lookups; MRU for inserts.

The plots below compare the considered solutions.
The Y axis shows the boot time (in seconds) of a debian jessie image with arm-softmmu; the X axis sweeps the number of buckets (or initial number of buckets for qht-autoresize). The plots in PNG format (and with errorbars) can be seen here: http://imgur.com/a/Awgnq Each test runs 5 times, and the entire QEMU process is pinned to a single core for repeatability of results. Host: Intel Xeon E5-2690 28 ++------------+-------------+-------------+-------------+------------++ A***** + + + master **A*** + 27 ++ * xxhash ##B###++ | A******A****** xxhash-rcu $$C$$$ | 26 C$$ A******A****** qht-fixed-nomru*%%D%%%++ D%%$$ A******A******A*qht-dyn-mru A*E****A 25 ++ %%$$ qht-dyn-nomru &&F&&&++ B#####% | 24 ++ #C$$$$$ ++ | B### $ | | ## C$$$$$$ | 23 ++ # C$$$$$$ ++ | B###### C$$$$$$ %%%D 22 ++ %B###### C$$$$$$C$$$$$$C$$$$$$C$$$$$$C$$$$$$C | D%%%%%%B###### @E@@@@@@ %%%D%%%@@@E@@@@@@E 21 E@@@@@@E@@@@@@F&&&@@@E@@@&&&D%%%%%%B######B######B######B######B######B + E@@@ F&&& + E@ + F&&& + + 20 ++------------+-------------+-------------+-------------+------------++ 14 16 18 20 22 24 log2 number of buckets Host: Intel i7-4790K 14.5 ++------------+------------+-------------+------------+------------++ A** + + + master **A*** + 14 ++ ** xxhash ##B###++ 13.5 ++ ** xxhash-rcu $$C$$$++ | qht-fixed-nomru %%D%%% | 13 ++ A****** qht-dyn-mru @@E@@@++ | A*****A******A****** qht-dyn-nomru &&F&&& | 12.5 C$$ A******A******A*****A****** ***A 12 ++ $$ A*** ++ D%%% $$ | 11.5 ++ %% ++ B### %C$$$$$$ | 11 ++ ## D%%%%% C$$$$$ ++ | # % C$$$$$$ | 10.5 F&&&&&&B######D%%%%% C$$$$$$C$$$$$$C$$$$$$C$$$$$C$$$$$$ $$$C 10 E@@@@@@E@@@@@@B#####B######B######E@@@@@@E@@@%%%D%%%%%D%%%###B######B + F&& D%%%%%%B######B######B#####B###@@@D%%% + 9.5 ++------------+------------+-------------+------------+------------++ 14 16 18 20 22 24 log2 number of buckets Note that the original point before this patch series is X=15 for "master"; the little sensitivity to the increased number of buckets is due to the poor hashing function in master. xxhash-rcu has significant overhead due to the constant churn of allocating and deallocating intermediate structs for implementing MRU. An alternative would be do consider failed lookups as "maybe not there", and then acquire the external lock (tb_lock in this case) to really confirm that there was indeed a failed lookup. This, however, would not be enough to implement dynamic resizing--this is more complex: see "Resizable, Scalable, Concurrent Hash Tables via Relativistic Programming" by Triplett, McKenney and Walpole. This solution was discarded due to the very coarse RCU read critical sections that we have in MTTCG; resizing requires waiting for readers after every pointer update, and resizes require many pointer updates, so this would quickly become prohibitive. qht-fixed-nomru shows that MRU promotion is advisable for undersized hash tables. However, qht-dyn-mru shows that MRU promotion is not important if the hash table is properly sized: there is virtually no difference in performance between qht-dyn-nomru and qht-dyn-mru. Before this patch, we're at X=15 on "xxhash"; after this patch, we're at X=15 @ qht-dyn-nomru. This patch thus matches the best performance that we can achieve with optimum sizing of the hash table, while keeping the hash table scalable for readers. 
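To make the "lookups only perform reads" point concrete, the lookup pattern
with qht boils down to something like the sketch below. This is an
illustrative sketch only: the names tb_desc_sketch, tb_cmp_sketch and
tb_find_sketch are made up for this example, while qht_lookup(),
tb_hash_func() and tcg_ctx.tb_ctx.htable are the interfaces this patch
actually uses (see tb_cmp()/tb_find_physical() in the diff below for the
real code, which also checks cs_base and the second physical page).

/* illustrative sketch, not the code added by this patch */
struct tb_desc_sketch {
    target_ulong pc;
    uint32_t flags;
};

static bool tb_cmp_sketch(const void *p, const void *d)
{
    const TranslationBlock *tb = p;
    const struct tb_desc_sketch *desc = d;

    return tb->pc == desc->pc && tb->flags == desc->flags;
}

static TranslationBlock *tb_find_sketch(tb_page_addr_t phys_pc,
                                        target_ulong pc, uint32_t flags)
{
    struct tb_desc_sketch desc = { .pc = pc, .flags = flags };
    uint32_t h = tb_hash_func(phys_pc, pc, flags);

    /* read-only lookup: no MRU promotion, hence no writes to shared data */
    return qht_lookup(&tcg_ctx.tb_ctx.htable, tb_cmp_sketch, &desc, h);
}
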
The improvement we get before and after this patch for booting debian jessie with arm-softmmu is: - Intel Xeon E5-2690: 10.5% less time - Intel i7-4790K: 5.2% less time We could get this same improvement _for this particular workload_ by statically increasing the size of the hash table. But this would hurt workloads that do not need a large hash table. The dynamic (upward) resizing allows us to start small and enlarge the hash table as needed. A quick note on downsizing: the table is resized back to 2**15 buckets on every tb_flush; this makes sense because it is not guaranteed that the table will reach the same number of TBs later on (e.g. most bootup code is thrown away after boot); it makes sense to grow the hash table as more code blocks are translated. This also avoids the complication of having to build downsizing hysteresis logic into qht. Reviewed-by: Sergey Fedorov Reviewed-by: Alex Bennée Reviewed-by: Richard Henderson Signed-off-by: Emilio G. Cota Message-Id: <1465412133-3029-15-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- cpu-exec.c | 86 +++++++++++++++++++-------------------- include/exec/exec-all.h | 2 - include/exec/tb-context.h | 7 ++-- include/exec/tb-hash.h | 3 +- translate-all.c | 85 +++++++++++++++++--------------------- 5 files changed, 86 insertions(+), 97 deletions(-) diff --git a/cpu-exec.c b/cpu-exec.c index b9e294c0e6..b840e1d2dd 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -225,57 +225,57 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles, } #endif +struct tb_desc { + target_ulong pc; + target_ulong cs_base; + CPUArchState *env; + tb_page_addr_t phys_page1; + uint32_t flags; +}; + +static bool tb_cmp(const void *p, const void *d) +{ + const TranslationBlock *tb = p; + const struct tb_desc *desc = d; + + if (tb->pc == desc->pc && + tb->page_addr[0] == desc->phys_page1 && + tb->cs_base == desc->cs_base && + tb->flags == desc->flags) { + /* check next page if needed */ + if (tb->page_addr[1] == -1) { + return true; + } else { + tb_page_addr_t phys_page2; + target_ulong virt_page2; + + virt_page2 = (desc->pc & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE; + phys_page2 = get_page_addr_code(desc->env, virt_page2); + if (tb->page_addr[1] == phys_page2) { + return true; + } + } + } + return false; +} + static TranslationBlock *tb_find_physical(CPUState *cpu, target_ulong pc, target_ulong cs_base, uint32_t flags) { - CPUArchState *env = (CPUArchState *)cpu->env_ptr; - TranslationBlock *tb, **tb_hash_head, **ptb1; + tb_page_addr_t phys_pc; + struct tb_desc desc; uint32_t h; - tb_page_addr_t phys_pc, phys_page1; - /* find translated block using physical mappings */ - phys_pc = get_page_addr_code(env, pc); - phys_page1 = phys_pc & TARGET_PAGE_MASK; + desc.env = (CPUArchState *)cpu->env_ptr; + desc.cs_base = cs_base; + desc.flags = flags; + desc.pc = pc; + phys_pc = get_page_addr_code(desc.env, pc); + desc.phys_page1 = phys_pc & TARGET_PAGE_MASK; h = tb_hash_func(phys_pc, pc, flags); - - /* Start at head of the hash entry */ - ptb1 = tb_hash_head = &tcg_ctx.tb_ctx.tb_phys_hash[h]; - tb = *ptb1; - - while (tb) { - if (tb->pc == pc && - tb->page_addr[0] == phys_page1 && - tb->cs_base == cs_base && - tb->flags == flags) { - - if (tb->page_addr[1] == -1) { - /* done, we have a match */ - break; - } else { - /* check next page if needed */ - target_ulong virt_page2 = (pc & TARGET_PAGE_MASK) + - TARGET_PAGE_SIZE; - tb_page_addr_t phys_page2 = get_page_addr_code(env, virt_page2); - - if (tb->page_addr[1] == phys_page2) { - break; - } - } - } - - ptb1 = 
&tb->phys_hash_next; - tb = *ptb1; - } - - if (tb) { - /* Move the TB to the head of the list */ - *ptb1 = tb->phys_hash_next; - tb->phys_hash_next = *tb_hash_head; - *tb_hash_head = tb; - } - return tb; + return qht_lookup(&tcg_ctx.tb_ctx.htable, tb_cmp, &desc, h); } static TranslationBlock *tb_find_slow(CPUState *cpu, diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index e076397e2f..c1f59fa59d 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -215,8 +215,6 @@ struct TranslationBlock { void *tc_ptr; /* pointer to the translated code */ uint8_t *tc_search; /* pointer to search data */ - /* next matching tb for physical address. */ - struct TranslationBlock *phys_hash_next; /* original tb when cflags has CF_NOCACHE */ struct TranslationBlock *orig_tb; /* first and second physical page containing code. The lower bit diff --git a/include/exec/tb-context.h b/include/exec/tb-context.h index 5efe3d9087..e209c1c28c 100644 --- a/include/exec/tb-context.h +++ b/include/exec/tb-context.h @@ -21,9 +21,10 @@ #define QEMU_TB_CONTEXT_H_ #include "qemu/thread.h" +#include "qemu/qht.h" -#define CODE_GEN_PHYS_HASH_BITS 15 -#define CODE_GEN_PHYS_HASH_SIZE (1 << CODE_GEN_PHYS_HASH_BITS) +#define CODE_GEN_HTABLE_BITS 15 +#define CODE_GEN_HTABLE_SIZE (1 << CODE_GEN_HTABLE_BITS) typedef struct TranslationBlock TranslationBlock; typedef struct TBContext TBContext; @@ -31,7 +32,7 @@ typedef struct TBContext TBContext; struct TBContext { TranslationBlock *tbs; - TranslationBlock *tb_phys_hash[CODE_GEN_PHYS_HASH_SIZE]; + struct qht htable; int nb_tbs; /* any access to the tbs or the page table must use this lock */ QemuMutex tb_lock; diff --git a/include/exec/tb-hash.h b/include/exec/tb-hash.h index 88ccfd1730..1d0200bc91 100644 --- a/include/exec/tb-hash.h +++ b/include/exec/tb-hash.h @@ -20,7 +20,6 @@ #ifndef EXEC_TB_HASH #define EXEC_TB_HASH -#include "exec/exec-all.h" #include "exec/tb-hash-xx.h" /* Only the bottom TB_JMP_PAGE_BITS of the jump cache hash bits vary for @@ -49,7 +48,7 @@ static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc) static inline uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, uint32_t flags) { - return tb_hash_func5(phys_pc, pc, flags) & (CODE_GEN_PHYS_HASH_SIZE - 1); + return tb_hash_func5(phys_pc, pc, flags); } #endif diff --git a/translate-all.c b/translate-all.c index d75737c3aa..b620fccfc2 100644 --- a/translate-all.c +++ b/translate-all.c @@ -735,6 +735,13 @@ static inline void code_gen_alloc(size_t tb_size) qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock); } +static void tb_htable_init(void) +{ + unsigned int mode = QHT_MODE_AUTO_RESIZE; + + qht_init(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE, mode); +} + /* Must be called before using the QEMU cpus. 'tb_size' is the size (in bytes) allocated to the translation buffer. Zero means default size. 
*/ @@ -742,6 +749,7 @@ void tcg_exec_init(unsigned long tb_size) { cpu_gen_init(); page_init(); + tb_htable_init(); code_gen_alloc(tb_size); #if defined(CONFIG_SOFTMMU) /* There's no guest base to take into account, so go ahead and @@ -846,7 +854,7 @@ void tb_flush(CPUState *cpu) cpu->tb_flushed = true; } - memset(tcg_ctx.tb_ctx.tb_phys_hash, 0, sizeof(tcg_ctx.tb_ctx.tb_phys_hash)); + qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE); page_flush_tb(); tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer; @@ -857,60 +865,46 @@ void tb_flush(CPUState *cpu) #ifdef DEBUG_TB_CHECK +static void +do_tb_invalidate_check(struct qht *ht, void *p, uint32_t hash, void *userp) +{ + TranslationBlock *tb = p; + target_ulong addr = *(target_ulong *)userp; + + if (!(addr + TARGET_PAGE_SIZE <= tb->pc || addr >= tb->pc + tb->size)) { + printf("ERROR invalidate: address=" TARGET_FMT_lx + " PC=%08lx size=%04x\n", addr, (long)tb->pc, tb->size); + } +} + static void tb_invalidate_check(target_ulong address) { - TranslationBlock *tb; - int i; - address &= TARGET_PAGE_MASK; - for (i = 0; i < CODE_GEN_PHYS_HASH_SIZE; i++) { - for (tb = tcg_ctx.tb_ctx.tb_phys_hash[i]; tb != NULL; - tb = tb->phys_hash_next) { - if (!(address + TARGET_PAGE_SIZE <= tb->pc || - address >= tb->pc + tb->size)) { - printf("ERROR invalidate: address=" TARGET_FMT_lx - " PC=%08lx size=%04x\n", - address, (long)tb->pc, tb->size); - } - } + qht_iter(&tcg_ctx.tb_ctx.htable, do_tb_invalidate_check, &address); +} + +static void +do_tb_page_check(struct qht *ht, void *p, uint32_t hash, void *userp) +{ + TranslationBlock *tb = p; + int flags1, flags2; + + flags1 = page_get_flags(tb->pc); + flags2 = page_get_flags(tb->pc + tb->size - 1); + if ((flags1 & PAGE_WRITE) || (flags2 & PAGE_WRITE)) { + printf("ERROR page flags: PC=%08lx size=%04x f1=%x f2=%x\n", + (long)tb->pc, tb->size, flags1, flags2); } } /* verify that all the pages have correct rights for code */ static void tb_page_check(void) { - TranslationBlock *tb; - int i, flags1, flags2; - - for (i = 0; i < CODE_GEN_PHYS_HASH_SIZE; i++) { - for (tb = tcg_ctx.tb_ctx.tb_phys_hash[i]; tb != NULL; - tb = tb->phys_hash_next) { - flags1 = page_get_flags(tb->pc); - flags2 = page_get_flags(tb->pc + tb->size - 1); - if ((flags1 & PAGE_WRITE) || (flags2 & PAGE_WRITE)) { - printf("ERROR page flags: PC=%08lx size=%04x f1=%x f2=%x\n", - (long)tb->pc, tb->size, flags1, flags2); - } - } - } + qht_iter(&tcg_ctx.tb_ctx.htable, do_tb_page_check, NULL); } #endif -static inline void tb_hash_remove(TranslationBlock **ptb, TranslationBlock *tb) -{ - TranslationBlock *tb1; - - for (;;) { - tb1 = *ptb; - if (tb1 == tb) { - *ptb = tb1->phys_hash_next; - break; - } - ptb = &tb1->phys_hash_next; - } -} - static inline void tb_page_remove(TranslationBlock **ptb, TranslationBlock *tb) { TranslationBlock *tb1; @@ -998,7 +992,7 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr) /* remove the TB from the hash list */ phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK); h = tb_hash_func(phys_pc, tb->pc, tb->flags); - tb_hash_remove(&tcg_ctx.tb_ctx.tb_phys_hash[h], tb); + qht_remove(&tcg_ctx.tb_ctx.htable, tb, h); /* remove the TB from the page list */ if (tb->page_addr[0] != page_addr) { @@ -1128,13 +1122,10 @@ static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc, tb_page_addr_t phys_page2) { uint32_t h; - TranslationBlock **ptb; /* add in the hash table */ h = tb_hash_func(phys_pc, tb->pc, tb->flags); - ptb = &tcg_ctx.tb_ctx.tb_phys_hash[h]; - tb->phys_hash_next = *ptb; - *ptb 
= tb; + qht_insert(&tcg_ctx.tb_ctx.htable, tb, h); /* add in the page list */ tb_alloc_page(tb, 0, phys_pc & TARGET_PAGE_MASK); From 329844d4bc3d5a11f1e63938d66f74c9584c7abc Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Wed, 8 Jun 2016 14:55:33 -0400 Subject: [PATCH 15/15] translate-all: add tb hash bucket info to 'info jit' dump MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Examples: - Good hashing, i.e. tb_hash_func5(phys_pc, pc, flags): TB count 715135/2684354 [...] TB hash buckets 388775/524288 (74.15% head buckets used) TB hash occupancy 33.04% avg chain occ. Histogram: [0,10)%|▆ █ ▅▁▃▁▁|[90,100]% TB hash avg chain 1.017 buckets. Histogram: 1|█▁▁|3 - Not-so-good hashing, i.e. tb_hash_func5(phys_pc, pc, 0): TB count 712636/2684354 [...] TB hash buckets 344924/524288 (65.79% head buckets used) TB hash occupancy 31.64% avg chain occ. Histogram: [0,10)%|█ ▆ ▅▁▃▁▂|[90,100]% TB hash avg chain 1.047 buckets. Histogram: 1|█▁▁▁|4 - Bad hashing, i.e. tb_hash_func5(phys_pc, 0, 0): TB count 702818/2684354 [...] TB hash buckets 112741/524288 (21.50% head buckets used) TB hash occupancy 10.15% avg chain occ. Histogram: [0,10)%|█ ▁ ▁▁▁▁▁|[90,100]% TB hash avg chain 2.107 buckets. Histogram: [1.0,10.2)|█▁▁▁▁▁▁▁▁▁|[83.8,93.0] - Good hashing, but no auto-resize: TB count 715634/2684354 TB hash buckets 8192/8192 (100.00% head buckets used) TB hash occupancy 98.30% avg chain occ. Histogram: [95.3,95.8)%|▁▁▃▄▃▄▁▇▁█|[99.5,100.0]% TB hash avg chain 22.070 buckets. Histogram: [15.0,16.7)|▁▂▅▄█▅▁▁▁▁|[30.3,32.0] Acked-by: Sergey Fedorov Suggested-by: Richard Henderson Reviewed-by: Richard Henderson Signed-off-by: Emilio G. Cota Message-Id: <1465412133-3029-16-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- translate-all.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/translate-all.c b/translate-all.c index b620fccfc2..e8b88b4485 100644 --- a/translate-all.c +++ b/translate-all.c @@ -1668,6 +1668,10 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf) int i, target_code_size, max_target_code_size; int direct_jmp_count, direct_jmp2_count, cross_page; TranslationBlock *tb; + struct qht_stats hst; + uint32_t hgram_opts; + size_t hgram_bins; + char *hgram; target_code_size = 0; max_target_code_size = 0; @@ -1718,6 +1722,38 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf) direct_jmp2_count, tcg_ctx.tb_ctx.nb_tbs ? (direct_jmp2_count * 100) / tcg_ctx.tb_ctx.nb_tbs : 0); + + qht_statistics_init(&tcg_ctx.tb_ctx.htable, &hst); + + cpu_fprintf(f, "TB hash buckets %zu/%zu (%0.2f%% head buckets used)\n", + hst.used_head_buckets, hst.head_buckets, + (double)hst.used_head_buckets / hst.head_buckets * 100); + + hgram_opts = QDIST_PR_BORDER | QDIST_PR_LABELS; + hgram_opts |= QDIST_PR_100X | QDIST_PR_PERCENT; + if (qdist_xmax(&hst.occupancy) - qdist_xmin(&hst.occupancy) == 1) { + hgram_opts |= QDIST_PR_NODECIMAL; + } + hgram = qdist_pr(&hst.occupancy, 10, hgram_opts); + cpu_fprintf(f, "TB hash occupancy %0.2f%% avg chain occ. Histogram: %s\n", + qdist_avg(&hst.occupancy) * 100, hgram); + g_free(hgram); + + hgram_opts = QDIST_PR_BORDER | QDIST_PR_LABELS; + hgram_bins = qdist_xmax(&hst.chain) - qdist_xmin(&hst.chain); + if (hgram_bins > 10) { + hgram_bins = 10; + } else { + hgram_bins = 0; + hgram_opts |= QDIST_PR_NODECIMAL | QDIST_PR_NOBINRANGE; + } + hgram = qdist_pr(&hst.chain, hgram_bins, hgram_opts); + cpu_fprintf(f, "TB hash avg chain %0.3f buckets. 
Histogram: %s\n", + qdist_avg(&hst.chain), hgram); + g_free(hgram); + + qht_statistics_destroy(&hst); + cpu_fprintf(f, "\nStatistics:\n"); cpu_fprintf(f, "TB flush count %d\n", tcg_ctx.tb_ctx.tb_flush_count); cpu_fprintf(f, "TB invalidate count %d\n",