Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20180615' into staging

TCG patch queue:

Work around macOS assembler lossage.
Eliminate tb_lock.
Fix TB code generation overflow.

# gpg: Signature made Fri 15 Jun 2018 20:40:56 BST
# gpg:                using RSA key 64DF38E8AF7E215F
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>"
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A  05C0 64DF 38E8 AF7E 215F

* remotes/rth/tags/pull-tcg-20180615:
  tcg: Reduce max TB opcode count
  tcg: remove tb_lock
  translate-all: remove tb_lock mention from cpu_restore_state_from_tb
  cputlb: remove tb_lock from tlb_flush functions
  translate-all: protect TB jumps with a per-destination-TB lock
  translate-all: discard TB when tb_link_page returns an existing matching TB
  translate-all: introduce assert_no_pages_locked
  translate-all: add page_locked assertions
  translate-all: use per-page locking in !user-mode
  translate-all: move tb_invalidate_phys_page_range up in the file
  translate-all: work page-by-page in tb_invalidate_phys_range_1
  translate-all: remove hole in PageDesc
  translate-all: make l1_map lockless
  translate-all: iterate over TBs in a page with PAGE_FOR_EACH_TB
  tcg: move tb_ctx.tb_phys_invalidate_count to tcg_ctx
  tcg: track TBs with per-region BST's
  qht: return existing entry when qht_insert fails
  qht: require a default comparison function
  tcg/i386: Use byte form of xgetbv instruction

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
commit 33836a7315
Peter Maydell, 2018-06-21 17:54:26 +01:00
25 changed files with 1157 additions and 490 deletions

accel/tcg/cpu-exec.c

@@ -212,20 +212,20 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
We only end up here when an existing TB is too long. */
cflags |= MIN(max_cycles, CF_COUNT_MASK);
tb_lock();
mmap_lock();
tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base,
orig_tb->flags, cflags);
tb->orig_tb = orig_tb;
tb_unlock();
mmap_unlock();
/* execute the generated code */
trace_exec_tb_nocache(tb, tb->pc);
cpu_tb_exec(cpu, tb);
tb_lock();
mmap_lock();
tb_phys_invalidate(tb, -1);
tb_remove(tb);
tb_unlock();
mmap_unlock();
tcg_tb_remove(tb);
}
#endif
@@ -244,12 +244,7 @@ void cpu_exec_step_atomic(CPUState *cpu)
tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
if (tb == NULL) {
mmap_lock();
tb_lock();
tb = tb_htable_lookup(cpu, pc, cs_base, flags, cf_mask);
if (likely(tb == NULL)) {
tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
}
tb_unlock();
tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
mmap_unlock();
}
@@ -264,15 +259,14 @@ void cpu_exec_step_atomic(CPUState *cpu)
cpu_tb_exec(cpu, tb);
cc->cpu_exec_exit(cpu);
} else {
/* We may have exited due to another problem here, so we need
* to reset any tb_locks we may have taken but didn't release.
/*
* The mmap_lock is dropped by tb_gen_code if it runs out of
* memory.
*/
#ifndef CONFIG_SOFTMMU
tcg_debug_assert(!have_mmap_lock());
#endif
tb_lock_reset();
assert_no_pages_locked();
}
if (in_exclusive_region) {
@@ -295,7 +289,7 @@ struct tb_desc {
uint32_t trace_vcpu_dstate;
};
static bool tb_cmp(const void *p, const void *d)
static bool tb_lookup_cmp(const void *p, const void *d)
{
const TranslationBlock *tb = p;
const struct tb_desc *desc = d;
@@ -340,7 +334,7 @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
phys_pc = get_page_addr_code(desc.env, pc);
desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
h = tb_hash_func(phys_pc, pc, flags, cf_mask, *cpu->trace_dstate);
return qht_lookup(&tb_ctx.htable, tb_cmp, &desc, h);
return qht_lookup_custom(&tb_ctx.htable, &desc, h, tb_lookup_cmp);
}
void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr)
@@ -354,28 +348,43 @@ void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr)
}
}
/* Called with tb_lock held. */
static inline void tb_add_jump(TranslationBlock *tb, int n,
TranslationBlock *tb_next)
{
uintptr_t old;
assert(n < ARRAY_SIZE(tb->jmp_list_next));
if (tb->jmp_list_next[n]) {
/* Another thread has already done this while we were
* outside of the lock; nothing to do in this case */
return;
qemu_spin_lock(&tb_next->jmp_lock);
/* make sure the destination TB is valid */
if (tb_next->cflags & CF_INVALID) {
goto out_unlock_next;
}
/* Atomically claim the jump destination slot only if it was NULL */
old = atomic_cmpxchg(&tb->jmp_dest[n], (uintptr_t)NULL, (uintptr_t)tb_next);
if (old) {
goto out_unlock_next;
}
/* patch the native jump address */
tb_set_jmp_target(tb, n, (uintptr_t)tb_next->tc.ptr);
/* add in TB jmp list */
tb->jmp_list_next[n] = tb_next->jmp_list_head;
tb_next->jmp_list_head = (uintptr_t)tb | n;
qemu_spin_unlock(&tb_next->jmp_lock);
qemu_log_mask_and_addr(CPU_LOG_EXEC, tb->pc,
"Linking TBs %p [" TARGET_FMT_lx
"] index %d -> %p [" TARGET_FMT_lx "]\n",
tb->tc.ptr, tb->pc, n,
tb_next->tc.ptr, tb_next->pc);
return;
/* patch the native jump address */
tb_set_jmp_target(tb, n, (uintptr_t)tb_next->tc.ptr);
/* add in TB jmp circular list */
tb->jmp_list_next[n] = tb_next->jmp_list_first;
tb_next->jmp_list_first = (uintptr_t)tb | n;
out_unlock_next:
qemu_spin_unlock(&tb_next->jmp_lock);
return;
}
static inline TranslationBlock *tb_find(CPUState *cpu,
@@ -385,27 +394,11 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
TranslationBlock *tb;
target_ulong cs_base, pc;
uint32_t flags;
bool acquired_tb_lock = false;
tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
if (tb == NULL) {
/* mmap_lock is needed by tb_gen_code, and mmap_lock must be
* taken outside tb_lock. As system emulation is currently
* single threaded the locks are NOPs.
*/
mmap_lock();
tb_lock();
acquired_tb_lock = true;
/* There's a chance that our desired tb has been translated while
* taking the locks so we check again inside the lock.
*/
tb = tb_htable_lookup(cpu, pc, cs_base, flags, cf_mask);
if (likely(tb == NULL)) {
/* if no translated code available, then translate it now */
tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
}
tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
mmap_unlock();
/* We add the TB in the virtual pc hash table for the fast lookup */
atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
@@ -421,16 +414,7 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
#endif
/* See if we can patch the calling TB. */
if (last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
if (!acquired_tb_lock) {
tb_lock();
acquired_tb_lock = true;
}
if (!(tb->cflags & CF_INVALID)) {
tb_add_jump(last_tb, tb_exit, tb);
}
}
if (acquired_tb_lock) {
tb_unlock();
tb_add_jump(last_tb, tb_exit, tb);
}
return tb;
}
@@ -706,7 +690,9 @@ int cpu_exec(CPUState *cpu)
g_assert(cpu == current_cpu);
g_assert(cc == CPU_GET_CLASS(cpu));
#endif /* buggy compiler */
tb_lock_reset();
#ifndef CONFIG_SOFTMMU
tcg_debug_assert(!have_mmap_lock());
#endif
if (qemu_mutex_iothread_locked()) {
qemu_mutex_unlock_iothread();
}
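
The tb_add_jump() hunk above threads the origin TB onto the destination's list of incoming jumps using tagged pointers: TranslationBlock is at least 4-byte aligned, so the low bit of the stored address is free to record which of the origin's two outgoing jumps is involved. A minimal standalone sketch of the encoding and traversal (helper names are hypothetical; QEMU open-codes these operations):

#include <stdint.h>

typedef struct TB TB;                  /* stand-in for TranslationBlock */
struct TB {
    uintptr_t jmp_list_head;           /* first incoming jump, tagged */
    uintptr_t jmp_list_next[2];        /* list links, one per outgoing jump */
};

/* Pack an origin TB pointer with its outgoing-jump index n (0 or 1). */
static uintptr_t tb_jmp_entry(TB *origin, int n)
{
    return (uintptr_t)origin | n;
}

/* Link origin's jump n into dest's incoming list (mirrors the hunk above). */
static void link_incoming(TB *dest, TB *origin, int n)
{
    origin->jmp_list_next[n] = dest->jmp_list_head;
    dest->jmp_list_head = tb_jmp_entry(origin, n);
}

/* Visit every TB that jumps into @dest; caller holds dest->jmp_lock. */
static void visit_incoming(TB *dest, void (*fn)(TB *origin, int n))
{
    uintptr_t e = dest->jmp_list_head;

    while (e) {                         /* the list is NULL-terminated */
        TB *origin = (TB *)(e & ~(uintptr_t)1);
        int n = e & 1;

        fn(origin, n);
        e = origin->jmp_list_next[n];   /* follow the tagged link */
    }
}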

accel/tcg/cputlb.c

@@ -125,8 +125,6 @@ static void tlb_flush_nocheck(CPUState *cpu)
atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);
tlb_debug("(count: %zu)\n", tlb_flush_count());
tb_lock();
memset(env->tlb_table, -1, sizeof(env->tlb_table));
memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
cpu_tb_jmp_cache_clear(cpu);
@@ -135,8 +133,6 @@ static void tlb_flush_nocheck(CPUState *cpu)
env->tlb_flush_addr = -1;
env->tlb_flush_mask = 0;
tb_unlock();
atomic_mb_set(&cpu->pending_tlb_flush, 0);
}
@@ -180,8 +176,6 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
assert_cpu_is_self(cpu);
tb_lock();
tlb_debug("start: mmu_idx:0x%04lx\n", mmu_idx_bitmask);
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
@@ -197,8 +191,6 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
cpu_tb_jmp_cache_clear(cpu);
tlb_debug("done\n");
tb_unlock();
}
void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)

accel/tcg/translate-all.c: diff suppressed because it is too large

accel/tcg/translate-all.h

@@ -23,7 +23,11 @@
/* translate-all.c */
void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len);
struct page_collection *page_collection_lock(tb_page_addr_t start,
tb_page_addr_t end);
void page_collection_unlock(struct page_collection *set);
void tb_invalidate_phys_page_fast(struct page_collection *pages,
tb_page_addr_t start, int len);
void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
int is_cpu_write_access);
void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end);
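
The page_collection API above brackets a batch of fast-path invalidations between a single lock and unlock call; the exec.c hunks later in this commit are the real user. A usage sketch (the wrapper function is hypothetical):

/* Invalidate TBs overlapping a guest write of @len bytes at @start. */
static void invalidate_write_span(tb_page_addr_t start, int len)
{
    struct page_collection *pages;

    /* Locks every page descriptor intersecting [start, start + len)
     * and keeps them locked until the matching unlock. */
    pages = page_collection_lock(start, start + len);
    tb_invalidate_phys_page_fast(pages, start, len);
    page_collection_unlock(pages);
}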

docs/devel/multi-thread-tcg.txt

@@ -61,6 +61,7 @@ have their block-to-block jumps patched.
Global TCG State
----------------
### User-mode emulation
We need to protect the entire code generation cycle including any post
generation patching of the translated code. This also implies a shared
translation buffer which contains code running on all cores. Any
@@ -75,9 +76,11 @@ patching.
(Current solution)
Mainly as part of the linux-user work all code generation is
serialised with a tb_lock(). For the SoftMMU tb_lock() also takes the
place of mmap_lock() in linux-user.
Code generation is serialised with mmap_lock().
### !User-mode emulation
Each vCPU has its own TCG context and associated TCG region, thereby
requiring no locking.
Translation Blocks
------------------
@@ -131,15 +134,20 @@ DESIGN REQUIREMENT: Safely handle invalidation of TBs
The direct jump themselves are updated atomically by the TCG
tb_set_jmp_target() code. Modification to the linked lists that allow
searching for linked pages are done under the protect of the
tb_lock().
searching for linked pages are done under the protection of tb->jmp_lock,
where tb is the destination block of a jump. Each origin block keeps a
pointer to its destinations so that the appropriate lock can be acquired before
iterating over a jump list.
The global page table is protected by the tb_lock() in system-mode and
mmap_lock() in linux-user mode.
The global page table is a lockless radix tree; cmpxchg is used
to atomically insert new elements.
The lookup caches are updated atomically and the lookup hash uses QHT
which is designed for concurrent safe lookup.
Parallel code generation is supported. QHT is used at insertion time
as the synchronization point across threads, thereby ensuring that we only
keep track of a single TranslationBlock for each guest code block.
Memory maps and TLBs
--------------------
@@ -190,7 +198,7 @@ work as "safe work" and exiting the cpu run loop. This ensure by the
time execution restarts all flush operations have completed.
TLB flag updates are all done atomically and are also protected by the
tb_lock() which is used by the functions that update the TLB in bulk.
corresponding page lock.
(Known limitation)
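
The paragraph above notes that the global page table is now a lockless radix tree populated with cmpxchg. A minimal sketch of that idiom in portable C11 (QEMU's l1_map code uses its own atomic wrappers; the names here are illustrative):

#include <stdatomic.h>
#include <stdlib.h>

/* Install a freshly allocated level in an empty slot; if another
 * thread won the race, discard ours and use the winner's. */
static void *page_level_get(_Atomic(void *) *slot, size_t size)
{
    void *existing = atomic_load_explicit(slot, memory_order_acquire);

    if (existing) {
        return existing;
    }
    void *fresh = calloc(1, size);
    if (atomic_compare_exchange_strong(slot, &existing, fresh)) {
        return fresh;               /* we published the new level */
    }
    free(fresh);                    /* lost the race ... */
    return existing;                /* ... 'existing' now holds the winner */
}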

exec.c

@@ -1031,9 +1031,7 @@ const char *parse_cpu_model(const char *cpu_model)
static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
{
mmap_lock();
tb_lock();
tb_invalidate_phys_page_range(pc, pc + 1, 0);
tb_unlock();
mmap_unlock();
}
#else
@@ -2644,21 +2642,21 @@ void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
ndi->ram_addr = ram_addr;
ndi->mem_vaddr = mem_vaddr;
ndi->size = size;
ndi->locked = false;
ndi->pages = NULL;
assert(tcg_enabled());
if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
ndi->locked = true;
tb_lock();
tb_invalidate_phys_page_fast(ram_addr, size);
ndi->pages = page_collection_lock(ram_addr, ram_addr + size);
tb_invalidate_phys_page_fast(ndi->pages, ram_addr, size);
}
}
/* Called within RCU critical section. */
void memory_notdirty_write_complete(NotDirtyInfo *ndi)
{
if (ndi->locked) {
tb_unlock();
if (ndi->pages) {
page_collection_unlock(ndi->pages);
ndi->pages = NULL;
}
/* Set both VGA and migration bits for simplicity and to remove
@@ -2745,18 +2743,16 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
}
cpu->watchpoint_hit = wp;
/* Both tb_lock and iothread_mutex will be reset when
* cpu_loop_exit or cpu_loop_exit_noexc longjmp
* back into the cpu_exec main loop.
*/
tb_lock();
mmap_lock();
tb_check_watchpoint(cpu);
if (wp->flags & BP_STOP_BEFORE_ACCESS) {
cpu->exception_index = EXCP_DEBUG;
mmap_unlock();
cpu_loop_exit(cpu);
} else {
/* Force execution of one insn next time. */
cpu->cflags_next_tb = 1 | curr_cflags();
mmap_unlock();
cpu_loop_exit_noexc(cpu);
}
}
@@ -3147,9 +3143,9 @@ static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
}
if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
assert(tcg_enabled());
tb_lock();
mmap_lock();
tb_invalidate_phys_range(addr, addr + length);
tb_unlock();
mmap_unlock();
dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
}
cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);

include/exec/cpu-common.h

@@ -23,7 +23,7 @@ typedef struct CPUListState {
FILE *file;
} CPUListState;
/* The CPU list lock nests outside tb_lock/tb_unlock. */
/* The CPU list lock nests outside page_(un)lock or mmap_(un)lock */
void qemu_init_cpu_list(void);
void cpu_list_lock(void);
void cpu_list_unlock(void);

include/exec/exec-all.h

@@ -345,7 +345,7 @@ struct TranslationBlock {
#define CF_LAST_IO 0x00008000 /* Last insn may be an IO access. */
#define CF_NOCACHE 0x00010000 /* To be freed after execution */
#define CF_USE_ICOUNT 0x00020000
#define CF_INVALID 0x00040000 /* TB is stale. Setters need tb_lock */
#define CF_INVALID 0x00040000 /* TB is stale. Set with @jmp_lock held */
#define CF_PARALLEL 0x00080000 /* Generate code for a parallel context */
/* cflags' mask for hashing/comparison */
#define CF_HASH_MASK \
@@ -359,10 +359,14 @@ struct TranslationBlock {
/* original tb when cflags has CF_NOCACHE */
struct TranslationBlock *orig_tb;
/* first and second physical page containing code. The lower bit
of the pointer tells the index in page_next[] */
struct TranslationBlock *page_next[2];
of the pointer tells the index in page_next[].
The list is protected by the TB's page('s) lock(s) */
uintptr_t page_next[2];
tb_page_addr_t page_addr[2];
/* jmp_lock placed here to fill a 4-byte hole. Its documentation is below */
QemuSpin jmp_lock;
/* The following data are used to directly call another TB from
* the code of this one. This can be done either by emitting direct or
* indirect native jump instructions. These jumps are reset so that the TB
@@ -374,20 +378,26 @@ struct TranslationBlock {
#define TB_JMP_RESET_OFFSET_INVALID 0xffff /* indicates no jump generated */
uintptr_t jmp_target_arg[2]; /* target address or offset */
/* Each TB has an associated circular list of TBs jumping to this one.
* jmp_list_first points to the first TB jumping to this one.
* jmp_list_next is used to point to the next TB in a list.
* Since each TB can have two jumps, it can participate in two lists.
* jmp_list_first and jmp_list_next are 4-byte aligned pointers to a
* TranslationBlock structure, but the two least significant bits of
* them are used to encode which data field of the pointed TB should
* be used to traverse the list further from that TB:
* 0 => jmp_list_next[0], 1 => jmp_list_next[1], 2 => jmp_list_first.
* In other words, 0/1 tells which jump is used in the pointed TB,
* and 2 means that this is a pointer back to the target TB of this list.
/*
* Each TB has a NULL-terminated list (jmp_list_head) of incoming jumps.
* Each TB can have two outgoing jumps, and therefore can participate
* in two lists. The list entries are kept in jmp_list_next[2]. The least
* significant bit (LSB) of the pointers in these lists is used to encode
* which of the two list entries is to be used in the pointed TB.
*
* List traversals are protected by jmp_lock. The destination TB of each
* outgoing jump is kept in jmp_dest[] so that the appropriate jmp_lock
* can be acquired from any origin TB.
*
* jmp_dest[] are tagged pointers as well. The LSB is set when the TB is
* being invalidated, so that no further outgoing jumps from it can be set.
*
* jmp_lock also protects the CF_INVALID cflag; a jump must not be chained
* to a destination TB that has CF_INVALID set.
*/
uintptr_t jmp_list_head;
uintptr_t jmp_list_next[2];
uintptr_t jmp_list_first;
uintptr_t jmp_dest[2];
};
extern bool parallel_cpus;
@@ -405,7 +415,6 @@ static inline uint32_t curr_cflags(void)
| (use_icount ? CF_USE_ICOUNT : 0);
}
void tb_remove(TranslationBlock *tb);
void tb_flush(CPUState *cpu);
void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr);
TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
@@ -431,9 +440,13 @@ extern uintptr_t tci_tb_ptr;
smaller than 4 bytes, so we don't worry about special-casing this. */
#define GETPC_ADJ 2
void tb_lock(void);
void tb_unlock(void);
void tb_lock_reset(void);
#if !defined(CONFIG_USER_ONLY) && defined(CONFIG_DEBUG_TCG)
void assert_no_pages_locked(void);
#else
static inline void assert_no_pages_locked(void)
{
}
#endif
#if !defined(CONFIG_USER_ONLY)

include/exec/memory-internal.h

@@ -49,6 +49,8 @@ void mtree_print_dispatch(fprintf_function mon, void *f,
struct AddressSpaceDispatch *d,
MemoryRegion *root);
struct page_collection;
/* Opaque struct for passing info from memory_notdirty_write_prepare()
* to memory_notdirty_write_complete(). Callers should treat all fields
* as private, with the exception of @active.
@@ -60,10 +62,10 @@ void mtree_print_dispatch(fprintf_function mon, void *f,
*/
typedef struct {
CPUState *cpu;
struct page_collection *pages;
ram_addr_t ram_addr;
vaddr mem_vaddr;
unsigned size;
bool locked;
bool active;
} NotDirtyInfo;
@@ -91,7 +93,7 @@ typedef struct {
*
* This must only be called if we are using TCG; it will assert otherwise.
*
* We may take a lock in the prepare call, so callers must ensure that
* We may take locks in the prepare call, so callers must ensure that
* they don't exit (via longjump or otherwise) without calling complete.
*
* This call must only be made inside an RCU critical section.
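
Putting the contract above together: prepare may take page locks, complete must always follow, and both must happen inside one RCU critical section. A usage sketch; the caller shape and exact parameter order are assumptions inferred from the field assignments in the exec.c hunk earlier in this commit:

#include <string.h>

/* Guarded guest store into RAM that may contain translated code. */
static void notdirty_store(CPUState *cpu, vaddr va, ram_addr_t ram_addr,
                           void *host, const void *val, unsigned size)
{
    NotDirtyInfo ndi;

    rcu_read_lock();
    /* May lock the pages covering [ram_addr, ram_addr + size). */
    memory_notdirty_write_prepare(&ndi, cpu, va, ram_addr, size);

    memcpy(host, val, size);                /* the actual write */

    memory_notdirty_write_complete(&ndi);   /* drops any page locks */
    rcu_read_unlock();
}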

include/exec/tb-context.h

@@ -31,14 +31,10 @@ typedef struct TBContext TBContext;
struct TBContext {
GTree *tb_tree;
struct qht htable;
/* any access to the tbs or the page table must use this lock */
QemuMutex tb_lock;
/* statistics */
unsigned tb_flush_count;
int tb_phys_invalidate_count;
};
extern TBContext tb_ctx;

include/qemu/qht.h

@@ -11,8 +11,11 @@
#include "qemu/thread.h"
#include "qemu/qdist.h"
typedef bool (*qht_cmp_func_t)(const void *a, const void *b);
struct qht {
struct qht_map *map;
qht_cmp_func_t cmp;
QemuMutex lock; /* serializes setters of ht->map */
unsigned int mode;
};
@@ -47,10 +50,12 @@ typedef void (*qht_iter_func_t)(struct qht *ht, void *p, uint32_t h, void *up);
/**
* qht_init - Initialize a QHT
* @ht: QHT to be initialized
* @cmp: default comparison function. Cannot be NULL.
* @n_elems: number of entries the hash table should be optimized for.
* @mode: bitmask with OR'ed QHT_MODE_*
*/
void qht_init(struct qht *ht, size_t n_elems, unsigned int mode);
void qht_init(struct qht *ht, qht_cmp_func_t cmp, size_t n_elems,
unsigned int mode);
/**
* qht_destroy - destroy a previously initialized QHT
@@ -65,6 +70,7 @@ void qht_destroy(struct qht *ht);
* @ht: QHT to insert to
* @p: pointer to be inserted
* @hash: hash corresponding to @p
* @existing: address where the pointer to an existing entry can be copied to
*
* Attempting to insert a NULL @p is a bug.
* Inserting the same pointer @p with different @hash values is a bug.
@@ -73,16 +79,18 @@ void qht_destroy(struct qht *ht);
* inserted into the hash table.
*
* Returns true on success.
* Returns false if the @p-@hash pair already exists in the hash table.
* Returns false if there is an existing entry in the table that is equivalent
* (i.e. ht->cmp matches and the hash is the same) to @p-@h. If @existing
* is !NULL, a pointer to this existing entry is copied to it.
*/
bool qht_insert(struct qht *ht, void *p, uint32_t hash);
bool qht_insert(struct qht *ht, void *p, uint32_t hash, void **existing);
/**
* qht_lookup - Look up a pointer in a QHT
* qht_lookup_custom - Look up a pointer using a custom comparison function.
* @ht: QHT to be looked up
* @func: function to compare existing pointers against @userp
* @userp: pointer to pass to @func
* @hash: hash of the pointer to be looked up
* @func: function to compare existing pointers against @userp
*
* Needs to be called under an RCU read-critical section.
*
@@ -94,8 +102,18 @@ bool qht_insert(struct qht *ht, void *p, uint32_t hash);
* Returns the corresponding pointer when a match is found.
* Returns NULL otherwise.
*/
void *qht_lookup(struct qht *ht, qht_lookup_func_t func, const void *userp,
uint32_t hash);
void *qht_lookup_custom(struct qht *ht, const void *userp, uint32_t hash,
qht_lookup_func_t func);
/**
* qht_lookup - Look up a pointer in a QHT
* @ht: QHT to be looked up
* @userp: pointer to pass to the comparison function
* @hash: hash of the pointer to be looked up
*
* Calls qht_lookup_custom() using @ht's default comparison function.
*/
void *qht_lookup(struct qht *ht, const void *userp, uint32_t hash);
/**
* qht_remove - remove a pointer from the hash table
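
Putting the reworked QHT API together: the comparison function is supplied once at init time, qht_lookup() uses it implicitly, and qht_insert() can report the equivalent entry that blocked an insertion. A small usage sketch (toy identity hash; as documented above, lookups must run inside an RCU read-side critical section):

#include "qemu/qht.h"

static bool int_cmp(const void *a, const void *b)
{
    return *(const int *)a == *(const int *)b;
}

static void qht_example(void)
{
    struct qht ht;
    static int val = 42;
    uint32_t hash = 42;             /* toy hash: identity */
    void *existing;

    qht_init(&ht, int_cmp, 1024, QHT_MODE_AUTO_RESIZE);

    if (!qht_insert(&ht, &val, hash, &existing)) {
        /* an equivalent entry was already present; reuse 'existing' */
    }

    rcu_read_lock();
    int *p = qht_lookup(&ht, &val, hash);   /* compares with int_cmp */
    rcu_read_unlock();
    (void)p;

    qht_destroy(&ht);
}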

linux-user/main.c

@@ -120,7 +120,6 @@ void fork_start(void)
{
start_exclusive();
mmap_fork_start();
qemu_mutex_lock(&tb_ctx.tb_lock);
cpu_list_lock();
}
@@ -136,14 +135,12 @@ void fork_end(int child)
QTAILQ_REMOVE(&cpus, cpu, node);
}
}
qemu_mutex_init(&tb_ctx.tb_lock);
qemu_init_cpu_list();
gdbserver_fork(thread_cpu);
/* qemu_init_cpu_list() takes care of reinitializing the
* exclusive state, so we don't need to end_exclusive() here.
*/
} else {
qemu_mutex_unlock(&tb_ctx.tb_lock);
cpu_list_unlock();
end_exclusive();
}

tcg/aarch64/tcg-target.inc.c

@@ -1733,7 +1733,7 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
tcg_out_insn(s, 3305, LDR, offset, TCG_REG_TMP);
}
tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
set_jmp_reset_offset(s, a0);
break;
case INDEX_op_goto_ptr:

tcg/arm/tcg-target.inc.c

@@ -1822,7 +1822,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
tcg_out_movi32(s, COND_AL, base, ptr - dil);
}
tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil);
s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
set_jmp_reset_offset(s, args[0]);
}
break;
case INDEX_op_goto_ptr:

tcg/i386/tcg-target.inc.c

@@ -2245,7 +2245,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
(intptr_t)(s->tb_jmp_target_addr + a0));
}
s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
set_jmp_reset_offset(s, a0);
break;
case INDEX_op_goto_ptr:
/* jmp to the given host address (could be epilogue) */
@@ -3501,7 +3501,10 @@ static void tcg_target_init(TCGContext *s)
sure of not hitting invalid opcode. */
if (c & bit_OSXSAVE) {
unsigned xcrl, xcrh;
asm ("xgetbv" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
/* The xgetbv instruction is not available to older versions of
* the assembler, so we encode the instruction manually.
*/
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
if ((xcrl & 6) == 6) {
have_avx1 = (c & bit_AVX) != 0;
have_avx2 = (b7 & bit_AVX2) != 0;

tcg/mips/tcg-target.inc.c

@@ -1744,7 +1744,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
tcg_out_opc_reg(s, OPC_JR, 0, TCG_TMP0, 0);
}
tcg_out_nop(s);
s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
set_jmp_reset_offset(s, a0);
break;
case INDEX_op_goto_ptr:
/* jmp to the given host address (could be epilogue) */

tcg/ppc/tcg-target.inc.c

@@ -2025,10 +2025,10 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
}
tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
tcg_out32(s, BCCTR | BO_ALWAYS);
s->tb_jmp_reset_offset[args[0]] = c = tcg_current_code_size(s);
set_jmp_reset_offset(s, args[0]);
if (USE_REG_TB) {
/* For the unlinked case, need to reset TCG_REG_TB. */
c = -c;
c = -tcg_current_code_size(s);
assert(c == (int16_t)c);
tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, c));
}

tcg/s390/tcg-target.inc.c

@@ -1783,7 +1783,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
/* and go there */
tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, TCG_REG_TB);
}
s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
set_jmp_reset_offset(s, a0);
/* For the unlinked path of goto_tb, we need to reset
TCG_REG_TB to the beginning of this TB. */

tcg/sparc/tcg-target.inc.c

@@ -1388,12 +1388,12 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
tcg_out_arithi(s, TCG_REG_G0, TCG_REG_TB, 0, JMPL);
tcg_out_nop(s);
}
s->tb_jmp_reset_offset[a0] = c = tcg_current_code_size(s);
set_jmp_reset_offset(s, a0);
/* For the unlinked path of goto_tb, we need to reset
TCG_REG_TB to the beginning of this TB. */
if (USE_REG_TB) {
c = -c;
c = -tcg_current_code_size(s);
if (check_fit_i32(c, 13)) {
tcg_out_arithi(s, TCG_REG_TB, TCG_REG_TB, c, ARITH_ADD);
} else {

tcg/tcg.c

@@ -135,6 +135,12 @@ static TCGContext **tcg_ctxs;
static unsigned int n_tcg_ctxs;
TCGv_env cpu_env = 0;
struct tcg_region_tree {
QemuMutex lock;
GTree *tree;
/* padding to avoid false sharing is computed at run-time */
};
/*
* We divide code_gen_buffer into equally-sized "regions" that TCG threads
* dynamically allocate from as demand dictates. Given appropriate region
@@ -158,6 +164,13 @@ struct tcg_region_state {
};
static struct tcg_region_state region;
/*
* This is an array of struct tcg_region_tree's, with padding.
* We use void * to simplify the computation of region_trees[i]; each
* struct is found every tree_size bytes.
*/
static void *region_trees;
static size_t tree_size;
static TCGRegSet tcg_target_available_regs[TCG_TYPE_COUNT];
static TCGRegSet tcg_target_call_clobber_regs;
@@ -293,8 +306,190 @@ TCGLabel *gen_new_label(void)
return l;
}
static void set_jmp_reset_offset(TCGContext *s, int which)
{
size_t off = tcg_current_code_size(s);
s->tb_jmp_reset_offset[which] = off;
/* Make sure that we didn't overflow the stored offset. */
assert(s->tb_jmp_reset_offset[which] == off);
}
#include "tcg-target.inc.c"
/* compare a pointer @ptr and a tb_tc @s */
static int ptr_cmp_tb_tc(const void *ptr, const struct tb_tc *s)
{
if (ptr >= s->ptr + s->size) {
return 1;
} else if (ptr < s->ptr) {
return -1;
}
return 0;
}
static gint tb_tc_cmp(gconstpointer ap, gconstpointer bp)
{
const struct tb_tc *a = ap;
const struct tb_tc *b = bp;
/*
* When both sizes are set, we know this isn't a lookup.
* This is the most likely case: every TB must be inserted; lookups
* are a lot less frequent.
*/
if (likely(a->size && b->size)) {
if (a->ptr > b->ptr) {
return 1;
} else if (a->ptr < b->ptr) {
return -1;
}
/* a->ptr == b->ptr should happen only on deletions */
g_assert(a->size == b->size);
return 0;
}
/*
* All lookups have either .size field set to 0.
* From the glib sources we see that @ap is always the lookup key. However
* the docs provide no guarantee, so we just mark this case as likely.
*/
if (likely(a->size == 0)) {
return ptr_cmp_tb_tc(a->ptr, b);
}
return ptr_cmp_tb_tc(b->ptr, a);
}
static void tcg_region_trees_init(void)
{
size_t i;
tree_size = ROUND_UP(sizeof(struct tcg_region_tree), qemu_dcache_linesize);
region_trees = qemu_memalign(qemu_dcache_linesize, region.n * tree_size);
for (i = 0; i < region.n; i++) {
struct tcg_region_tree *rt = region_trees + i * tree_size;
qemu_mutex_init(&rt->lock);
rt->tree = g_tree_new(tb_tc_cmp);
}
}
static struct tcg_region_tree *tc_ptr_to_region_tree(void *p)
{
size_t region_idx;
if (p < region.start_aligned) {
region_idx = 0;
} else {
ptrdiff_t offset = p - region.start_aligned;
if (offset > region.stride * (region.n - 1)) {
region_idx = region.n - 1;
} else {
region_idx = offset / region.stride;
}
}
return region_trees + region_idx * tree_size;
}
void tcg_tb_insert(TranslationBlock *tb)
{
struct tcg_region_tree *rt = tc_ptr_to_region_tree(tb->tc.ptr);
qemu_mutex_lock(&rt->lock);
g_tree_insert(rt->tree, &tb->tc, tb);
qemu_mutex_unlock(&rt->lock);
}
void tcg_tb_remove(TranslationBlock *tb)
{
struct tcg_region_tree *rt = tc_ptr_to_region_tree(tb->tc.ptr);
qemu_mutex_lock(&rt->lock);
g_tree_remove(rt->tree, &tb->tc);
qemu_mutex_unlock(&rt->lock);
}
/*
* Find the TB 'tb' such that
* tb->tc.ptr <= tc_ptr < tb->tc.ptr + tb->tc.size
* Return NULL if not found.
*/
TranslationBlock *tcg_tb_lookup(uintptr_t tc_ptr)
{
struct tcg_region_tree *rt = tc_ptr_to_region_tree((void *)tc_ptr);
TranslationBlock *tb;
struct tb_tc s = { .ptr = (void *)tc_ptr };
qemu_mutex_lock(&rt->lock);
tb = g_tree_lookup(rt->tree, &s);
qemu_mutex_unlock(&rt->lock);
return tb;
}
static void tcg_region_tree_lock_all(void)
{
size_t i;
for (i = 0; i < region.n; i++) {
struct tcg_region_tree *rt = region_trees + i * tree_size;
qemu_mutex_lock(&rt->lock);
}
}
static void tcg_region_tree_unlock_all(void)
{
size_t i;
for (i = 0; i < region.n; i++) {
struct tcg_region_tree *rt = region_trees + i * tree_size;
qemu_mutex_unlock(&rt->lock);
}
}
void tcg_tb_foreach(GTraverseFunc func, gpointer user_data)
{
size_t i;
tcg_region_tree_lock_all();
for (i = 0; i < region.n; i++) {
struct tcg_region_tree *rt = region_trees + i * tree_size;
g_tree_foreach(rt->tree, func, user_data);
}
tcg_region_tree_unlock_all();
}
size_t tcg_nb_tbs(void)
{
size_t nb_tbs = 0;
size_t i;
tcg_region_tree_lock_all();
for (i = 0; i < region.n; i++) {
struct tcg_region_tree *rt = region_trees + i * tree_size;
nb_tbs += g_tree_nnodes(rt->tree);
}
tcg_region_tree_unlock_all();
return nb_tbs;
}
static void tcg_region_tree_reset_all(void)
{
size_t i;
tcg_region_tree_lock_all();
for (i = 0; i < region.n; i++) {
struct tcg_region_tree *rt = region_trees + i * tree_size;
/* Increment the refcount first so that destroy acts as a reset */
g_tree_ref(rt->tree);
g_tree_destroy(rt->tree);
}
tcg_region_tree_unlock_all();
}
static void tcg_region_bounds(size_t curr_region, void **pstart, void **pend)
{
void *start, *end;
@@ -380,6 +575,8 @@ void tcg_region_reset_all(void)
g_assert(!err);
}
qemu_mutex_unlock(&region.lock);
tcg_region_tree_reset_all();
}
#ifdef CONFIG_USER_ONLY
@@ -496,6 +693,8 @@ void tcg_region_init(void)
g_assert(!rc);
}
tcg_region_trees_init();
/* In user-mode we support only one ctx, so do the initial allocation now */
#ifdef CONFIG_USER_ONLY
{
@@ -600,6 +799,20 @@ size_t tcg_code_capacity(void)
return capacity;
}
size_t tcg_tb_phys_invalidate_count(void)
{
unsigned int n_ctxs = atomic_read(&n_tcg_ctxs);
unsigned int i;
size_t total = 0;
for (i = 0; i < n_ctxs; i++) {
const TCGContext *s = atomic_read(&tcg_ctxs[i]);
total += atomic_read(&s->tb_phys_invalidate_count);
}
return total;
}
/* pool based memory allocation */
void *tcg_malloc_internal(TCGContext *s, int size)
{
@@ -3327,7 +3540,10 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
break;
case INDEX_op_insn_start:
if (num_insns >= 0) {
s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
size_t off = tcg_current_code_size(s);
s->gen_insn_end_off[num_insns] = off;
/* Assert that we do not overflow our stored offset. */
assert(s->gen_insn_end_off[num_insns] == off);
}
num_insns++;
for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
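
The per-region trees above replace the single tb_lock-protected GTree in TBContext; tcg_tb_foreach() locks all region trees and walks each one in turn. A sketch of a consumer (hypothetical statistics callback; reading cflags without the jmp_lock is a racy snapshot, which is acceptable for statistics):

/* GTraverseFunc: return FALSE to keep g_tree_foreach iterating. */
static gboolean count_invalid_cb(gpointer key, gpointer value, gpointer data)
{
    const TranslationBlock *tb = value;
    size_t *n = data;

    if (tb->cflags & CF_INVALID) {
        (*n)++;
    }
    return FALSE;
}

static size_t count_invalid_tbs(void)
{
    size_t n = 0;

    tcg_tb_foreach(count_invalid_cb, &n);
    return n;
}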

tcg/tcg.h

@@ -695,6 +695,8 @@ struct TCGContext {
/* Threshold to flush the translated code buffer. */
void *code_gen_highwater;
size_t tb_phys_invalidate_count;
/* Track which vCPU triggers events */
CPUState *cpu; /* *_trans */
@@ -848,14 +850,16 @@ static inline bool tcg_op_buf_full(void)
/* This is not a hard limit, it merely stops translation when
* we have produced "enough" opcodes. We want to limit TB size
* such that a RISC host can reasonably use a 16-bit signed
* branch within the TB.
* branch within the TB. We also need to be mindful of the
* 16-bit unsigned offsets, TranslationBlock.jmp_reset_offset[]
* and TCGContext.gen_insn_end_off[].
*/
return tcg_ctx->nb_ops >= 8000;
return tcg_ctx->nb_ops >= 4000;
}
/* pool based memory allocation */
/* user-mode: tb_lock must be held for tcg_malloc_internal. */
/* user-mode: mmap_lock must be held for tcg_malloc_internal. */
void *tcg_malloc_internal(TCGContext *s, int size);
void tcg_pool_reset(TCGContext *s);
TranslationBlock *tcg_tb_alloc(TCGContext *s);
@@ -866,7 +870,14 @@ void tcg_region_reset_all(void);
size_t tcg_code_size(void);
size_t tcg_code_capacity(void);
/* user-mode: Called with tb_lock held. */
void tcg_tb_insert(TranslationBlock *tb);
void tcg_tb_remove(TranslationBlock *tb);
size_t tcg_tb_phys_invalidate_count(void);
TranslationBlock *tcg_tb_lookup(uintptr_t tc_ptr);
void tcg_tb_foreach(GTraverseFunc func, gpointer user_data);
size_t tcg_nb_tbs(void);
/* user-mode: Called with mmap_lock held. */
static inline void *tcg_malloc(int size)
{
TCGContext *s = tcg_ctx;
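
A back-of-the-envelope check on the tcg_op_buf_full() change above (the per-op byte figures are illustrative, not from the source): the offsets being guarded are 16-bit unsigned, so all of a TB's generated code must stay below 2^16 bytes for them to be representable.

largest representable offset: 2^16 - 1 = 65535 bytes
new opcode cap per TB:        4000 ops
headroom:                     65535 / 4000 ~= 16 host bytes per op

With the previous cap of 8000 ops the headroom was only ~8 bytes per op, which oversized TBs could exceed; the new asserts in set_jmp_reset_offset() and tcg_gen_code() now catch any remaining overflow instead of letting the stored offsets silently wrap.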

tcg/tci/tcg-target.inc.c

@@ -574,7 +574,7 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
/* Indirect jump method. */
TODO();
}
s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
set_jmp_reset_offset(s, args[0]);
break;
case INDEX_op_br:
tci_out_label(s, arg_label(args[0]));

tests/qht-bench.c

@@ -93,10 +93,10 @@ static void usage_complete(int argc, char *argv[])
exit(-1);
}
static bool is_equal(const void *obj, const void *userp)
static bool is_equal(const void *ap, const void *bp)
{
const long *a = obj;
const long *b = userp;
const long *a = ap;
const long *b = bp;
return *a == *b;
}
@@ -150,7 +150,7 @@ static void do_rw(struct thread_info *info)
p = &keys[info->r & (lookup_range - 1)];
hash = h(*p);
read = qht_lookup(&ht, is_equal, p, hash);
read = qht_lookup(&ht, p, hash);
if (read) {
stats->rd++;
} else {
@@ -162,8 +162,8 @@ static void do_rw(struct thread_info *info)
if (info->write_op) {
bool written = false;
if (qht_lookup(&ht, is_equal, p, hash) == NULL) {
written = qht_insert(&ht, p, hash);
if (qht_lookup(&ht, p, hash) == NULL) {
written = qht_insert(&ht, p, hash, NULL);
}
if (written) {
stats->in++;
@@ -173,7 +173,7 @@ static void do_rw(struct thread_info *info)
} else {
bool removed = false;
if (qht_lookup(&ht, is_equal, p, hash)) {
if (qht_lookup(&ht, p, hash)) {
removed = qht_remove(&ht, p, hash);
}
if (removed) {
@@ -308,7 +308,7 @@ static void htable_init(void)
}
/* initialize the hash table */
qht_init(&ht, qht_n_elems, qht_mode);
qht_init(&ht, is_equal, qht_n_elems, qht_mode);
assert(init_size <= init_range);
pr_params();
@@ -322,7 +322,7 @@ static void htable_init(void)
r = xorshift64star(r);
p = &keys[r & (init_range - 1)];
hash = h(*p);
if (qht_insert(&ht, p, hash)) {
if (qht_insert(&ht, p, hash, NULL)) {
break;
}
retries++;

tests/test-qht.c

@@ -13,10 +13,10 @@
static struct qht ht;
static int32_t arr[N * 2];
static bool is_equal(const void *obj, const void *userp)
static bool is_equal(const void *ap, const void *bp)
{
const int32_t *a = obj;
const int32_t *b = userp;
const int32_t *a = ap;
const int32_t *b = bp;
return *a == *b;
}
@@ -27,11 +27,17 @@ static void insert(int a, int b)
for (i = a; i < b; i++) {
uint32_t hash;
void *existing;
bool inserted;
arr[i] = i;
hash = i;
qht_insert(&ht, &arr[i], hash);
inserted = qht_insert(&ht, &arr[i], hash, NULL);
g_assert_true(inserted);
inserted = qht_insert(&ht, &arr[i], hash, &existing);
g_assert_false(inserted);
g_assert_true(existing == &arr[i]);
}
}
@@ -60,7 +66,12 @@ static void check(int a, int b, bool expected)
val = i;
hash = i;
p = qht_lookup(&ht, is_equal, &val, hash);
/* test both lookup variants; results should be the same */
if (i % 2) {
p = qht_lookup(&ht, &val, hash);
} else {
p = qht_lookup_custom(&ht, &val, hash, is_equal);
}
g_assert_true(!!p == expected);
}
rcu_read_unlock();
@@ -102,7 +113,7 @@ static void qht_do_test(unsigned int mode, size_t init_entries)
/* under KVM we might fetch stats from an uninitialized qht */
check_n(0);
qht_init(&ht, 0, mode);
qht_init(&ht, is_equal, 0, mode);
check_n(0);
insert(0, N);

util/qht.c

@@ -351,11 +351,14 @@ static struct qht_map *qht_map_create(size_t n_buckets)
return map;
}
void qht_init(struct qht *ht, size_t n_elems, unsigned int mode)
void qht_init(struct qht *ht, qht_cmp_func_t cmp, size_t n_elems,
unsigned int mode)
{
struct qht_map *map;
size_t n_buckets = qht_elems_to_buckets(n_elems);
g_assert(cmp);
ht->cmp = cmp;
ht->mode = mode;
qemu_mutex_init(&ht->lock);
map = qht_map_create(n_buckets);
@@ -479,8 +482,8 @@ void *qht_lookup__slowpath(struct qht_bucket *b, qht_lookup_func_t func,
return ret;
}
void *qht_lookup(struct qht *ht, qht_lookup_func_t func, const void *userp,
uint32_t hash)
void *qht_lookup_custom(struct qht *ht, const void *userp, uint32_t hash,
qht_lookup_func_t func)
{
struct qht_bucket *b;
struct qht_map *map;
@@ -502,10 +505,15 @@ void *qht_lookup(struct qht *ht, qht_lookup_func_t func, const void *userp,
return qht_lookup__slowpath(b, func, userp, hash);
}
void *qht_lookup(struct qht *ht, const void *userp, uint32_t hash)
{
return qht_lookup_custom(ht, userp, hash, ht->cmp);
}
/* call with head->lock held */
static bool qht_insert__locked(struct qht *ht, struct qht_map *map,
struct qht_bucket *head, void *p, uint32_t hash,
bool *needs_resize)
static void *qht_insert__locked(struct qht *ht, struct qht_map *map,
struct qht_bucket *head, void *p, uint32_t hash,
bool *needs_resize)
{
struct qht_bucket *b = head;
struct qht_bucket *prev = NULL;
@@ -515,8 +523,9 @@ static bool qht_insert__locked(struct qht *ht, struct qht_map *map,
do {
for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
if (b->pointers[i]) {
if (unlikely(b->pointers[i] == p)) {
return false;
if (unlikely(b->hashes[i] == hash &&
ht->cmp(b->pointers[i], p))) {
return b->pointers[i];
}
} else {
goto found;
@@ -545,7 +554,7 @@ static bool qht_insert__locked(struct qht *ht, struct qht_map *map,
atomic_set(&b->hashes[i], hash);
atomic_set(&b->pointers[i], p);
seqlock_write_end(&head->sequence);
return true;
return NULL;
}
static __attribute__((noinline)) void qht_grow_maybe(struct qht *ht)
@@ -569,25 +578,31 @@ static __attribute__((noinline)) void qht_grow_maybe(struct qht *ht)
qemu_mutex_unlock(&ht->lock);
}
bool qht_insert(struct qht *ht, void *p, uint32_t hash)
bool qht_insert(struct qht *ht, void *p, uint32_t hash, void **existing)
{
struct qht_bucket *b;
struct qht_map *map;
bool needs_resize = false;
bool ret;
void *prev;
/* NULL pointers are not supported */
qht_debug_assert(p);
b = qht_bucket_lock__no_stale(ht, hash, &map);
ret = qht_insert__locked(ht, map, b, p, hash, &needs_resize);
prev = qht_insert__locked(ht, map, b, p, hash, &needs_resize);
qht_bucket_debug__locked(b);
qemu_spin_unlock(&b->lock);
if (unlikely(needs_resize) && ht->mode & QHT_MODE_AUTO_RESIZE) {
qht_grow_maybe(ht);
}
return ret;
if (likely(prev == NULL)) {
return true;
}
if (existing) {
*existing = prev;
}
return false;
}
static inline bool qht_entry_is_last(struct qht_bucket *b, int pos)