Use interval trees for user-only vma mappings.

Assorted cleanups to page locking.
 -----BEGIN PGP SIGNATURE-----
 
 iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmOiku4dHHJpY2hhcmQu
 aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV9eawf+JeZRrzFIqQNCceOw
 mdBcPhxBf56ncRbIM0jGj8cMnnWgNX46gsXZ3bYRlRT9ZshWiTY/mcWlPEevBBmX
 yGtG48NfjPzHmo7WiodEFQJvID1+E6MfFO8GtieDZcJa0Md+2gbyFG5SlyAFaGgF
 ql4Hzj22KkssMcisIVL4Gkrxp2FB8hyO123JXcg4sRp9sgZ1amDm9WbZ/DpLjc7y
 C91Nur256JKrHEmTaCbJkgZRpOxhCV72MaR/pow/m7QpN0OcxIRPzhe+zJhumg5o
 6/9Y9isnkZ0UrN0yX360dOAE7z/qLS/zj4cDbo08pzo44K7RJZRVuTosG8XAAjtP
 veoeIA==
 =vaRH
 -----END PGP SIGNATURE-----

Merge tag 'pull-tcg-20221220' of https://gitlab.com/rth7680/qemu into staging

Use interval trees for user-only vma mappings.
Assorted cleanups to page locking.

# gpg: Signature made Wed 21 Dec 2022 05:00:30 GMT
# gpg:                using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F
# gpg:                issuer "richard.henderson@linaro.org"
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full]
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A  05C0 64DF 38E8 AF7E 215F

* tag 'pull-tcg-20221220' of https://gitlab.com/rth7680/qemu:
  accel/tcg: Restrict page_collection structure to system TB maintainance
  accel/tcg: Factor tb_invalidate_phys_range_fast() out
  accel/tcg: Rename tb_invalidate_phys_page_fast{,__locked}()
  accel/tcg: Remove trace events from trace-root.h
  accel/tcg: Restrict cpu_io_recompile() to system emulation
  accel/tcg: Move remainder of page locking to tb-maint.c
  accel/tcg: Move PageDesc tree into tb-maint.c for system
  accel/tcg: Use interval tree for user-only page tracking
  accel/tcg: Move page_{get,set}_flags to user-exec.c
  accel/tcg: Drop PAGE_RESERVED for CONFIG_BSD
  accel/tcg: Use interval tree for TARGET_PAGE_DATA_SIZE
  accel/tcg: Use interval tree for TBs in user-only mode
  accel/tcg: Rename page_flush_tb
  util: Add interval-tree.c

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Peter Maydell 2022-12-21 14:15:18 +00:00
commit 700ce3b1bb
15 changed files with 2662 additions and 1162 deletions

View File

@ -33,7 +33,7 @@
#include "qemu/atomic.h" #include "qemu/atomic.h"
#include "qemu/atomic128.h" #include "qemu/atomic128.h"
#include "exec/translate-all.h" #include "exec/translate-all.h"
#include "trace/trace-root.h" #include "trace.h"
#include "tb-hash.h" #include "tb-hash.h"
#include "internal.h" #include "internal.h"
#ifdef CONFIG_PLUGIN #ifdef CONFIG_PLUGIN
@ -1508,10 +1508,7 @@ static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size); trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) { if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
struct page_collection *pages tb_invalidate_phys_range_fast(ram_addr, size, retaddr);
= page_collection_lock(ram_addr, ram_addr + size);
tb_invalidate_phys_page_fast(pages, ram_addr, size, retaddr);
page_collection_unlock(pages);
} }
/* /*

View File

@ -23,83 +23,28 @@
#define assert_memory_lock() tcg_debug_assert(have_mmap_lock()) #define assert_memory_lock() tcg_debug_assert(have_mmap_lock())
#endif #endif
typedef struct PageDesc { #if defined(CONFIG_SOFTMMU) && defined(CONFIG_DEBUG_TCG)
/* list of TBs intersecting this ram page */
uintptr_t first_tb;
#ifdef CONFIG_USER_ONLY
unsigned long flags;
void *target_data;
#endif
#ifdef CONFIG_SOFTMMU
QemuSpin lock;
#endif
} PageDesc;
/* Size of the L2 (and L3, etc) page tables. */
#define V_L2_BITS 10
#define V_L2_SIZE (1 << V_L2_BITS)
/*
* L1 Mapping properties
*/
extern int v_l1_size;
extern int v_l1_shift;
extern int v_l2_levels;
/*
* The bottom level has pointers to PageDesc, and is indexed by
* anything from 4 to (V_L2_BITS + 3) bits, depending on target page size.
*/
#define V_L1_MIN_BITS 4
#define V_L1_MAX_BITS (V_L2_BITS + 3)
#define V_L1_MAX_SIZE (1 << V_L1_MAX_BITS)
extern void *l1_map[V_L1_MAX_SIZE];
PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc);
static inline PageDesc *page_find(tb_page_addr_t index)
{
return page_find_alloc(index, false);
}
/* list iterators for lists of tagged pointers in TranslationBlock */
#define TB_FOR_EACH_TAGGED(head, tb, n, field) \
for (n = (head) & 1, tb = (TranslationBlock *)((head) & ~1); \
tb; tb = (TranslationBlock *)tb->field[n], n = (uintptr_t)tb & 1, \
tb = (TranslationBlock *)((uintptr_t)tb & ~1))
#define PAGE_FOR_EACH_TB(pagedesc, tb, n) \
TB_FOR_EACH_TAGGED((pagedesc)->first_tb, tb, n, page_next)
#define TB_FOR_EACH_JMP(head_tb, tb, n) \
TB_FOR_EACH_TAGGED((head_tb)->jmp_list_head, tb, n, jmp_list_next)
/* In user-mode page locks aren't used; mmap_lock is enough */
#ifdef CONFIG_USER_ONLY
#define assert_page_locked(pd) tcg_debug_assert(have_mmap_lock())
static inline void page_lock(PageDesc *pd) { }
static inline void page_unlock(PageDesc *pd) { }
#else
#ifdef CONFIG_DEBUG_TCG
void do_assert_page_locked(const PageDesc *pd, const char *file, int line);
#define assert_page_locked(pd) do_assert_page_locked(pd, __FILE__, __LINE__)
#else
#define assert_page_locked(pd)
#endif
void page_lock(PageDesc *pd);
void page_unlock(PageDesc *pd);
#endif
#if !defined(CONFIG_USER_ONLY) && defined(CONFIG_DEBUG_TCG)
void assert_no_pages_locked(void); void assert_no_pages_locked(void);
#else #else
static inline void assert_no_pages_locked(void) { } static inline void assert_no_pages_locked(void) { }
#endif #endif
#ifdef CONFIG_USER_ONLY
static inline void page_table_config_init(void) { }
#else
void page_table_config_init(void);
#endif
#ifdef CONFIG_SOFTMMU
void tb_invalidate_phys_range_fast(ram_addr_t ram_addr,
unsigned size,
uintptr_t retaddr);
G_NORETURN void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr);
#endif /* CONFIG_SOFTMMU */
TranslationBlock *tb_gen_code(CPUState *cpu, target_ulong pc, TranslationBlock *tb_gen_code(CPUState *cpu, target_ulong pc,
target_ulong cs_base, uint32_t flags, target_ulong cs_base, uint32_t flags,
int cflags); int cflags);
G_NORETURN void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr);
void page_init(void); void page_init(void);
void tb_htable_init(void); void tb_htable_init(void);
void tb_reset_jump(TranslationBlock *tb, int n); void tb_reset_jump(TranslationBlock *tb, int n);

File diff suppressed because it is too large Load Diff

View File

@ -6,5 +6,9 @@ exec_tb(void *tb, uintptr_t pc) "tb:%p pc=0x%"PRIxPTR
exec_tb_nocache(void *tb, uintptr_t pc) "tb:%p pc=0x%"PRIxPTR exec_tb_nocache(void *tb, uintptr_t pc) "tb:%p pc=0x%"PRIxPTR
exec_tb_exit(void *last_tb, unsigned int flags) "tb:%p flags=0x%x" exec_tb_exit(void *last_tb, unsigned int flags) "tb:%p flags=0x%x"
# cputlb.c
memory_notdirty_write_access(uint64_t vaddr, uint64_t ram_addr, unsigned size) "0x%" PRIx64 " ram_addr 0x%" PRIx64 " size %u"
memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64
# translate-all.c # translate-all.c
translate_block(void *tb, uintptr_t pc, const void *tb_code) "tb:%p, pc:0x%"PRIxPTR", tb_code:%p" translate_block(void *tb, uintptr_t pc, const void *tb_code) "tb:%p, pc:0x%"PRIxPTR", tb_code:%p"

View File

@ -63,105 +63,13 @@
#include "tb-context.h" #include "tb-context.h"
#include "internal.h" #include "internal.h"
/* make various TB consistency checks */
/**
* struct page_entry - page descriptor entry
* @pd: pointer to the &struct PageDesc of the page this entry represents
* @index: page index of the page
* @locked: whether the page is locked
*
* This struct helps us keep track of the locked state of a page, without
* bloating &struct PageDesc.
*
* A page lock protects accesses to all fields of &struct PageDesc.
*
* See also: &struct page_collection.
*/
struct page_entry {
PageDesc *pd;
tb_page_addr_t index;
bool locked;
};
/**
* struct page_collection - tracks a set of pages (i.e. &struct page_entry's)
* @tree: Binary search tree (BST) of the pages, with key == page index
* @max: Pointer to the page in @tree with the highest page index
*
* To avoid deadlock we lock pages in ascending order of page index.
* When operating on a set of pages, we need to keep track of them so that
* we can lock them in order and also unlock them later. For this we collect
* pages (i.e. &struct page_entry's) in a binary search @tree. Given that the
* @tree implementation we use does not provide an O(1) operation to obtain the
* highest-ranked element, we use @max to keep track of the inserted page
* with the highest index. This is valuable because if a page is not in
* the tree and its index is higher than @max's, then we can lock it
* without breaking the locking order rule.
*
* Note on naming: 'struct page_set' would be shorter, but we already have a few
* page_set_*() helpers, so page_collection is used instead to avoid confusion.
*
* See also: page_collection_lock().
*/
struct page_collection {
GTree *tree;
struct page_entry *max;
};
/*
* In system mode we want L1_MAP to be based on ram offsets,
* while in user mode we want it to be based on virtual addresses.
*
* TODO: For user mode, see the caveat re host vs guest virtual
* address spaces near GUEST_ADDR_MAX.
*/
#if !defined(CONFIG_USER_ONLY)
#if HOST_LONG_BITS < TARGET_PHYS_ADDR_SPACE_BITS
# define L1_MAP_ADDR_SPACE_BITS HOST_LONG_BITS
#else
# define L1_MAP_ADDR_SPACE_BITS TARGET_PHYS_ADDR_SPACE_BITS
#endif
#else
# define L1_MAP_ADDR_SPACE_BITS MIN(HOST_LONG_BITS, TARGET_ABI_BITS)
#endif
/* Make sure all possible CPU event bits fit in tb->trace_vcpu_dstate */ /* Make sure all possible CPU event bits fit in tb->trace_vcpu_dstate */
QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS > QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS >
sizeof_field(TranslationBlock, trace_vcpu_dstate) sizeof_field(TranslationBlock, trace_vcpu_dstate)
* BITS_PER_BYTE); * BITS_PER_BYTE);
/*
* L1 Mapping properties
*/
int v_l1_size;
int v_l1_shift;
int v_l2_levels;
void *l1_map[V_L1_MAX_SIZE];
TBContext tb_ctx; TBContext tb_ctx;
static void page_table_config_init(void)
{
uint32_t v_l1_bits;
assert(TARGET_PAGE_BITS);
/* The bits remaining after N lower levels of page tables. */
v_l1_bits = (L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % V_L2_BITS;
if (v_l1_bits < V_L1_MIN_BITS) {
v_l1_bits += V_L2_BITS;
}
v_l1_size = 1 << v_l1_bits;
v_l1_shift = L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS - v_l1_bits;
v_l2_levels = v_l1_shift / V_L2_BITS - 1;
assert(v_l1_bits <= V_L1_MAX_BITS);
assert(v_l1_shift % V_L2_BITS == 0);
assert(v_l2_levels >= 0);
}
/* Encode VAL as a signed leb128 sequence at P. /* Encode VAL as a signed leb128 sequence at P.
Return P incremented past the encoded value. */ Return P incremented past the encoded value. */
static uint8_t *encode_sleb128(uint8_t *p, target_long val) static uint8_t *encode_sleb128(uint8_t *p, target_long val)
@ -354,394 +262,8 @@ void page_init(void)
{ {
page_size_init(); page_size_init();
page_table_config_init(); page_table_config_init();
#if defined(CONFIG_BSD) && defined(CONFIG_USER_ONLY)
{
#ifdef HAVE_KINFO_GETVMMAP
struct kinfo_vmentry *freep;
int i, cnt;
freep = kinfo_getvmmap(getpid(), &cnt);
if (freep) {
mmap_lock();
for (i = 0; i < cnt; i++) {
unsigned long startaddr, endaddr;
startaddr = freep[i].kve_start;
endaddr = freep[i].kve_end;
if (h2g_valid(startaddr)) {
startaddr = h2g(startaddr) & TARGET_PAGE_MASK;
if (h2g_valid(endaddr)) {
endaddr = h2g(endaddr);
page_set_flags(startaddr, endaddr, PAGE_RESERVED);
} else {
#if TARGET_ABI_BITS <= L1_MAP_ADDR_SPACE_BITS
endaddr = ~0ul;
page_set_flags(startaddr, endaddr, PAGE_RESERVED);
#endif
}
}
}
free(freep);
mmap_unlock();
}
#else
FILE *f;
last_brk = (unsigned long)sbrk(0);
f = fopen("/compat/linux/proc/self/maps", "r");
if (f) {
mmap_lock();
do {
unsigned long startaddr, endaddr;
int n;
n = fscanf(f, "%lx-%lx %*[^\n]\n", &startaddr, &endaddr);
if (n == 2 && h2g_valid(startaddr)) {
startaddr = h2g(startaddr) & TARGET_PAGE_MASK;
if (h2g_valid(endaddr)) {
endaddr = h2g(endaddr);
} else {
endaddr = ~0ul;
}
page_set_flags(startaddr, endaddr, PAGE_RESERVED);
}
} while (!feof(f));
fclose(f);
mmap_unlock();
}
#endif
}
#endif
} }
PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc)
{
PageDesc *pd;
void **lp;
int i;
/* Level 1. Always allocated. */
lp = l1_map + ((index >> v_l1_shift) & (v_l1_size - 1));
/* Level 2..N-1. */
for (i = v_l2_levels; i > 0; i--) {
void **p = qatomic_rcu_read(lp);
if (p == NULL) {
void *existing;
if (!alloc) {
return NULL;
}
p = g_new0(void *, V_L2_SIZE);
existing = qatomic_cmpxchg(lp, NULL, p);
if (unlikely(existing)) {
g_free(p);
p = existing;
}
}
lp = p + ((index >> (i * V_L2_BITS)) & (V_L2_SIZE - 1));
}
pd = qatomic_rcu_read(lp);
if (pd == NULL) {
void *existing;
if (!alloc) {
return NULL;
}
pd = g_new0(PageDesc, V_L2_SIZE);
#ifndef CONFIG_USER_ONLY
{
int i;
for (i = 0; i < V_L2_SIZE; i++) {
qemu_spin_init(&pd[i].lock);
}
}
#endif
existing = qatomic_cmpxchg(lp, NULL, pd);
if (unlikely(existing)) {
#ifndef CONFIG_USER_ONLY
{
int i;
for (i = 0; i < V_L2_SIZE; i++) {
qemu_spin_destroy(&pd[i].lock);
}
}
#endif
g_free(pd);
pd = existing;
}
}
return pd + (index & (V_L2_SIZE - 1));
}
/* In user-mode page locks aren't used; mmap_lock is enough */
#ifdef CONFIG_USER_ONLY
struct page_collection *
page_collection_lock(tb_page_addr_t start, tb_page_addr_t end)
{
return NULL;
}
void page_collection_unlock(struct page_collection *set)
{ }
#else /* !CONFIG_USER_ONLY */
#ifdef CONFIG_DEBUG_TCG
static __thread GHashTable *ht_pages_locked_debug;
static void ht_pages_locked_debug_init(void)
{
if (ht_pages_locked_debug) {
return;
}
ht_pages_locked_debug = g_hash_table_new(NULL, NULL);
}
static bool page_is_locked(const PageDesc *pd)
{
PageDesc *found;
ht_pages_locked_debug_init();
found = g_hash_table_lookup(ht_pages_locked_debug, pd);
return !!found;
}
static void page_lock__debug(PageDesc *pd)
{
ht_pages_locked_debug_init();
g_assert(!page_is_locked(pd));
g_hash_table_insert(ht_pages_locked_debug, pd, pd);
}
static void page_unlock__debug(const PageDesc *pd)
{
bool removed;
ht_pages_locked_debug_init();
g_assert(page_is_locked(pd));
removed = g_hash_table_remove(ht_pages_locked_debug, pd);
g_assert(removed);
}
void do_assert_page_locked(const PageDesc *pd, const char *file, int line)
{
if (unlikely(!page_is_locked(pd))) {
error_report("assert_page_lock: PageDesc %p not locked @ %s:%d",
pd, file, line);
abort();
}
}
void assert_no_pages_locked(void)
{
ht_pages_locked_debug_init();
g_assert(g_hash_table_size(ht_pages_locked_debug) == 0);
}
#else /* !CONFIG_DEBUG_TCG */
static inline void page_lock__debug(const PageDesc *pd) { }
static inline void page_unlock__debug(const PageDesc *pd) { }
#endif /* CONFIG_DEBUG_TCG */
void page_lock(PageDesc *pd)
{
page_lock__debug(pd);
qemu_spin_lock(&pd->lock);
}
void page_unlock(PageDesc *pd)
{
qemu_spin_unlock(&pd->lock);
page_unlock__debug(pd);
}
static inline struct page_entry *
page_entry_new(PageDesc *pd, tb_page_addr_t index)
{
struct page_entry *pe = g_malloc(sizeof(*pe));
pe->index = index;
pe->pd = pd;
pe->locked = false;
return pe;
}
static void page_entry_destroy(gpointer p)
{
struct page_entry *pe = p;
g_assert(pe->locked);
page_unlock(pe->pd);
g_free(pe);
}
/* returns false on success */
static bool page_entry_trylock(struct page_entry *pe)
{
bool busy;
busy = qemu_spin_trylock(&pe->pd->lock);
if (!busy) {
g_assert(!pe->locked);
pe->locked = true;
page_lock__debug(pe->pd);
}
return busy;
}
static void do_page_entry_lock(struct page_entry *pe)
{
page_lock(pe->pd);
g_assert(!pe->locked);
pe->locked = true;
}
static gboolean page_entry_lock(gpointer key, gpointer value, gpointer data)
{
struct page_entry *pe = value;
do_page_entry_lock(pe);
return FALSE;
}
static gboolean page_entry_unlock(gpointer key, gpointer value, gpointer data)
{
struct page_entry *pe = value;
if (pe->locked) {
pe->locked = false;
page_unlock(pe->pd);
}
return FALSE;
}
/*
* Trylock a page, and if successful, add the page to a collection.
* Returns true ("busy") if the page could not be locked; false otherwise.
*/
static bool page_trylock_add(struct page_collection *set, tb_page_addr_t addr)
{
tb_page_addr_t index = addr >> TARGET_PAGE_BITS;
struct page_entry *pe;
PageDesc *pd;
pe = g_tree_lookup(set->tree, &index);
if (pe) {
return false;
}
pd = page_find(index);
if (pd == NULL) {
return false;
}
pe = page_entry_new(pd, index);
g_tree_insert(set->tree, &pe->index, pe);
/*
* If this is either (1) the first insertion or (2) a page whose index
* is higher than any other so far, just lock the page and move on.
*/
if (set->max == NULL || pe->index > set->max->index) {
set->max = pe;
do_page_entry_lock(pe);
return false;
}
/*
* Try to acquire out-of-order lock; if busy, return busy so that we acquire
* locks in order.
*/
return page_entry_trylock(pe);
}
static gint tb_page_addr_cmp(gconstpointer ap, gconstpointer bp, gpointer udata)
{
tb_page_addr_t a = *(const tb_page_addr_t *)ap;
tb_page_addr_t b = *(const tb_page_addr_t *)bp;
if (a == b) {
return 0;
} else if (a < b) {
return -1;
}
return 1;
}
/*
* Lock a range of pages ([@start,@end[) as well as the pages of all
* intersecting TBs.
* Locking order: acquire locks in ascending order of page index.
*/
struct page_collection *
page_collection_lock(tb_page_addr_t start, tb_page_addr_t end)
{
struct page_collection *set = g_malloc(sizeof(*set));
tb_page_addr_t index;
PageDesc *pd;
start >>= TARGET_PAGE_BITS;
end >>= TARGET_PAGE_BITS;
g_assert(start <= end);
set->tree = g_tree_new_full(tb_page_addr_cmp, NULL, NULL,
page_entry_destroy);
set->max = NULL;
assert_no_pages_locked();
retry:
g_tree_foreach(set->tree, page_entry_lock, NULL);
for (index = start; index <= end; index++) {
TranslationBlock *tb;
int n;
pd = page_find(index);
if (pd == NULL) {
continue;
}
if (page_trylock_add(set, index << TARGET_PAGE_BITS)) {
g_tree_foreach(set->tree, page_entry_unlock, NULL);
goto retry;
}
assert_page_locked(pd);
PAGE_FOR_EACH_TB(pd, tb, n) {
if (page_trylock_add(set, tb_page_addr0(tb)) ||
(tb_page_addr1(tb) != -1 &&
page_trylock_add(set, tb_page_addr1(tb)))) {
/* drop all locks, and reacquire in order */
g_tree_foreach(set->tree, page_entry_unlock, NULL);
goto retry;
}
}
}
return set;
}
void page_collection_unlock(struct page_collection *set)
{
/* entries are unlocked and freed via page_entry_destroy */
g_tree_destroy(set->tree);
g_free(set);
}
#endif /* !CONFIG_USER_ONLY */
/* /*
* Isolate the portion of code gen which can setjmp/longjmp. * Isolate the portion of code gen which can setjmp/longjmp.
* Return the size of the generated code, or negative on error. * Return the size of the generated code, or negative on error.
@ -1235,339 +757,6 @@ void cpu_interrupt(CPUState *cpu, int mask)
qatomic_set(&cpu_neg(cpu)->icount_decr.u16.high, -1); qatomic_set(&cpu_neg(cpu)->icount_decr.u16.high, -1);
} }
/*
* Walks guest process memory "regions" one by one
* and calls callback function 'fn' for each region.
*/
struct walk_memory_regions_data {
walk_memory_regions_fn fn;
void *priv;
target_ulong start;
int prot;
};
static int walk_memory_regions_end(struct walk_memory_regions_data *data,
target_ulong end, int new_prot)
{
if (data->start != -1u) {
int rc = data->fn(data->priv, data->start, end, data->prot);
if (rc != 0) {
return rc;
}
}
data->start = (new_prot ? end : -1u);
data->prot = new_prot;
return 0;
}
static int walk_memory_regions_1(struct walk_memory_regions_data *data,
target_ulong base, int level, void **lp)
{
target_ulong pa;
int i, rc;
if (*lp == NULL) {
return walk_memory_regions_end(data, base, 0);
}
if (level == 0) {
PageDesc *pd = *lp;
for (i = 0; i < V_L2_SIZE; ++i) {
int prot = pd[i].flags;
pa = base | (i << TARGET_PAGE_BITS);
if (prot != data->prot) {
rc = walk_memory_regions_end(data, pa, prot);
if (rc != 0) {
return rc;
}
}
}
} else {
void **pp = *lp;
for (i = 0; i < V_L2_SIZE; ++i) {
pa = base | ((target_ulong)i <<
(TARGET_PAGE_BITS + V_L2_BITS * level));
rc = walk_memory_regions_1(data, pa, level - 1, pp + i);
if (rc != 0) {
return rc;
}
}
}
return 0;
}
int walk_memory_regions(void *priv, walk_memory_regions_fn fn)
{
struct walk_memory_regions_data data;
uintptr_t i, l1_sz = v_l1_size;
data.fn = fn;
data.priv = priv;
data.start = -1u;
data.prot = 0;
for (i = 0; i < l1_sz; i++) {
target_ulong base = i << (v_l1_shift + TARGET_PAGE_BITS);
int rc = walk_memory_regions_1(&data, base, v_l2_levels, l1_map + i);
if (rc != 0) {
return rc;
}
}
return walk_memory_regions_end(&data, 0, 0);
}
static int dump_region(void *priv, target_ulong start,
target_ulong end, unsigned long prot)
{
FILE *f = (FILE *)priv;
(void) fprintf(f, TARGET_FMT_lx"-"TARGET_FMT_lx
" "TARGET_FMT_lx" %c%c%c\n",
start, end, end - start,
((prot & PAGE_READ) ? 'r' : '-'),
((prot & PAGE_WRITE) ? 'w' : '-'),
((prot & PAGE_EXEC) ? 'x' : '-'));
return 0;
}
/* dump memory mappings */
void page_dump(FILE *f)
{
const int length = sizeof(target_ulong) * 2;
(void) fprintf(f, "%-*s %-*s %-*s %s\n",
length, "start", length, "end", length, "size", "prot");
walk_memory_regions(f, dump_region);
}
int page_get_flags(target_ulong address)
{
PageDesc *p;
p = page_find(address >> TARGET_PAGE_BITS);
if (!p) {
return 0;
}
return p->flags;
}
/*
* Allow the target to decide if PAGE_TARGET_[12] may be reset.
* By default, they are not kept.
*/
#ifndef PAGE_TARGET_STICKY
#define PAGE_TARGET_STICKY 0
#endif
#define PAGE_STICKY (PAGE_ANON | PAGE_PASSTHROUGH | PAGE_TARGET_STICKY)
/* Modify the flags of a page and invalidate the code if necessary.
The flag PAGE_WRITE_ORG is positioned automatically depending
on PAGE_WRITE. The mmap_lock should already be held. */
void page_set_flags(target_ulong start, target_ulong end, int flags)
{
target_ulong addr, len;
bool reset, inval_tb = false;
/* This function should never be called with addresses outside the
guest address space. If this assert fires, it probably indicates
a missing call to h2g_valid. */
assert(end - 1 <= GUEST_ADDR_MAX);
assert(start < end);
/* Only set PAGE_ANON with new mappings. */
assert(!(flags & PAGE_ANON) || (flags & PAGE_RESET));
assert_memory_lock();
start = start & TARGET_PAGE_MASK;
end = TARGET_PAGE_ALIGN(end);
if (flags & PAGE_WRITE) {
flags |= PAGE_WRITE_ORG;
}
reset = !(flags & PAGE_VALID) || (flags & PAGE_RESET);
if (reset) {
page_reset_target_data(start, end);
}
flags &= ~PAGE_RESET;
for (addr = start, len = end - start;
len != 0;
len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, true);
/*
* If the page was executable, but is reset, or is no longer
* executable, or has become writable, then invalidate any code.
*/
if ((p->flags & PAGE_EXEC)
&& (reset ||
!(flags & PAGE_EXEC) ||
(flags & ~p->flags & PAGE_WRITE))) {
inval_tb = true;
}
/* Using mprotect on a page does not change sticky bits. */
p->flags = (reset ? 0 : p->flags & PAGE_STICKY) | flags;
}
if (inval_tb) {
tb_invalidate_phys_range(start, end);
}
}
int page_check_range(target_ulong start, target_ulong len, int flags)
{
PageDesc *p;
target_ulong end;
target_ulong addr;
/* This function should never be called with addresses outside the
guest address space. If this assert fires, it probably indicates
a missing call to h2g_valid. */
if (TARGET_ABI_BITS > L1_MAP_ADDR_SPACE_BITS) {
assert(start < ((target_ulong)1 << L1_MAP_ADDR_SPACE_BITS));
}
if (len == 0) {
return 0;
}
if (start + len - 1 < start) {
/* We've wrapped around. */
return -1;
}
/* must do before we loose bits in the next step */
end = TARGET_PAGE_ALIGN(start + len);
start = start & TARGET_PAGE_MASK;
for (addr = start, len = end - start;
len != 0;
len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
p = page_find(addr >> TARGET_PAGE_BITS);
if (!p) {
return -1;
}
if (!(p->flags & PAGE_VALID)) {
return -1;
}
if ((flags & PAGE_READ) && !(p->flags & PAGE_READ)) {
return -1;
}
if (flags & PAGE_WRITE) {
if (!(p->flags & PAGE_WRITE_ORG)) {
return -1;
}
/* unprotect the page if it was put read-only because it
contains translated code */
if (!(p->flags & PAGE_WRITE)) {
if (!page_unprotect(addr, 0)) {
return -1;
}
}
}
}
return 0;
}
void page_protect(tb_page_addr_t page_addr)
{
target_ulong addr;
PageDesc *p;
int prot;
p = page_find(page_addr >> TARGET_PAGE_BITS);
if (p && (p->flags & PAGE_WRITE)) {
/*
* Force the host page as non writable (writes will have a page fault +
* mprotect overhead).
*/
page_addr &= qemu_host_page_mask;
prot = 0;
for (addr = page_addr; addr < page_addr + qemu_host_page_size;
addr += TARGET_PAGE_SIZE) {
p = page_find(addr >> TARGET_PAGE_BITS);
if (!p) {
continue;
}
prot |= p->flags;
p->flags &= ~PAGE_WRITE;
}
mprotect(g2h_untagged(page_addr), qemu_host_page_size,
(prot & PAGE_BITS) & ~PAGE_WRITE);
}
}
/* called from signal handler: invalidate the code and unprotect the
* page. Return 0 if the fault was not handled, 1 if it was handled,
* and 2 if it was handled but the caller must cause the TB to be
* immediately exited. (We can only return 2 if the 'pc' argument is
* non-zero.)
*/
int page_unprotect(target_ulong address, uintptr_t pc)
{
unsigned int prot;
bool current_tb_invalidated;
PageDesc *p;
target_ulong host_start, host_end, addr;
/* Technically this isn't safe inside a signal handler. However we
know this only ever happens in a synchronous SEGV handler, so in
practice it seems to be ok. */
mmap_lock();
p = page_find(address >> TARGET_PAGE_BITS);
if (!p) {
mmap_unlock();
return 0;
}
/* if the page was really writable, then we change its
protection back to writable */
if (p->flags & PAGE_WRITE_ORG) {
current_tb_invalidated = false;
if (p->flags & PAGE_WRITE) {
/* If the page is actually marked WRITE then assume this is because
* this thread raced with another one which got here first and
* set the page to PAGE_WRITE and did the TB invalidate for us.
*/
#ifdef TARGET_HAS_PRECISE_SMC
TranslationBlock *current_tb = tcg_tb_lookup(pc);
if (current_tb) {
current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID;
}
#endif
} else {
host_start = address & qemu_host_page_mask;
host_end = host_start + qemu_host_page_size;
prot = 0;
for (addr = host_start; addr < host_end; addr += TARGET_PAGE_SIZE) {
p = page_find(addr >> TARGET_PAGE_BITS);
p->flags |= PAGE_WRITE;
prot |= p->flags;
/* and since the content will be modified, we must invalidate
the corresponding translated code. */
current_tb_invalidated |=
tb_invalidate_phys_page_unwind(addr, pc);
}
mprotect((void *)g2h_untagged(host_start), qemu_host_page_size,
prot & PAGE_BITS);
}
mmap_unlock();
/* If current TB was invalidated return to main loop */
return current_tb_invalidated ? 2 : 1;
}
mmap_unlock();
return 0;
}
#endif /* CONFIG_USER_ONLY */ #endif /* CONFIG_USER_ONLY */
/* /*

View File

@ -135,6 +135,565 @@ bool handle_sigsegv_accerr_write(CPUState *cpu, sigset_t *old_set,
} }
} }
typedef struct PageFlagsNode {
IntervalTreeNode itree;
int flags;
} PageFlagsNode;
static IntervalTreeRoot pageflags_root;
static PageFlagsNode *pageflags_find(target_ulong start, target_long last)
{
IntervalTreeNode *n;
n = interval_tree_iter_first(&pageflags_root, start, last);
return n ? container_of(n, PageFlagsNode, itree) : NULL;
}
static PageFlagsNode *pageflags_next(PageFlagsNode *p, target_ulong start,
target_long last)
{
IntervalTreeNode *n;
n = interval_tree_iter_next(&p->itree, start, last);
return n ? container_of(n, PageFlagsNode, itree) : NULL;
}
int walk_memory_regions(void *priv, walk_memory_regions_fn fn)
{
IntervalTreeNode *n;
int rc = 0;
mmap_lock();
for (n = interval_tree_iter_first(&pageflags_root, 0, -1);
n != NULL;
n = interval_tree_iter_next(n, 0, -1)) {
PageFlagsNode *p = container_of(n, PageFlagsNode, itree);
rc = fn(priv, n->start, n->last + 1, p->flags);
if (rc != 0) {
break;
}
}
mmap_unlock();
return rc;
}
static int dump_region(void *priv, target_ulong start,
target_ulong end, unsigned long prot)
{
FILE *f = (FILE *)priv;
fprintf(f, TARGET_FMT_lx"-"TARGET_FMT_lx" "TARGET_FMT_lx" %c%c%c\n",
start, end, end - start,
((prot & PAGE_READ) ? 'r' : '-'),
((prot & PAGE_WRITE) ? 'w' : '-'),
((prot & PAGE_EXEC) ? 'x' : '-'));
return 0;
}
/* dump memory mappings */
void page_dump(FILE *f)
{
const int length = sizeof(target_ulong) * 2;
fprintf(f, "%-*s %-*s %-*s %s\n",
length, "start", length, "end", length, "size", "prot");
walk_memory_regions(f, dump_region);
}
int page_get_flags(target_ulong address)
{
PageFlagsNode *p = pageflags_find(address, address);
/*
* See util/interval-tree.c re lockless lookups: no false positives but
* there are false negatives. If we find nothing, retry with the mmap
* lock acquired.
*/
if (p) {
return p->flags;
}
if (have_mmap_lock()) {
return 0;
}
mmap_lock();
p = pageflags_find(address, address);
mmap_unlock();
return p ? p->flags : 0;
}
/* A subroutine of page_set_flags: insert a new node for [start,last]. */
static void pageflags_create(target_ulong start, target_ulong last, int flags)
{
PageFlagsNode *p = g_new(PageFlagsNode, 1);
p->itree.start = start;
p->itree.last = last;
p->flags = flags;
interval_tree_insert(&p->itree, &pageflags_root);
}
/* A subroutine of page_set_flags: remove everything in [start,last]. */
static bool pageflags_unset(target_ulong start, target_ulong last)
{
bool inval_tb = false;
while (true) {
PageFlagsNode *p = pageflags_find(start, last);
target_ulong p_last;
if (!p) {
break;
}
if (p->flags & PAGE_EXEC) {
inval_tb = true;
}
interval_tree_remove(&p->itree, &pageflags_root);
p_last = p->itree.last;
if (p->itree.start < start) {
/* Truncate the node from the end, or split out the middle. */
p->itree.last = start - 1;
interval_tree_insert(&p->itree, &pageflags_root);
if (last < p_last) {
pageflags_create(last + 1, p_last, p->flags);
break;
}
} else if (p_last <= last) {
/* Range completely covers node -- remove it. */
g_free(p);
} else {
/* Truncate the node from the start. */
p->itree.start = last + 1;
interval_tree_insert(&p->itree, &pageflags_root);
break;
}
}
return inval_tb;
}
/*
* A subroutine of page_set_flags: nothing overlaps [start,last],
* but check adjacent mappings and maybe merge into a single range.
*/
static void pageflags_create_merge(target_ulong start, target_ulong last,
int flags)
{
PageFlagsNode *next = NULL, *prev = NULL;
if (start > 0) {
prev = pageflags_find(start - 1, start - 1);
if (prev) {
if (prev->flags == flags) {
interval_tree_remove(&prev->itree, &pageflags_root);
} else {
prev = NULL;
}
}
}
if (last + 1 != 0) {
next = pageflags_find(last + 1, last + 1);
if (next) {
if (next->flags == flags) {
interval_tree_remove(&next->itree, &pageflags_root);
} else {
next = NULL;
}
}
}
if (prev) {
if (next) {
prev->itree.last = next->itree.last;
g_free(next);
} else {
prev->itree.last = last;
}
interval_tree_insert(&prev->itree, &pageflags_root);
} else if (next) {
next->itree.start = start;
interval_tree_insert(&next->itree, &pageflags_root);
} else {
pageflags_create(start, last, flags);
}
}
/*
* Allow the target to decide if PAGE_TARGET_[12] may be reset.
* By default, they are not kept.
*/
#ifndef PAGE_TARGET_STICKY
#define PAGE_TARGET_STICKY 0
#endif
#define PAGE_STICKY (PAGE_ANON | PAGE_PASSTHROUGH | PAGE_TARGET_STICKY)
/* A subroutine of page_set_flags: add flags to [start,last]. */
static bool pageflags_set_clear(target_ulong start, target_ulong last,
int set_flags, int clear_flags)
{
PageFlagsNode *p;
target_ulong p_start, p_last;
int p_flags, merge_flags;
bool inval_tb = false;
restart:
p = pageflags_find(start, last);
if (!p) {
if (set_flags) {
pageflags_create_merge(start, last, set_flags);
}
goto done;
}
p_start = p->itree.start;
p_last = p->itree.last;
p_flags = p->flags;
/* Using mprotect on a page does not change sticky bits. */
merge_flags = (p_flags & ~clear_flags) | set_flags;
/*
* Need to flush if an overlapping executable region
* removes exec, or adds write.
*/
if ((p_flags & PAGE_EXEC)
&& (!(merge_flags & PAGE_EXEC)
|| (merge_flags & ~p_flags & PAGE_WRITE))) {
inval_tb = true;
}
/*
* If there is an exact range match, update and return without
* attempting to merge with adjacent regions.
*/
if (start == p_start && last == p_last) {
if (merge_flags) {
p->flags = merge_flags;
} else {
interval_tree_remove(&p->itree, &pageflags_root);
g_free(p);
}
goto done;
}
/*
* If sticky bits affect the original mapping, then we must be more
* careful about the existing intervals and the separate flags.
*/
if (set_flags != merge_flags) {
if (p_start < start) {
interval_tree_remove(&p->itree, &pageflags_root);
p->itree.last = start - 1;
interval_tree_insert(&p->itree, &pageflags_root);
if (last < p_last) {
if (merge_flags) {
pageflags_create(start, last, merge_flags);
}
pageflags_create(last + 1, p_last, p_flags);
} else {
if (merge_flags) {
pageflags_create(start, p_last, merge_flags);
}
if (p_last < last) {
start = p_last + 1;
goto restart;
}
}
} else {
if (start < p_start && set_flags) {
pageflags_create(start, p_start - 1, set_flags);
}
if (last < p_last) {
interval_tree_remove(&p->itree, &pageflags_root);
p->itree.start = last + 1;
interval_tree_insert(&p->itree, &pageflags_root);
if (merge_flags) {
pageflags_create(start, last, merge_flags);
}
} else {
if (merge_flags) {
p->flags = merge_flags;
} else {
interval_tree_remove(&p->itree, &pageflags_root);
g_free(p);
}
if (p_last < last) {
start = p_last + 1;
goto restart;
}
}
}
goto done;
}
/* If flags are not changing for this range, incorporate it. */
if (set_flags == p_flags) {
if (start < p_start) {
interval_tree_remove(&p->itree, &pageflags_root);
p->itree.start = start;
interval_tree_insert(&p->itree, &pageflags_root);
}
if (p_last < last) {
start = p_last + 1;
goto restart;
}
goto done;
}
/* Maybe split out head and/or tail ranges with the original flags. */
interval_tree_remove(&p->itree, &pageflags_root);
if (p_start < start) {
p->itree.last = start - 1;
interval_tree_insert(&p->itree, &pageflags_root);
if (p_last < last) {
goto restart;
}
if (last < p_last) {
pageflags_create(last + 1, p_last, p_flags);
}
} else if (last < p_last) {
p->itree.start = last + 1;
interval_tree_insert(&p->itree, &pageflags_root);
} else {
g_free(p);
goto restart;
}
if (set_flags) {
pageflags_create(start, last, set_flags);
}
done:
return inval_tb;
}
/*
* Modify the flags of a page and invalidate the code if necessary.
* The flag PAGE_WRITE_ORG is positioned automatically depending
* on PAGE_WRITE. The mmap_lock should already be held.
*/
void page_set_flags(target_ulong start, target_ulong end, int flags)
{
target_ulong last;
bool reset = false;
bool inval_tb = false;
/* This function should never be called with addresses outside the
guest address space. If this assert fires, it probably indicates
a missing call to h2g_valid. */
assert(start < end);
assert(end - 1 <= GUEST_ADDR_MAX);
/* Only set PAGE_ANON with new mappings. */
assert(!(flags & PAGE_ANON) || (flags & PAGE_RESET));
assert_memory_lock();
start = start & TARGET_PAGE_MASK;
end = TARGET_PAGE_ALIGN(end);
last = end - 1;
if (!(flags & PAGE_VALID)) {
flags = 0;
} else {
reset = flags & PAGE_RESET;
flags &= ~PAGE_RESET;
if (flags & PAGE_WRITE) {
flags |= PAGE_WRITE_ORG;
}
}
if (!flags || reset) {
page_reset_target_data(start, end);
inval_tb |= pageflags_unset(start, last);
}
if (flags) {
inval_tb |= pageflags_set_clear(start, last, flags,
~(reset ? 0 : PAGE_STICKY));
}
if (inval_tb) {
tb_invalidate_phys_range(start, end);
}
}
int page_check_range(target_ulong start, target_ulong len, int flags)
{
target_ulong last;
if (len == 0) {
return 0; /* trivial length */
}
last = start + len - 1;
if (last < start) {
return -1; /* wrap around */
}
while (true) {
PageFlagsNode *p = pageflags_find(start, last);
int missing;
if (!p) {
return -1; /* entire region invalid */
}
if (start < p->itree.start) {
return -1; /* initial bytes invalid */
}
missing = flags & ~p->flags;
if (missing & PAGE_READ) {
return -1; /* page not readable */
}
if (missing & PAGE_WRITE) {
if (!(p->flags & PAGE_WRITE_ORG)) {
return -1; /* page not writable */
}
/* Asking about writable, but has been protected: undo. */
if (!page_unprotect(start, 0)) {
return -1;
}
/* TODO: page_unprotect should take a range, not a single page. */
if (last - start < TARGET_PAGE_SIZE) {
return 0; /* ok */
}
start += TARGET_PAGE_SIZE;
continue;
}
if (last <= p->itree.last) {
return 0; /* ok */
}
start = p->itree.last + 1;
}
}
void page_protect(tb_page_addr_t address)
{
PageFlagsNode *p;
target_ulong start, last;
int prot;
assert_memory_lock();
if (qemu_host_page_size <= TARGET_PAGE_SIZE) {
start = address & TARGET_PAGE_MASK;
last = start + TARGET_PAGE_SIZE - 1;
} else {
start = address & qemu_host_page_mask;
last = start + qemu_host_page_size - 1;
}
p = pageflags_find(start, last);
if (!p) {
return;
}
prot = p->flags;
if (unlikely(p->itree.last < last)) {
/* More than one protection region covers the one host page. */
assert(TARGET_PAGE_SIZE < qemu_host_page_size);
while ((p = pageflags_next(p, start, last)) != NULL) {
prot |= p->flags;
}
}
if (prot & PAGE_WRITE) {
pageflags_set_clear(start, last, 0, PAGE_WRITE);
mprotect(g2h_untagged(start), qemu_host_page_size,
prot & (PAGE_READ | PAGE_EXEC) ? PROT_READ : PROT_NONE);
}
}
/*
* Called from signal handler: invalidate the code and unprotect the
* page. Return 0 if the fault was not handled, 1 if it was handled,
* and 2 if it was handled but the caller must cause the TB to be
* immediately exited. (We can only return 2 if the 'pc' argument is
* non-zero.)
*/
int page_unprotect(target_ulong address, uintptr_t pc)
{
PageFlagsNode *p;
bool current_tb_invalidated;
/*
* Technically this isn't safe inside a signal handler. However we
* know this only ever happens in a synchronous SEGV handler, so in
* practice it seems to be ok.
*/
mmap_lock();
p = pageflags_find(address, address);
/* If this address was not really writable, nothing to do. */
if (!p || !(p->flags & PAGE_WRITE_ORG)) {
mmap_unlock();
return 0;
}
current_tb_invalidated = false;
if (p->flags & PAGE_WRITE) {
/*
* If the page is actually marked WRITE then assume this is because
* this thread raced with another one which got here first and
* set the page to PAGE_WRITE and did the TB invalidate for us.
*/
#ifdef TARGET_HAS_PRECISE_SMC
TranslationBlock *current_tb = tcg_tb_lookup(pc);
if (current_tb) {
current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID;
}
#endif
} else {
target_ulong start, len, i;
int prot;
if (qemu_host_page_size <= TARGET_PAGE_SIZE) {
start = address & TARGET_PAGE_MASK;
len = TARGET_PAGE_SIZE;
prot = p->flags | PAGE_WRITE;
pageflags_set_clear(start, start + len - 1, PAGE_WRITE, 0);
current_tb_invalidated = tb_invalidate_phys_page_unwind(start, pc);
} else {
start = address & qemu_host_page_mask;
len = qemu_host_page_size;
prot = 0;
for (i = 0; i < len; i += TARGET_PAGE_SIZE) {
target_ulong addr = start + i;
p = pageflags_find(addr, addr);
if (p) {
prot |= p->flags;
if (p->flags & PAGE_WRITE_ORG) {
prot |= PAGE_WRITE;
pageflags_set_clear(addr, addr + TARGET_PAGE_SIZE - 1,
PAGE_WRITE, 0);
}
}
/*
* Since the content will be modified, we must invalidate
* the corresponding translated code.
*/
current_tb_invalidated |=
tb_invalidate_phys_page_unwind(addr, pc);
}
}
if (prot & PAGE_EXEC) {
prot = (prot & ~PAGE_EXEC) | PAGE_READ;
}
mprotect((void *)g2h_untagged(start), len, prot & PAGE_BITS);
}
mmap_unlock();
/* If current TB was invalidated return to main loop */
return current_tb_invalidated ? 2 : 1;
}
static int probe_access_internal(CPUArchState *env, target_ulong addr, static int probe_access_internal(CPUArchState *env, target_ulong addr,
int fault_size, MMUAccessType access_type, int fault_size, MMUAccessType access_type,
bool nonfault, uintptr_t ra) bool nonfault, uintptr_t ra)
@ -210,47 +769,96 @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
return addr; return addr;
} }
#ifdef TARGET_PAGE_DATA_SIZE
/*
* Allocate chunks of target data together. For the only current user,
* if we allocate one hunk per page, we have overhead of 40/128 or 40%.
* Therefore, allocate memory for 64 pages at a time for overhead < 1%.
*/
#define TPD_PAGES 64
#define TBD_MASK (TARGET_PAGE_MASK * TPD_PAGES)
typedef struct TargetPageDataNode {
IntervalTreeNode itree;
char data[TPD_PAGES][TARGET_PAGE_DATA_SIZE] __attribute__((aligned));
} TargetPageDataNode;
static IntervalTreeRoot targetdata_root;
void page_reset_target_data(target_ulong start, target_ulong end) void page_reset_target_data(target_ulong start, target_ulong end)
{ {
#ifdef TARGET_PAGE_DATA_SIZE IntervalTreeNode *n, *next;
target_ulong addr, len; target_ulong last;
/*
* This function should never be called with addresses outside the
* guest address space. If this assert fires, it probably indicates
* a missing call to h2g_valid.
*/
assert(end - 1 <= GUEST_ADDR_MAX);
assert(start < end);
assert_memory_lock(); assert_memory_lock();
start = start & TARGET_PAGE_MASK; start = start & TARGET_PAGE_MASK;
end = TARGET_PAGE_ALIGN(end); last = TARGET_PAGE_ALIGN(end) - 1;
for (addr = start, len = end - start; for (n = interval_tree_iter_first(&targetdata_root, start, last),
len != 0; next = n ? interval_tree_iter_next(n, start, last) : NULL;
len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) { n != NULL;
PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, 1); n = next,
next = next ? interval_tree_iter_next(n, start, last) : NULL) {
target_ulong n_start, n_last, p_ofs, p_len;
TargetPageDataNode *t;
g_free(p->target_data); if (n->start >= start && n->last <= last) {
p->target_data = NULL; interval_tree_remove(n, &targetdata_root);
g_free(n);
continue;
}
if (n->start < start) {
n_start = start;
p_ofs = (start - n->start) >> TARGET_PAGE_BITS;
} else {
n_start = n->start;
p_ofs = 0;
}
n_last = MIN(last, n->last);
p_len = (n_last + 1 - n_start) >> TARGET_PAGE_BITS;
t = container_of(n, TargetPageDataNode, itree);
memset(t->data[p_ofs], 0, p_len * TARGET_PAGE_DATA_SIZE);
} }
#endif
} }
#ifdef TARGET_PAGE_DATA_SIZE
void *page_get_target_data(target_ulong address) void *page_get_target_data(target_ulong address)
{ {
PageDesc *p = page_find(address >> TARGET_PAGE_BITS); IntervalTreeNode *n;
void *ret = p->target_data; TargetPageDataNode *t;
target_ulong page, region;
if (!ret) { page = address & TARGET_PAGE_MASK;
ret = g_malloc0(TARGET_PAGE_DATA_SIZE); region = address & TBD_MASK;
p->target_data = ret;
n = interval_tree_iter_first(&targetdata_root, page, page);
if (!n) {
/*
* See util/interval-tree.c re lockless lookups: no false positives
* but there are false negatives. If we find nothing, retry with
* the mmap lock acquired. We also need the lock for the
* allocation + insert.
*/
mmap_lock();
n = interval_tree_iter_first(&targetdata_root, page, page);
if (!n) {
t = g_new0(TargetPageDataNode, 1);
n = &t->itree;
n->start = region;
n->last = region | ~TBD_MASK;
interval_tree_insert(n, &targetdata_root);
}
mmap_unlock();
} }
return ret;
t = container_of(n, TargetPageDataNode, itree);
return t->data[(page - region) >> TARGET_PAGE_BITS];
} }
#endif #else
void page_reset_target_data(target_ulong start, target_ulong end) { }
#endif /* TARGET_PAGE_DATA_SIZE */
/* The softmmu versions of these helpers are in cputlb.c. */ /* The softmmu versions of these helpers are in cputlb.c. */

View File

@ -24,6 +24,7 @@
#ifdef CONFIG_TCG #ifdef CONFIG_TCG
#include "exec/cpu_ldst.h" #include "exec/cpu_ldst.h"
#endif #endif
#include "qemu/interval-tree.h"
/* allow to see translation results - the slowdown should be negligible, so we leave it */ /* allow to see translation results - the slowdown should be negligible, so we leave it */
#define DEBUG_DISAS #define DEBUG_DISAS
@ -559,11 +560,20 @@ struct TranslationBlock {
struct tb_tc tc; struct tb_tc tc;
/* first and second physical page containing code. The lower bit /*
of the pointer tells the index in page_next[]. * Track tb_page_addr_t intervals that intersect this TB.
The list is protected by the TB's page('s) lock(s) */ * For user-only, the virtual addresses are always contiguous,
* and we use a unified interval tree. For system, we use a
* linked list headed in each PageDesc. Within the list, the lsb
* of the previous pointer tells the index of page_next[], and the
* list is protected by the PageDesc lock(s).
*/
#ifdef CONFIG_USER_ONLY
IntervalTreeNode itree;
#else
uintptr_t page_next[2]; uintptr_t page_next[2];
tb_page_addr_t page_addr[2]; tb_page_addr_t page_addr[2];
#endif
/* jmp_lock placed here to fill a 4-byte hole. Its documentation is below */ /* jmp_lock placed here to fill a 4-byte hole. Its documentation is below */
QemuSpin jmp_lock; QemuSpin jmp_lock;
@ -619,24 +629,51 @@ static inline uint32_t tb_cflags(const TranslationBlock *tb)
static inline tb_page_addr_t tb_page_addr0(const TranslationBlock *tb) static inline tb_page_addr_t tb_page_addr0(const TranslationBlock *tb)
{ {
#ifdef CONFIG_USER_ONLY
return tb->itree.start;
#else
return tb->page_addr[0]; return tb->page_addr[0];
#endif
} }
static inline tb_page_addr_t tb_page_addr1(const TranslationBlock *tb) static inline tb_page_addr_t tb_page_addr1(const TranslationBlock *tb)
{ {
#ifdef CONFIG_USER_ONLY
tb_page_addr_t next = tb->itree.last & TARGET_PAGE_MASK;
return next == (tb->itree.start & TARGET_PAGE_MASK) ? -1 : next;
#else
return tb->page_addr[1]; return tb->page_addr[1];
#endif
} }
static inline void tb_set_page_addr0(TranslationBlock *tb, static inline void tb_set_page_addr0(TranslationBlock *tb,
tb_page_addr_t addr) tb_page_addr_t addr)
{ {
#ifdef CONFIG_USER_ONLY
tb->itree.start = addr;
/*
* To begin, we record an interval of one byte. When the translation
* loop encounters a second page, the interval will be extended to
* include the first byte of the second page, which is sufficient to
* allow tb_page_addr1() above to work properly. The final corrected
* interval will be set by tb_page_add() from tb->size before the
* node is added to the interval tree.
*/
tb->itree.last = addr;
#else
tb->page_addr[0] = addr; tb->page_addr[0] = addr;
#endif
} }
static inline void tb_set_page_addr1(TranslationBlock *tb, static inline void tb_set_page_addr1(TranslationBlock *tb,
tb_page_addr_t addr) tb_page_addr_t addr)
{ {
#ifdef CONFIG_USER_ONLY
/* Extend the interval to the first byte of the second page. See above. */
tb->itree.last = addr;
#else
tb->page_addr[1] = addr; tb->page_addr[1] = addr;
#endif
} }
/* current cflags for hashing/comparison */ /* current cflags for hashing/comparison */

View File

@ -23,12 +23,6 @@
/* translate-all.c */ /* translate-all.c */
struct page_collection *page_collection_lock(tb_page_addr_t start,
tb_page_addr_t end);
void page_collection_unlock(struct page_collection *set);
void tb_invalidate_phys_page_fast(struct page_collection *pages,
tb_page_addr_t start, int len,
uintptr_t retaddr);
void tb_invalidate_phys_page(tb_page_addr_t addr); void tb_invalidate_phys_page(tb_page_addr_t addr);
void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr); void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr);

View File

@ -0,0 +1,99 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Interval trees.
*
* Derived from include/linux/interval_tree.h and its dependencies.
*/
#ifndef QEMU_INTERVAL_TREE_H
#define QEMU_INTERVAL_TREE_H
/*
* For now, don't expose Linux Red-Black Trees separately, but retain the
* separate type definitions to keep the implementation sane, and allow
* the possibility of disentangling them later.
*/
typedef struct RBNode
{
/* Encodes parent with color in the lsb. */
uintptr_t rb_parent_color;
struct RBNode *rb_right;
struct RBNode *rb_left;
} RBNode;
typedef struct RBRoot
{
RBNode *rb_node;
} RBRoot;
typedef struct RBRootLeftCached {
RBRoot rb_root;
RBNode *rb_leftmost;
} RBRootLeftCached;
typedef struct IntervalTreeNode
{
RBNode rb;
uint64_t start; /* Start of interval */
uint64_t last; /* Last location _in_ interval */
uint64_t subtree_last;
} IntervalTreeNode;
typedef RBRootLeftCached IntervalTreeRoot;
/**
* interval_tree_is_empty
* @root: root of the tree.
*
* Returns true if the tree contains no nodes.
*/
static inline bool interval_tree_is_empty(const IntervalTreeRoot *root)
{
return root->rb_root.rb_node == NULL;
}
/**
* interval_tree_insert
* @node: node to insert,
* @root: root of the tree.
*
* Insert @node into @root, and rebalance.
*/
void interval_tree_insert(IntervalTreeNode *node, IntervalTreeRoot *root);
/**
* interval_tree_remove
* @node: node to remove,
* @root: root of the tree.
*
* Remove @node from @root, and rebalance.
*/
void interval_tree_remove(IntervalTreeNode *node, IntervalTreeRoot *root);
/**
* interval_tree_iter_first:
* @root: root of the tree,
* @start, @last: the inclusive interval [start, last].
*
* Locate the "first" of a set of nodes within the tree at @root
* that overlap the interval, where "first" is sorted by start.
* Returns NULL if no overlap found.
*/
IntervalTreeNode *interval_tree_iter_first(IntervalTreeRoot *root,
uint64_t start, uint64_t last);
/**
* interval_tree_iter_next:
* @node: previous search result
* @start, @last: the inclusive interval [start, last].
*
* Locate the "next" of a set of nodes within the tree that overlap the
* interval; @next is the result of a previous call to
* interval_tree_iter_{first,next}. Returns NULL if @next was the last
* node in the set.
*/
IntervalTreeNode *interval_tree_iter_next(IntervalTreeNode *node,
uint64_t start, uint64_t last);
#endif /* QEMU_INTERVAL_TREE_H */

View File

@ -0,0 +1,22 @@
/*
* Test very large vma allocations.
* The qemu out-of-memory condition was within the mmap syscall itself.
* If the syscall actually returns with MAP_FAILED, the test succeeded.
*/
#include <sys/mman.h>
int main()
{
int n = sizeof(size_t) == 4 ? 32 : 45;
for (int i = 28; i < n; i++) {
size_t l = (size_t)1 << i;
void *p = mmap(0, l, PROT_NONE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
if (p == MAP_FAILED) {
break;
}
munmap(p, l);
}
return 0;
}

View File

@ -47,6 +47,7 @@ tests = {
'ptimer-test': ['ptimer-test-stubs.c', meson.project_source_root() / 'hw/core/ptimer.c'], 'ptimer-test': ['ptimer-test-stubs.c', meson.project_source_root() / 'hw/core/ptimer.c'],
'test-qapi-util': [], 'test-qapi-util': [],
'test-smp-parse': [qom, meson.project_source_root() / 'hw/core/machine-smp.c'], 'test-smp-parse': [qom, meson.project_source_root() / 'hw/core/machine-smp.c'],
'test-interval-tree': [],
} }
if have_system or have_tools if have_system or have_tools

View File

@ -0,0 +1,209 @@
/*
* Test interval trees
*
* This work is licensed under the terms of the GNU LGPL, version 2 or later.
* See the COPYING.LIB file in the top-level directory.
*
*/
#include "qemu/osdep.h"
#include "qemu/interval-tree.h"
static IntervalTreeNode nodes[20];
static IntervalTreeRoot root;
static void rand_interval(IntervalTreeNode *n, uint64_t start, uint64_t last)
{
gint32 s_ofs, l_ofs, l_max;
if (last - start > INT32_MAX) {
l_max = INT32_MAX;
} else {
l_max = last - start;
}
s_ofs = g_test_rand_int_range(0, l_max);
l_ofs = g_test_rand_int_range(s_ofs, l_max);
n->start = start + s_ofs;
n->last = start + l_ofs;
}
static void test_empty(void)
{
g_assert(root.rb_root.rb_node == NULL);
g_assert(root.rb_leftmost == NULL);
g_assert(interval_tree_iter_first(&root, 0, UINT64_MAX) == NULL);
}
static void test_find_one_point(void)
{
/* Create a tree of a single node, which is the point [1,1]. */
nodes[0].start = 1;
nodes[0].last = 1;
interval_tree_insert(&nodes[0], &root);
g_assert(interval_tree_iter_first(&root, 0, 9) == &nodes[0]);
g_assert(interval_tree_iter_next(&nodes[0], 0, 9) == NULL);
g_assert(interval_tree_iter_first(&root, 0, 0) == NULL);
g_assert(interval_tree_iter_next(&nodes[0], 0, 0) == NULL);
g_assert(interval_tree_iter_first(&root, 0, 1) == &nodes[0]);
g_assert(interval_tree_iter_first(&root, 1, 1) == &nodes[0]);
g_assert(interval_tree_iter_first(&root, 1, 2) == &nodes[0]);
g_assert(interval_tree_iter_first(&root, 2, 2) == NULL);
interval_tree_remove(&nodes[0], &root);
g_assert(root.rb_root.rb_node == NULL);
g_assert(root.rb_leftmost == NULL);
}
static void test_find_two_point(void)
{
IntervalTreeNode *find0, *find1;
/* Create a tree of a two nodes, which are both the point [1,1]. */
nodes[0].start = 1;
nodes[0].last = 1;
nodes[1] = nodes[0];
interval_tree_insert(&nodes[0], &root);
interval_tree_insert(&nodes[1], &root);
find0 = interval_tree_iter_first(&root, 0, 9);
g_assert(find0 == &nodes[0] || find0 == &nodes[1]);
find1 = interval_tree_iter_next(find0, 0, 9);
g_assert(find1 == &nodes[0] || find1 == &nodes[1]);
g_assert(find0 != find1);
interval_tree_remove(&nodes[1], &root);
g_assert(interval_tree_iter_first(&root, 0, 9) == &nodes[0]);
g_assert(interval_tree_iter_next(&nodes[0], 0, 9) == NULL);
interval_tree_remove(&nodes[0], &root);
}
static void test_find_one_range(void)
{
/* Create a tree of a single node, which is the range [1,8]. */
nodes[0].start = 1;
nodes[0].last = 8;
interval_tree_insert(&nodes[0], &root);
g_assert(interval_tree_iter_first(&root, 0, 9) == &nodes[0]);
g_assert(interval_tree_iter_next(&nodes[0], 0, 9) == NULL);
g_assert(interval_tree_iter_first(&root, 0, 0) == NULL);
g_assert(interval_tree_iter_first(&root, 0, 1) == &nodes[0]);
g_assert(interval_tree_iter_first(&root, 1, 1) == &nodes[0]);
g_assert(interval_tree_iter_first(&root, 4, 6) == &nodes[0]);
g_assert(interval_tree_iter_first(&root, 8, 8) == &nodes[0]);
g_assert(interval_tree_iter_first(&root, 9, 9) == NULL);
interval_tree_remove(&nodes[0], &root);
}
static void test_find_one_range_many(void)
{
int i;
/*
* Create a tree of many nodes in [0,99] and [200,299],
* but only one node with exactly [110,190].
*/
nodes[0].start = 110;
nodes[0].last = 190;
for (i = 1; i < ARRAY_SIZE(nodes) / 2; ++i) {
rand_interval(&nodes[i], 0, 99);
}
for (; i < ARRAY_SIZE(nodes); ++i) {
rand_interval(&nodes[i], 200, 299);
}
for (i = 0; i < ARRAY_SIZE(nodes); ++i) {
interval_tree_insert(&nodes[i], &root);
}
/* Test that we find exactly the one node. */
g_assert(interval_tree_iter_first(&root, 100, 199) == &nodes[0]);
g_assert(interval_tree_iter_next(&nodes[0], 100, 199) == NULL);
g_assert(interval_tree_iter_first(&root, 100, 109) == NULL);
g_assert(interval_tree_iter_first(&root, 100, 110) == &nodes[0]);
g_assert(interval_tree_iter_first(&root, 111, 120) == &nodes[0]);
g_assert(interval_tree_iter_first(&root, 111, 199) == &nodes[0]);
g_assert(interval_tree_iter_first(&root, 190, 199) == &nodes[0]);
g_assert(interval_tree_iter_first(&root, 192, 199) == NULL);
/*
* Test that if there are multiple matches, we return the one
* with the minimal start.
*/
g_assert(interval_tree_iter_first(&root, 100, 300) == &nodes[0]);
/* Test that we don't find it after it is removed. */
interval_tree_remove(&nodes[0], &root);
g_assert(interval_tree_iter_first(&root, 100, 199) == NULL);
for (i = 1; i < ARRAY_SIZE(nodes); ++i) {
interval_tree_remove(&nodes[i], &root);
}
}
static void test_find_many_range(void)
{
IntervalTreeNode *find;
int i, n;
n = g_test_rand_int_range(ARRAY_SIZE(nodes) / 3, ARRAY_SIZE(nodes) / 2);
/*
* Create a fair few nodes in [2000,2999], with the others
* distributed around.
*/
for (i = 0; i < n; ++i) {
rand_interval(&nodes[i], 2000, 2999);
}
for (; i < ARRAY_SIZE(nodes) * 2 / 3; ++i) {
rand_interval(&nodes[i], 1000, 1899);
}
for (; i < ARRAY_SIZE(nodes); ++i) {
rand_interval(&nodes[i], 3100, 3999);
}
for (i = 0; i < ARRAY_SIZE(nodes); ++i) {
interval_tree_insert(&nodes[i], &root);
}
/* Test that we find all of the nodes. */
find = interval_tree_iter_first(&root, 2000, 2999);
for (i = 0; find != NULL; i++) {
find = interval_tree_iter_next(find, 2000, 2999);
}
g_assert_cmpint(i, ==, n);
g_assert(interval_tree_iter_first(&root, 0, 999) == NULL);
g_assert(interval_tree_iter_first(&root, 1900, 1999) == NULL);
g_assert(interval_tree_iter_first(&root, 3000, 3099) == NULL);
g_assert(interval_tree_iter_first(&root, 4000, UINT64_MAX) == NULL);
for (i = 0; i < ARRAY_SIZE(nodes); ++i) {
interval_tree_remove(&nodes[i], &root);
}
}
int main(int argc, char **argv)
{
g_test_init(&argc, &argv, NULL);
g_test_add_func("/interval-tree/empty", test_empty);
g_test_add_func("/interval-tree/find-one-point", test_find_one_point);
g_test_add_func("/interval-tree/find-two-point", test_find_two_point);
g_test_add_func("/interval-tree/find-one-range", test_find_one_range);
g_test_add_func("/interval-tree/find-one-range-many",
test_find_one_range_many);
g_test_add_func("/interval-tree/find-many-range", test_find_many_range);
return g_test_run();
}

View File

@ -42,10 +42,6 @@ find_ram_offset(uint64_t size, uint64_t offset) "size: 0x%" PRIx64 " @ 0x%" PRIx
find_ram_offset_loop(uint64_t size, uint64_t candidate, uint64_t offset, uint64_t next, uint64_t mingap) "trying size: 0x%" PRIx64 " @ 0x%" PRIx64 ", offset: 0x%" PRIx64" next: 0x%" PRIx64 " mingap: 0x%" PRIx64 find_ram_offset_loop(uint64_t size, uint64_t candidate, uint64_t offset, uint64_t next, uint64_t mingap) "trying size: 0x%" PRIx64 " @ 0x%" PRIx64 ", offset: 0x%" PRIx64" next: 0x%" PRIx64 " mingap: 0x%" PRIx64
ram_block_discard_range(const char *rbname, void *hva, size_t length, bool need_madvise, bool need_fallocate, int ret) "%s@%p + 0x%zx: madvise: %d fallocate: %d ret: %d" ram_block_discard_range(const char *rbname, void *hva, size_t length, bool need_madvise, bool need_fallocate, int ret) "%s@%p + 0x%zx: madvise: %d fallocate: %d ret: %d"
# accel/tcg/cputlb.c
memory_notdirty_write_access(uint64_t vaddr, uint64_t ram_addr, unsigned size) "0x%" PRIx64 " ram_addr 0x%" PRIx64 " size %u"
memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64
# job.c # job.c
job_state_transition(void *job, int ret, const char *legal, const char *s0, const char *s1) "job %p (ret: %d) attempting %s transition (%s-->%s)" job_state_transition(void *job, int ret, const char *legal, const char *s0, const char *s1) "job %p (ret: %d) attempting %s transition (%s-->%s)"
job_apply_verb(void *job, const char *state, const char *verb, const char *legal) "job %p in state %s; applying verb %s (%s)" job_apply_verb(void *job, const char *state, const char *verb, const char *legal) "job %p in state %s; applying verb %s (%s)"

882
util/interval-tree.c Normal file
View File

@ -0,0 +1,882 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include "qemu/osdep.h"
#include "qemu/interval-tree.h"
#include "qemu/atomic.h"
/*
* Red Black Trees.
*
* For now, don't expose Linux Red-Black Trees separately, but retain the
* separate type definitions to keep the implementation sane, and allow
* the possibility of separating them later.
*
* Derived from include/linux/rbtree_augmented.h and its dependencies.
*/
/*
* red-black trees properties: https://en.wikipedia.org/wiki/Rbtree
*
* 1) A node is either red or black
* 2) The root is black
* 3) All leaves (NULL) are black
* 4) Both children of every red node are black
* 5) Every simple path from root to leaves contains the same number
* of black nodes.
*
* 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
* consecutive red nodes in a path and every red node is therefore followed by
* a black. So if B is the number of black nodes on every simple path (as per
* 5), then the longest possible path due to 4 is 2B.
*
* We shall indicate color with case, where black nodes are uppercase and red
* nodes will be lowercase. Unknown color nodes shall be drawn as red within
* parentheses and have some accompanying text comment.
*
* Notes on lockless lookups:
*
* All stores to the tree structure (rb_left and rb_right) must be done using
* WRITE_ONCE [qatomic_set for QEMU]. And we must not inadvertently cause
* (temporary) loops in the tree structure as seen in program order.
*
* These two requirements will allow lockless iteration of the tree -- not
* correct iteration mind you, tree rotations are not atomic so a lookup might
* miss entire subtrees.
*
* But they do guarantee that any such traversal will only see valid elements
* and that it will indeed complete -- does not get stuck in a loop.
*
* It also guarantees that if the lookup returns an element it is the 'correct'
* one. But not returning an element does _NOT_ mean it's not present.
*
* NOTE:
*
* Stores to __rb_parent_color are not important for simple lookups so those
* are left undone as of now. Nor did I check for loops involving parent
* pointers.
*/
typedef enum RBColor
{
RB_RED,
RB_BLACK,
} RBColor;
typedef struct RBAugmentCallbacks {
void (*propagate)(RBNode *node, RBNode *stop);
void (*copy)(RBNode *old, RBNode *new);
void (*rotate)(RBNode *old, RBNode *new);
} RBAugmentCallbacks;
static inline RBNode *rb_parent(const RBNode *n)
{
return (RBNode *)(n->rb_parent_color & ~1);
}
static inline RBNode *rb_red_parent(const RBNode *n)
{
return (RBNode *)n->rb_parent_color;
}
static inline RBColor pc_color(uintptr_t pc)
{
return (RBColor)(pc & 1);
}
static inline bool pc_is_red(uintptr_t pc)
{
return pc_color(pc) == RB_RED;
}
static inline bool pc_is_black(uintptr_t pc)
{
return !pc_is_red(pc);
}
static inline RBColor rb_color(const RBNode *n)
{
return pc_color(n->rb_parent_color);
}
static inline bool rb_is_red(const RBNode *n)
{
return pc_is_red(n->rb_parent_color);
}
static inline bool rb_is_black(const RBNode *n)
{
return pc_is_black(n->rb_parent_color);
}
static inline void rb_set_black(RBNode *n)
{
n->rb_parent_color |= RB_BLACK;
}
static inline void rb_set_parent_color(RBNode *n, RBNode *p, RBColor color)
{
n->rb_parent_color = (uintptr_t)p | color;
}
static inline void rb_set_parent(RBNode *n, RBNode *p)
{
rb_set_parent_color(n, p, rb_color(n));
}
static inline void rb_link_node(RBNode *node, RBNode *parent, RBNode **rb_link)
{
node->rb_parent_color = (uintptr_t)parent;
node->rb_left = node->rb_right = NULL;
qatomic_set(rb_link, node);
}
static RBNode *rb_next(RBNode *node)
{
RBNode *parent;
/* OMIT: if empty node, return null. */
/*
* If we have a right-hand child, go down and then left as far as we can.
*/
if (node->rb_right) {
node = node->rb_right;
while (node->rb_left) {
node = node->rb_left;
}
return node;
}
/*
* No right-hand children. Everything down and left is smaller than us,
* so any 'next' node must be in the general direction of our parent.
* Go up the tree; any time the ancestor is a right-hand child of its
* parent, keep going up. First time it's a left-hand child of its
* parent, said parent is our 'next' node.
*/
while ((parent = rb_parent(node)) && node == parent->rb_right) {
node = parent;
}
return parent;
}
static inline void rb_change_child(RBNode *old, RBNode *new,
RBNode *parent, RBRoot *root)
{
if (!parent) {
qatomic_set(&root->rb_node, new);
} else if (parent->rb_left == old) {
qatomic_set(&parent->rb_left, new);
} else {
qatomic_set(&parent->rb_right, new);
}
}
static inline void rb_rotate_set_parents(RBNode *old, RBNode *new,
RBRoot *root, RBColor color)
{
RBNode *parent = rb_parent(old);
new->rb_parent_color = old->rb_parent_color;
rb_set_parent_color(old, new, color);
rb_change_child(old, new, parent, root);
}
static void rb_insert_augmented(RBNode *node, RBRoot *root,
const RBAugmentCallbacks *augment)
{
RBNode *parent = rb_red_parent(node), *gparent, *tmp;
while (true) {
/*
* Loop invariant: node is red.
*/
if (unlikely(!parent)) {
/*
* The inserted node is root. Either this is the first node, or
* we recursed at Case 1 below and are no longer violating 4).
*/
rb_set_parent_color(node, NULL, RB_BLACK);
break;
}
/*
* If there is a black parent, we are done. Otherwise, take some
* corrective action as, per 4), we don't want a red root or two
* consecutive red nodes.
*/
if (rb_is_black(parent)) {
break;
}
gparent = rb_red_parent(parent);
tmp = gparent->rb_right;
if (parent != tmp) { /* parent == gparent->rb_left */
if (tmp && rb_is_red(tmp)) {
/*
* Case 1 - node's uncle is red (color flips).
*
* G g
* / \ / \
* p u --> P U
* / /
* n n
*
* However, since g's parent might be red, and 4) does not
* allow this, we need to recurse at g.
*/
rb_set_parent_color(tmp, gparent, RB_BLACK);
rb_set_parent_color(parent, gparent, RB_BLACK);
node = gparent;
parent = rb_parent(node);
rb_set_parent_color(node, parent, RB_RED);
continue;
}
tmp = parent->rb_right;
if (node == tmp) {
/*
* Case 2 - node's uncle is black and node is
* the parent's right child (left rotate at parent).
*
* G G
* / \ / \
* p U --> n U
* \ /
* n p
*
* This still leaves us in violation of 4), the
* continuation into Case 3 will fix that.
*/
tmp = node->rb_left;
qatomic_set(&parent->rb_right, tmp);
qatomic_set(&node->rb_left, parent);
if (tmp) {
rb_set_parent_color(tmp, parent, RB_BLACK);
}
rb_set_parent_color(parent, node, RB_RED);
augment->rotate(parent, node);
parent = node;
tmp = node->rb_right;
}
/*
* Case 3 - node's uncle is black and node is
* the parent's left child (right rotate at gparent).
*
* G P
* / \ / \
* p U --> n g
* / \
* n U
*/
qatomic_set(&gparent->rb_left, tmp); /* == parent->rb_right */
qatomic_set(&parent->rb_right, gparent);
if (tmp) {
rb_set_parent_color(tmp, gparent, RB_BLACK);
}
rb_rotate_set_parents(gparent, parent, root, RB_RED);
augment->rotate(gparent, parent);
break;
} else {
tmp = gparent->rb_left;
if (tmp && rb_is_red(tmp)) {
/* Case 1 - color flips */
rb_set_parent_color(tmp, gparent, RB_BLACK);
rb_set_parent_color(parent, gparent, RB_BLACK);
node = gparent;
parent = rb_parent(node);
rb_set_parent_color(node, parent, RB_RED);
continue;
}
tmp = parent->rb_left;
if (node == tmp) {
/* Case 2 - right rotate at parent */
tmp = node->rb_right;
qatomic_set(&parent->rb_left, tmp);
qatomic_set(&node->rb_right, parent);
if (tmp) {
rb_set_parent_color(tmp, parent, RB_BLACK);
}
rb_set_parent_color(parent, node, RB_RED);
augment->rotate(parent, node);
parent = node;
tmp = node->rb_left;
}
/* Case 3 - left rotate at gparent */
qatomic_set(&gparent->rb_right, tmp); /* == parent->rb_left */
qatomic_set(&parent->rb_left, gparent);
if (tmp) {
rb_set_parent_color(tmp, gparent, RB_BLACK);
}
rb_rotate_set_parents(gparent, parent, root, RB_RED);
augment->rotate(gparent, parent);
break;
}
}
}
static void rb_insert_augmented_cached(RBNode *node,
RBRootLeftCached *root, bool newleft,
const RBAugmentCallbacks *augment)
{
if (newleft) {
root->rb_leftmost = node;
}
rb_insert_augmented(node, &root->rb_root, augment);
}
static void rb_erase_color(RBNode *parent, RBRoot *root,
const RBAugmentCallbacks *augment)
{
RBNode *node = NULL, *sibling, *tmp1, *tmp2;
while (true) {
/*
* Loop invariants:
* - node is black (or NULL on first iteration)
* - node is not the root (parent is not NULL)
* - All leaf paths going through parent and node have a
* black node count that is 1 lower than other leaf paths.
*/
sibling = parent->rb_right;
if (node != sibling) { /* node == parent->rb_left */
if (rb_is_red(sibling)) {
/*
* Case 1 - left rotate at parent
*
* P S
* / \ / \
* N s --> p Sr
* / \ / \
* Sl Sr N Sl
*/
tmp1 = sibling->rb_left;
qatomic_set(&parent->rb_right, tmp1);
qatomic_set(&sibling->rb_left, parent);
rb_set_parent_color(tmp1, parent, RB_BLACK);
rb_rotate_set_parents(parent, sibling, root, RB_RED);
augment->rotate(parent, sibling);
sibling = tmp1;
}
tmp1 = sibling->rb_right;
if (!tmp1 || rb_is_black(tmp1)) {
tmp2 = sibling->rb_left;
if (!tmp2 || rb_is_black(tmp2)) {
/*
* Case 2 - sibling color flip
* (p could be either color here)
*
* (p) (p)
* / \ / \
* N S --> N s
* / \ / \
* Sl Sr Sl Sr
*
* This leaves us violating 5) which
* can be fixed by flipping p to black
* if it was red, or by recursing at p.
* p is red when coming from Case 1.
*/
rb_set_parent_color(sibling, parent, RB_RED);
if (rb_is_red(parent)) {
rb_set_black(parent);
} else {
node = parent;
parent = rb_parent(node);
if (parent) {
continue;
}
}
break;
}
/*
* Case 3 - right rotate at sibling
* (p could be either color here)
*
* (p) (p)
* / \ / \
* N S --> N sl
* / \ \
* sl Sr S
* \
* Sr
*
* Note: p might be red, and then bot
* p and sl are red after rotation (which
* breaks property 4). This is fixed in
* Case 4 (in rb_rotate_set_parents()
* which set sl the color of p
* and set p RB_BLACK)
*
* (p) (sl)
* / \ / \
* N sl --> P S
* \ / \
* S N Sr
* \
* Sr
*/
tmp1 = tmp2->rb_right;
qatomic_set(&sibling->rb_left, tmp1);
qatomic_set(&tmp2->rb_right, sibling);
qatomic_set(&parent->rb_right, tmp2);
if (tmp1) {
rb_set_parent_color(tmp1, sibling, RB_BLACK);
}
augment->rotate(sibling, tmp2);
tmp1 = sibling;
sibling = tmp2;
}
/*
* Case 4 - left rotate at parent + color flips
* (p and sl could be either color here.
* After rotation, p becomes black, s acquires
* p's color, and sl keeps its color)
*
* (p) (s)
* / \ / \
* N S --> P Sr
* / \ / \
* (sl) sr N (sl)
*/
tmp2 = sibling->rb_left;
qatomic_set(&parent->rb_right, tmp2);
qatomic_set(&sibling->rb_left, parent);
rb_set_parent_color(tmp1, sibling, RB_BLACK);
if (tmp2) {
rb_set_parent(tmp2, parent);
}
rb_rotate_set_parents(parent, sibling, root, RB_BLACK);
augment->rotate(parent, sibling);
break;
} else {
sibling = parent->rb_left;
if (rb_is_red(sibling)) {
/* Case 1 - right rotate at parent */
tmp1 = sibling->rb_right;
qatomic_set(&parent->rb_left, tmp1);
qatomic_set(&sibling->rb_right, parent);
rb_set_parent_color(tmp1, parent, RB_BLACK);
rb_rotate_set_parents(parent, sibling, root, RB_RED);
augment->rotate(parent, sibling);
sibling = tmp1;
}
tmp1 = sibling->rb_left;
if (!tmp1 || rb_is_black(tmp1)) {
tmp2 = sibling->rb_right;
if (!tmp2 || rb_is_black(tmp2)) {
/* Case 2 - sibling color flip */
rb_set_parent_color(sibling, parent, RB_RED);
if (rb_is_red(parent)) {
rb_set_black(parent);
} else {
node = parent;
parent = rb_parent(node);
if (parent) {
continue;
}
}
break;
}
/* Case 3 - left rotate at sibling */
tmp1 = tmp2->rb_left;
qatomic_set(&sibling->rb_right, tmp1);
qatomic_set(&tmp2->rb_left, sibling);
qatomic_set(&parent->rb_left, tmp2);
if (tmp1) {
rb_set_parent_color(tmp1, sibling, RB_BLACK);
}
augment->rotate(sibling, tmp2);
tmp1 = sibling;
sibling = tmp2;
}
/* Case 4 - right rotate at parent + color flips */
tmp2 = sibling->rb_right;
qatomic_set(&parent->rb_left, tmp2);
qatomic_set(&sibling->rb_right, parent);
rb_set_parent_color(tmp1, sibling, RB_BLACK);
if (tmp2) {
rb_set_parent(tmp2, parent);
}
rb_rotate_set_parents(parent, sibling, root, RB_BLACK);
augment->rotate(parent, sibling);
break;
}
}
}
static void rb_erase_augmented(RBNode *node, RBRoot *root,
const RBAugmentCallbacks *augment)
{
RBNode *child = node->rb_right;
RBNode *tmp = node->rb_left;
RBNode *parent, *rebalance;
uintptr_t pc;
if (!tmp) {
/*
* Case 1: node to erase has no more than 1 child (easy!)
*
* Note that if there is one child it must be red due to 5)
* and node must be black due to 4). We adjust colors locally
* so as to bypass rb_erase_color() later on.
*/
pc = node->rb_parent_color;
parent = rb_parent(node);
rb_change_child(node, child, parent, root);
if (child) {
child->rb_parent_color = pc;
rebalance = NULL;
} else {
rebalance = pc_is_black(pc) ? parent : NULL;
}
tmp = parent;
} else if (!child) {
/* Still case 1, but this time the child is node->rb_left */
pc = node->rb_parent_color;
parent = rb_parent(node);
tmp->rb_parent_color = pc;
rb_change_child(node, tmp, parent, root);
rebalance = NULL;
tmp = parent;
} else {
RBNode *successor = child, *child2;
tmp = child->rb_left;
if (!tmp) {
/*
* Case 2: node's successor is its right child
*
* (n) (s)
* / \ / \
* (x) (s) -> (x) (c)
* \
* (c)
*/
parent = successor;
child2 = successor->rb_right;
augment->copy(node, successor);
} else {
/*
* Case 3: node's successor is leftmost under
* node's right child subtree
*
* (n) (s)
* / \ / \
* (x) (y) -> (x) (y)
* / /
* (p) (p)
* / /
* (s) (c)
* \
* (c)
*/
do {
parent = successor;
successor = tmp;
tmp = tmp->rb_left;
} while (tmp);
child2 = successor->rb_right;
qatomic_set(&parent->rb_left, child2);
qatomic_set(&successor->rb_right, child);
rb_set_parent(child, successor);
augment->copy(node, successor);
augment->propagate(parent, successor);
}
tmp = node->rb_left;
qatomic_set(&successor->rb_left, tmp);
rb_set_parent(tmp, successor);
pc = node->rb_parent_color;
tmp = rb_parent(node);
rb_change_child(node, successor, tmp, root);
if (child2) {
rb_set_parent_color(child2, parent, RB_BLACK);
rebalance = NULL;
} else {
rebalance = rb_is_black(successor) ? parent : NULL;
}
successor->rb_parent_color = pc;
tmp = successor;
}
augment->propagate(tmp, NULL);
if (rebalance) {
rb_erase_color(rebalance, root, augment);
}
}
static void rb_erase_augmented_cached(RBNode *node, RBRootLeftCached *root,
const RBAugmentCallbacks *augment)
{
if (root->rb_leftmost == node) {
root->rb_leftmost = rb_next(node);
}
rb_erase_augmented(node, &root->rb_root, augment);
}
/*
* Interval trees.
*
* Derived from lib/interval_tree.c and its dependencies,
* especially include/linux/interval_tree_generic.h.
*/
#define rb_to_itree(N) container_of(N, IntervalTreeNode, rb)
static bool interval_tree_compute_max(IntervalTreeNode *node, bool exit)
{
IntervalTreeNode *child;
uint64_t max = node->last;
if (node->rb.rb_left) {
child = rb_to_itree(node->rb.rb_left);
if (child->subtree_last > max) {
max = child->subtree_last;
}
}
if (node->rb.rb_right) {
child = rb_to_itree(node->rb.rb_right);
if (child->subtree_last > max) {
max = child->subtree_last;
}
}
if (exit && node->subtree_last == max) {
return true;
}
node->subtree_last = max;
return false;
}
static void interval_tree_propagate(RBNode *rb, RBNode *stop)
{
while (rb != stop) {
IntervalTreeNode *node = rb_to_itree(rb);
if (interval_tree_compute_max(node, true)) {
break;
}
rb = rb_parent(&node->rb);
}
}
static void interval_tree_copy(RBNode *rb_old, RBNode *rb_new)
{
IntervalTreeNode *old = rb_to_itree(rb_old);
IntervalTreeNode *new = rb_to_itree(rb_new);
new->subtree_last = old->subtree_last;
}
static void interval_tree_rotate(RBNode *rb_old, RBNode *rb_new)
{
IntervalTreeNode *old = rb_to_itree(rb_old);
IntervalTreeNode *new = rb_to_itree(rb_new);
new->subtree_last = old->subtree_last;
interval_tree_compute_max(old, false);
}
static const RBAugmentCallbacks interval_tree_augment = {
.propagate = interval_tree_propagate,
.copy = interval_tree_copy,
.rotate = interval_tree_rotate,
};
/* Insert / remove interval nodes from the tree */
void interval_tree_insert(IntervalTreeNode *node, IntervalTreeRoot *root)
{
RBNode **link = &root->rb_root.rb_node, *rb_parent = NULL;
uint64_t start = node->start, last = node->last;
IntervalTreeNode *parent;
bool leftmost = true;
while (*link) {
rb_parent = *link;
parent = rb_to_itree(rb_parent);
if (parent->subtree_last < last) {
parent->subtree_last = last;
}
if (start < parent->start) {
link = &parent->rb.rb_left;
} else {
link = &parent->rb.rb_right;
leftmost = false;
}
}
node->subtree_last = last;
rb_link_node(&node->rb, rb_parent, link);
rb_insert_augmented_cached(&node->rb, root, leftmost,
&interval_tree_augment);
}
void interval_tree_remove(IntervalTreeNode *node, IntervalTreeRoot *root)
{
rb_erase_augmented_cached(&node->rb, root, &interval_tree_augment);
}
/*
* Iterate over intervals intersecting [start;last]
*
* Note that a node's interval intersects [start;last] iff:
* Cond1: node->start <= last
* and
* Cond2: start <= node->last
*/
static IntervalTreeNode *interval_tree_subtree_search(IntervalTreeNode *node,
uint64_t start,
uint64_t last)
{
while (true) {
/*
* Loop invariant: start <= node->subtree_last
* (Cond2 is satisfied by one of the subtree nodes)
*/
if (node->rb.rb_left) {
IntervalTreeNode *left = rb_to_itree(node->rb.rb_left);
if (start <= left->subtree_last) {
/*
* Some nodes in left subtree satisfy Cond2.
* Iterate to find the leftmost such node N.
* If it also satisfies Cond1, that's the
* match we are looking for. Otherwise, there
* is no matching interval as nodes to the
* right of N can't satisfy Cond1 either.
*/
node = left;
continue;
}
}
if (node->start <= last) { /* Cond1 */
if (start <= node->last) { /* Cond2 */
return node; /* node is leftmost match */
}
if (node->rb.rb_right) {
node = rb_to_itree(node->rb.rb_right);
if (start <= node->subtree_last) {
continue;
}
}
}
return NULL; /* no match */
}
}
IntervalTreeNode *interval_tree_iter_first(IntervalTreeRoot *root,
uint64_t start, uint64_t last)
{
IntervalTreeNode *node, *leftmost;
if (!root->rb_root.rb_node) {
return NULL;
}
/*
* Fastpath range intersection/overlap between A: [a0, a1] and
* B: [b0, b1] is given by:
*
* a0 <= b1 && b0 <= a1
*
* ... where A holds the lock range and B holds the smallest
* 'start' and largest 'last' in the tree. For the later, we
* rely on the root node, which by augmented interval tree
* property, holds the largest value in its last-in-subtree.
* This allows mitigating some of the tree walk overhead for
* for non-intersecting ranges, maintained and consulted in O(1).
*/
node = rb_to_itree(root->rb_root.rb_node);
if (node->subtree_last < start) {
return NULL;
}
leftmost = rb_to_itree(root->rb_leftmost);
if (leftmost->start > last) {
return NULL;
}
return interval_tree_subtree_search(node, start, last);
}
IntervalTreeNode *interval_tree_iter_next(IntervalTreeNode *node,
uint64_t start, uint64_t last)
{
RBNode *rb = node->rb.rb_right, *prev;
while (true) {
/*
* Loop invariants:
* Cond1: node->start <= last
* rb == node->rb.rb_right
*
* First, search right subtree if suitable
*/
if (rb) {
IntervalTreeNode *right = rb_to_itree(rb);
if (start <= right->subtree_last) {
return interval_tree_subtree_search(right, start, last);
}
}
/* Move up the tree until we come from a node's left child */
do {
rb = rb_parent(&node->rb);
if (!rb) {
return NULL;
}
prev = &node->rb;
node = rb_to_itree(rb);
rb = node->rb.rb_right;
} while (prev == rb);
/* Check if the node intersects [start;last] */
if (last < node->start) { /* !Cond1 */
return NULL;
}
if (start <= node->last) { /* Cond2 */
return node;
}
}
}
/* Occasionally useful for calling from within the debugger. */
#if 0
static void debug_interval_tree_int(IntervalTreeNode *node,
const char *dir, int level)
{
printf("%4d %*s %s [%" PRIu64 ",%" PRIu64 "] subtree_last:%" PRIu64 "\n",
level, level + 1, dir, rb_is_red(&node->rb) ? "r" : "b",
node->start, node->last, node->subtree_last);
if (node->rb.rb_left) {
debug_interval_tree_int(rb_to_itree(node->rb.rb_left), "<", level + 1);
}
if (node->rb.rb_right) {
debug_interval_tree_int(rb_to_itree(node->rb.rb_right), ">", level + 1);
}
}
void debug_interval_tree(IntervalTreeNode *node);
void debug_interval_tree(IntervalTreeNode *node)
{
if (node) {
debug_interval_tree_int(node, "*", 0);
} else {
printf("null\n");
}
}
#endif

View File

@ -57,6 +57,7 @@ util_ss.add(files('guest-random.c'))
util_ss.add(files('yank.c')) util_ss.add(files('yank.c'))
util_ss.add(files('int128.c')) util_ss.add(files('int128.c'))
util_ss.add(files('memalign.c')) util_ss.add(files('memalign.c'))
util_ss.add(files('interval-tree.c'))
if have_user if have_user
util_ss.add(files('selfmap.c')) util_ss.add(files('selfmap.c'))