accel/tcg: Use interval tree for user-only page tracking

Finish weaning user-only away from PageDesc.

Using an interval tree to track page permissions means that
we can represent very large regions efficiently.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/290
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/967
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1214
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
Richard Henderson 2022-10-05 07:34:51 -07:00
parent d941c086b8
commit 67ff2186b0
4 changed files with 451 additions and 210 deletions

View File

@ -24,9 +24,7 @@
#endif
typedef struct PageDesc {
#ifdef CONFIG_USER_ONLY
unsigned long flags;
#else
#ifndef CONFIG_USER_ONLY
QemuSpin lock;
/* list of TBs intersecting this ram page */
uintptr_t first_tb;

View File

@ -68,15 +68,23 @@ static void tb_remove_all(void)
/* Call with mmap_lock held. */
static void tb_record(TranslationBlock *tb, PageDesc *p1, PageDesc *p2)
{
/* translator_loop() must have made all TB pages non-writable */
assert(!(p1->flags & PAGE_WRITE));
if (p2) {
assert(!(p2->flags & PAGE_WRITE));
}
target_ulong addr;
int flags;
assert_memory_lock();
tb->itree.last = tb->itree.start + tb->size - 1;
/* translator_loop() must have made all TB pages non-writable */
addr = tb_page_addr0(tb);
flags = page_get_flags(addr);
assert(!(flags & PAGE_WRITE));
addr = tb_page_addr1(tb);
if (addr != -1) {
flags = page_get_flags(addr);
assert(!(flags & PAGE_WRITE));
}
interval_tree_insert(&tb->itree, &tb_root);
}

View File

@ -135,92 +135,49 @@ bool handle_sigsegv_accerr_write(CPUState *cpu, sigset_t *old_set,
}
}
/*
* Walks guest process memory "regions" one by one
* and calls callback function 'fn' for each region.
*/
struct walk_memory_regions_data {
walk_memory_regions_fn fn;
void *priv;
target_ulong start;
int prot;
};
typedef struct PageFlagsNode {
IntervalTreeNode itree;
int flags;
} PageFlagsNode;
static int walk_memory_regions_end(struct walk_memory_regions_data *data,
target_ulong end, int new_prot)
static IntervalTreeRoot pageflags_root;
static PageFlagsNode *pageflags_find(target_ulong start, target_long last)
{
if (data->start != -1u) {
int rc = data->fn(data->priv, data->start, end, data->prot);
if (rc != 0) {
return rc;
}
}
IntervalTreeNode *n;
data->start = (new_prot ? end : -1u);
data->prot = new_prot;
return 0;
n = interval_tree_iter_first(&pageflags_root, start, last);
return n ? container_of(n, PageFlagsNode, itree) : NULL;
}
static int walk_memory_regions_1(struct walk_memory_regions_data *data,
target_ulong base, int level, void **lp)
static PageFlagsNode *pageflags_next(PageFlagsNode *p, target_ulong start,
target_long last)
{
target_ulong pa;
int i, rc;
IntervalTreeNode *n;
if (*lp == NULL) {
return walk_memory_regions_end(data, base, 0);
}
if (level == 0) {
PageDesc *pd = *lp;
for (i = 0; i < V_L2_SIZE; ++i) {
int prot = pd[i].flags;
pa = base | (i << TARGET_PAGE_BITS);
if (prot != data->prot) {
rc = walk_memory_regions_end(data, pa, prot);
if (rc != 0) {
return rc;
}
}
}
} else {
void **pp = *lp;
for (i = 0; i < V_L2_SIZE; ++i) {
pa = base | ((target_ulong)i <<
(TARGET_PAGE_BITS + V_L2_BITS * level));
rc = walk_memory_regions_1(data, pa, level - 1, pp + i);
if (rc != 0) {
return rc;
}
}
}
return 0;
n = interval_tree_iter_next(&p->itree, start, last);
return n ? container_of(n, PageFlagsNode, itree) : NULL;
}
int walk_memory_regions(void *priv, walk_memory_regions_fn fn)
{
struct walk_memory_regions_data data;
uintptr_t i, l1_sz = v_l1_size;
IntervalTreeNode *n;
int rc = 0;
data.fn = fn;
data.priv = priv;
data.start = -1u;
data.prot = 0;
mmap_lock();
for (n = interval_tree_iter_first(&pageflags_root, 0, -1);
n != NULL;
n = interval_tree_iter_next(n, 0, -1)) {
PageFlagsNode *p = container_of(n, PageFlagsNode, itree);
for (i = 0; i < l1_sz; i++) {
target_ulong base = i << (v_l1_shift + TARGET_PAGE_BITS);
int rc = walk_memory_regions_1(&data, base, v_l2_levels, l1_map + i);
rc = fn(priv, n->start, n->last + 1, p->flags);
if (rc != 0) {
return rc;
break;
}
}
mmap_unlock();
return walk_memory_regions_end(&data, 0, 0);
return rc;
}
static int dump_region(void *priv, target_ulong start,
@ -228,13 +185,11 @@ static int dump_region(void *priv, target_ulong start,
{
FILE *f = (FILE *)priv;
(void) fprintf(f, TARGET_FMT_lx"-"TARGET_FMT_lx
" "TARGET_FMT_lx" %c%c%c\n",
fprintf(f, TARGET_FMT_lx"-"TARGET_FMT_lx" "TARGET_FMT_lx" %c%c%c\n",
start, end, end - start,
((prot & PAGE_READ) ? 'r' : '-'),
((prot & PAGE_WRITE) ? 'w' : '-'),
((prot & PAGE_EXEC) ? 'x' : '-'));
return 0;
}
@ -242,20 +197,131 @@ static int dump_region(void *priv, target_ulong start,
void page_dump(FILE *f)
{
const int length = sizeof(target_ulong) * 2;
(void) fprintf(f, "%-*s %-*s %-*s %s\n",
fprintf(f, "%-*s %-*s %-*s %s\n",
length, "start", length, "end", length, "size", "prot");
walk_memory_regions(f, dump_region);
}
int page_get_flags(target_ulong address)
{
PageDesc *p;
PageFlagsNode *p = pageflags_find(address, address);
p = page_find(address >> TARGET_PAGE_BITS);
if (!p) {
/*
* See util/interval-tree.c re lockless lookups: no false positives but
* there are false negatives. If we find nothing, retry with the mmap
* lock acquired.
*/
if (p) {
return p->flags;
}
if (have_mmap_lock()) {
return 0;
}
return p->flags;
mmap_lock();
p = pageflags_find(address, address);
mmap_unlock();
return p ? p->flags : 0;
}
/* A subroutine of page_set_flags: insert a new node for [start,last]. */
static void pageflags_create(target_ulong start, target_ulong last, int flags)
{
PageFlagsNode *p = g_new(PageFlagsNode, 1);
p->itree.start = start;
p->itree.last = last;
p->flags = flags;
interval_tree_insert(&p->itree, &pageflags_root);
}
/* A subroutine of page_set_flags: remove everything in [start,last]. */
static bool pageflags_unset(target_ulong start, target_ulong last)
{
bool inval_tb = false;
while (true) {
PageFlagsNode *p = pageflags_find(start, last);
target_ulong p_last;
if (!p) {
break;
}
if (p->flags & PAGE_EXEC) {
inval_tb = true;
}
interval_tree_remove(&p->itree, &pageflags_root);
p_last = p->itree.last;
if (p->itree.start < start) {
/* Truncate the node from the end, or split out the middle. */
p->itree.last = start - 1;
interval_tree_insert(&p->itree, &pageflags_root);
if (last < p_last) {
pageflags_create(last + 1, p_last, p->flags);
break;
}
} else if (p_last <= last) {
/* Range completely covers node -- remove it. */
g_free(p);
} else {
/* Truncate the node from the start. */
p->itree.start = last + 1;
interval_tree_insert(&p->itree, &pageflags_root);
break;
}
}
return inval_tb;
}
/*
* A subroutine of page_set_flags: nothing overlaps [start,last],
* but check adjacent mappings and maybe merge into a single range.
*/
static void pageflags_create_merge(target_ulong start, target_ulong last,
int flags)
{
PageFlagsNode *next = NULL, *prev = NULL;
if (start > 0) {
prev = pageflags_find(start - 1, start - 1);
if (prev) {
if (prev->flags == flags) {
interval_tree_remove(&prev->itree, &pageflags_root);
} else {
prev = NULL;
}
}
}
if (last + 1 != 0) {
next = pageflags_find(last + 1, last + 1);
if (next) {
if (next->flags == flags) {
interval_tree_remove(&next->itree, &pageflags_root);
} else {
next = NULL;
}
}
}
if (prev) {
if (next) {
prev->itree.last = next->itree.last;
g_free(next);
} else {
prev->itree.last = last;
}
interval_tree_insert(&prev->itree, &pageflags_root);
} else if (next) {
next->itree.start = start;
interval_tree_insert(&next->itree, &pageflags_root);
} else {
pageflags_create(start, last, flags);
}
}
/*
@ -267,6 +333,146 @@ int page_get_flags(target_ulong address)
#endif
#define PAGE_STICKY (PAGE_ANON | PAGE_PASSTHROUGH | PAGE_TARGET_STICKY)
/* A subroutine of page_set_flags: add flags to [start,last]. */
static bool pageflags_set_clear(target_ulong start, target_ulong last,
int set_flags, int clear_flags)
{
PageFlagsNode *p;
target_ulong p_start, p_last;
int p_flags, merge_flags;
bool inval_tb = false;
restart:
p = pageflags_find(start, last);
if (!p) {
if (set_flags) {
pageflags_create_merge(start, last, set_flags);
}
goto done;
}
p_start = p->itree.start;
p_last = p->itree.last;
p_flags = p->flags;
/* Using mprotect on a page does not change sticky bits. */
merge_flags = (p_flags & ~clear_flags) | set_flags;
/*
* Need to flush if an overlapping executable region
* removes exec, or adds write.
*/
if ((p_flags & PAGE_EXEC)
&& (!(merge_flags & PAGE_EXEC)
|| (merge_flags & ~p_flags & PAGE_WRITE))) {
inval_tb = true;
}
/*
* If there is an exact range match, update and return without
* attempting to merge with adjacent regions.
*/
if (start == p_start && last == p_last) {
if (merge_flags) {
p->flags = merge_flags;
} else {
interval_tree_remove(&p->itree, &pageflags_root);
g_free(p);
}
goto done;
}
/*
* If sticky bits affect the original mapping, then we must be more
* careful about the existing intervals and the separate flags.
*/
if (set_flags != merge_flags) {
if (p_start < start) {
interval_tree_remove(&p->itree, &pageflags_root);
p->itree.last = start - 1;
interval_tree_insert(&p->itree, &pageflags_root);
if (last < p_last) {
if (merge_flags) {
pageflags_create(start, last, merge_flags);
}
pageflags_create(last + 1, p_last, p_flags);
} else {
if (merge_flags) {
pageflags_create(start, p_last, merge_flags);
}
if (p_last < last) {
start = p_last + 1;
goto restart;
}
}
} else {
if (start < p_start && set_flags) {
pageflags_create(start, p_start - 1, set_flags);
}
if (last < p_last) {
interval_tree_remove(&p->itree, &pageflags_root);
p->itree.start = last + 1;
interval_tree_insert(&p->itree, &pageflags_root);
if (merge_flags) {
pageflags_create(start, last, merge_flags);
}
} else {
if (merge_flags) {
p->flags = merge_flags;
} else {
interval_tree_remove(&p->itree, &pageflags_root);
g_free(p);
}
if (p_last < last) {
start = p_last + 1;
goto restart;
}
}
}
goto done;
}
/* If flags are not changing for this range, incorporate it. */
if (set_flags == p_flags) {
if (start < p_start) {
interval_tree_remove(&p->itree, &pageflags_root);
p->itree.start = start;
interval_tree_insert(&p->itree, &pageflags_root);
}
if (p_last < last) {
start = p_last + 1;
goto restart;
}
goto done;
}
/* Maybe split out head and/or tail ranges with the original flags. */
interval_tree_remove(&p->itree, &pageflags_root);
if (p_start < start) {
p->itree.last = start - 1;
interval_tree_insert(&p->itree, &pageflags_root);
if (p_last < last) {
goto restart;
}
if (last < p_last) {
pageflags_create(last + 1, p_last, p_flags);
}
} else if (last < p_last) {
p->itree.start = last + 1;
interval_tree_insert(&p->itree, &pageflags_root);
} else {
g_free(p);
goto restart;
}
if (set_flags) {
pageflags_create(start, last, set_flags);
}
done:
return inval_tb;
}
/*
* Modify the flags of a page and invalidate the code if necessary.
* The flag PAGE_WRITE_ORG is positioned automatically depending
@ -274,49 +480,41 @@ int page_get_flags(target_ulong address)
*/
void page_set_flags(target_ulong start, target_ulong end, int flags)
{
target_ulong addr, len;
bool reset, inval_tb = false;
target_ulong last;
bool reset = false;
bool inval_tb = false;
/* This function should never be called with addresses outside the
guest address space. If this assert fires, it probably indicates
a missing call to h2g_valid. */
assert(end - 1 <= GUEST_ADDR_MAX);
assert(start < end);
assert(end - 1 <= GUEST_ADDR_MAX);
/* Only set PAGE_ANON with new mappings. */
assert(!(flags & PAGE_ANON) || (flags & PAGE_RESET));
assert_memory_lock();
start = start & TARGET_PAGE_MASK;
end = TARGET_PAGE_ALIGN(end);
last = end - 1;
if (!(flags & PAGE_VALID)) {
flags = 0;
} else {
reset = flags & PAGE_RESET;
flags &= ~PAGE_RESET;
if (flags & PAGE_WRITE) {
flags |= PAGE_WRITE_ORG;
}
reset = !(flags & PAGE_VALID) || (flags & PAGE_RESET);
if (reset) {
}
if (!flags || reset) {
page_reset_target_data(start, end);
inval_tb |= pageflags_unset(start, last);
}
flags &= ~PAGE_RESET;
for (addr = start, len = end - start;
len != 0;
len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, true);
/*
* If the page was executable, but is reset, or is no longer
* executable, or has become writable, then invalidate any code.
*/
if ((p->flags & PAGE_EXEC)
&& (reset ||
!(flags & PAGE_EXEC) ||
(flags & ~p->flags & PAGE_WRITE))) {
inval_tb = true;
if (flags) {
inval_tb |= pageflags_set_clear(start, last, flags,
~(reset ? 0 : PAGE_STICKY));
}
/* Using mprotect on a page does not change sticky bits. */
p->flags = (reset ? 0 : p->flags & PAGE_STICKY) | flags;
}
if (inval_tb) {
tb_invalidate_phys_range(start, end);
}
@ -324,87 +522,89 @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
int page_check_range(target_ulong start, target_ulong len, int flags)
{
PageDesc *p;
target_ulong end;
target_ulong addr;
/*
* This function should never be called with addresses outside the
* guest address space. If this assert fires, it probably indicates
* a missing call to h2g_valid.
*/
if (TARGET_ABI_BITS > L1_MAP_ADDR_SPACE_BITS) {
assert(start < ((target_ulong)1 << L1_MAP_ADDR_SPACE_BITS));
}
target_ulong last;
if (len == 0) {
return 0;
}
if (start + len - 1 < start) {
/* We've wrapped around. */
return -1;
return 0; /* trivial length */
}
/* must do before we loose bits in the next step */
end = TARGET_PAGE_ALIGN(start + len);
start = start & TARGET_PAGE_MASK;
last = start + len - 1;
if (last < start) {
return -1; /* wrap around */
}
while (true) {
PageFlagsNode *p = pageflags_find(start, last);
int missing;
for (addr = start, len = end - start;
len != 0;
len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
p = page_find(addr >> TARGET_PAGE_BITS);
if (!p) {
return -1;
return -1; /* entire region invalid */
}
if (!(p->flags & PAGE_VALID)) {
return -1;
if (start < p->itree.start) {
return -1; /* initial bytes invalid */
}
if ((flags & PAGE_READ) && !(p->flags & PAGE_READ)) {
return -1;
missing = flags & ~p->flags;
if (missing & PAGE_READ) {
return -1; /* page not readable */
}
if (flags & PAGE_WRITE) {
if (missing & PAGE_WRITE) {
if (!(p->flags & PAGE_WRITE_ORG)) {
return -1; /* page not writable */
}
/* Asking about writable, but has been protected: undo. */
if (!page_unprotect(start, 0)) {
return -1;
}
/* unprotect the page if it was put read-only because it
contains translated code */
if (!(p->flags & PAGE_WRITE)) {
if (!page_unprotect(addr, 0)) {
return -1;
/* TODO: page_unprotect should take a range, not a single page. */
if (last - start < TARGET_PAGE_SIZE) {
return 0; /* ok */
}
}
}
}
return 0;
}
void page_protect(tb_page_addr_t page_addr)
{
target_ulong addr;
PageDesc *p;
int prot;
p = page_find(page_addr >> TARGET_PAGE_BITS);
if (p && (p->flags & PAGE_WRITE)) {
/*
* Force the host page as non writable (writes will have a page fault +
* mprotect overhead).
*/
page_addr &= qemu_host_page_mask;
prot = 0;
for (addr = page_addr; addr < page_addr + qemu_host_page_size;
addr += TARGET_PAGE_SIZE) {
p = page_find(addr >> TARGET_PAGE_BITS);
if (!p) {
start += TARGET_PAGE_SIZE;
continue;
}
prot |= p->flags;
p->flags &= ~PAGE_WRITE;
if (last <= p->itree.last) {
return 0; /* ok */
}
mprotect(g2h_untagged(page_addr), qemu_host_page_size,
(prot & PAGE_BITS) & ~PAGE_WRITE);
start = p->itree.last + 1;
}
}
void page_protect(tb_page_addr_t address)
{
PageFlagsNode *p;
target_ulong start, last;
int prot;
assert_memory_lock();
if (qemu_host_page_size <= TARGET_PAGE_SIZE) {
start = address & TARGET_PAGE_MASK;
last = start + TARGET_PAGE_SIZE - 1;
} else {
start = address & qemu_host_page_mask;
last = start + qemu_host_page_size - 1;
}
p = pageflags_find(start, last);
if (!p) {
return;
}
prot = p->flags;
if (unlikely(p->itree.last < last)) {
/* More than one protection region covers the one host page. */
assert(TARGET_PAGE_SIZE < qemu_host_page_size);
while ((p = pageflags_next(p, start, last)) != NULL) {
prot |= p->flags;
}
}
if (prot & PAGE_WRITE) {
pageflags_set_clear(start, last, 0, PAGE_WRITE);
mprotect(g2h_untagged(start), qemu_host_page_size,
prot & (PAGE_READ | PAGE_EXEC) ? PROT_READ : PROT_NONE);
}
}
@ -417,10 +617,8 @@ void page_protect(tb_page_addr_t page_addr)
*/
int page_unprotect(target_ulong address, uintptr_t pc)
{
unsigned int prot;
PageFlagsNode *p;
bool current_tb_invalidated;
PageDesc *p;
target_ulong host_start, host_end, addr;
/*
* Technically this isn't safe inside a signal handler. However we
@ -429,17 +627,14 @@ int page_unprotect(target_ulong address, uintptr_t pc)
*/
mmap_lock();
p = page_find(address >> TARGET_PAGE_BITS);
if (!p) {
p = pageflags_find(address, address);
/* If this address was not really writable, nothing to do. */
if (!p || !(p->flags & PAGE_WRITE_ORG)) {
mmap_unlock();
return 0;
}
/*
* If the page was really writable, then we change its
* protection back to writable.
*/
if (p->flags & PAGE_WRITE_ORG) {
current_tb_invalidated = false;
if (p->flags & PAGE_WRITE) {
/*
@ -454,15 +649,32 @@ int page_unprotect(target_ulong address, uintptr_t pc)
}
#endif
} else {
host_start = address & qemu_host_page_mask;
host_end = host_start + qemu_host_page_size;
target_ulong start, len, i;
int prot;
if (qemu_host_page_size <= TARGET_PAGE_SIZE) {
start = address & TARGET_PAGE_MASK;
len = TARGET_PAGE_SIZE;
prot = p->flags | PAGE_WRITE;
pageflags_set_clear(start, start + len - 1, PAGE_WRITE, 0);
current_tb_invalidated = tb_invalidate_phys_page_unwind(start, pc);
} else {
start = address & qemu_host_page_mask;
len = qemu_host_page_size;
prot = 0;
for (addr = host_start; addr < host_end; addr += TARGET_PAGE_SIZE) {
p = page_find(addr >> TARGET_PAGE_BITS);
p->flags |= PAGE_WRITE;
prot |= p->flags;
for (i = 0; i < len; i += TARGET_PAGE_SIZE) {
target_ulong addr = start + i;
p = pageflags_find(addr, addr);
if (p) {
prot |= p->flags;
if (p->flags & PAGE_WRITE_ORG) {
prot |= PAGE_WRITE;
pageflags_set_clear(addr, addr + TARGET_PAGE_SIZE - 1,
PAGE_WRITE, 0);
}
}
/*
* Since the content will be modified, we must invalidate
* the corresponding translated code.
@ -470,15 +682,16 @@ int page_unprotect(target_ulong address, uintptr_t pc)
current_tb_invalidated |=
tb_invalidate_phys_page_unwind(addr, pc);
}
mprotect((void *)g2h_untagged(host_start), qemu_host_page_size,
prot & PAGE_BITS);
}
if (prot & PAGE_EXEC) {
prot = (prot & ~PAGE_EXEC) | PAGE_READ;
}
mprotect((void *)g2h_untagged(start), len, prot & PAGE_BITS);
}
mmap_unlock();
/* If current TB was invalidated return to main loop */
return current_tb_invalidated ? 2 : 1;
}
mmap_unlock();
return 0;
}
static int probe_access_internal(CPUArchState *env, target_ulong addr,

View File

@ -0,0 +1,22 @@
/*
* Test very large vma allocations.
* The qemu out-of-memory condition was within the mmap syscall itself.
* If the syscall actually returns with MAP_FAILED, the test succeeded.
*/
#include <sys/mman.h>
int main()
{
int n = sizeof(size_t) == 4 ? 32 : 45;
for (int i = 28; i < n; i++) {
size_t l = (size_t)1 << i;
void *p = mmap(0, l, PROT_NONE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
if (p == MAP_FAILED) {
break;
}
munmap(p, l);
}
return 0;
}