Implement multi-level page tables.

Define L1_MAP_ADDR_SPACE_BITS to be either the virtual address size
(in user mode) or physical address size (in system mode), and use
that to size l1_map.  This rewrites page_find_alloc, page_flush_tb,
and walk_memory_regions.

Use TARGET_PHYS_ADDR_SPACE_BITS for the physical memory map based
off of l1_phys_map.  This rewrites page_phys_find_alloc and
phys_page_for_each.

Signed-off-by: Richard Henderson <rth@twiddle.net>
This commit is contained in:
Richard Henderson 2010-03-10 15:53:37 -08:00 committed by Paul Brook
parent 14f24e1465
commit 5cd2c5b6ad
2 changed files with 280 additions and 182 deletions

View File

@ -745,8 +745,11 @@ extern unsigned long qemu_host_page_mask;
#define PAGE_RESERVED 0x0020
void page_dump(FILE *f);
int walk_memory_regions(void *,
int (*fn)(void *, unsigned long, unsigned long, unsigned long));
/*
 * Callback type for walk_memory_regions().
 *
 * The walker invokes it as fn(priv, start, end, prot): the opaque pointer
 * handed to walk_memory_regions(), the start and end address of one run of
 * pages sharing the same flags, and those flags.  A non-zero return value
 * stops the walk and is returned to the caller of walk_memory_regions().
 */
typedef int (*walk_memory_regions_fn)(void *, unsigned long,
unsigned long, unsigned long);
/* Walk the page map, calling the callback once per region; the first
   argument is passed through unchanged as the callback's first argument. */
int walk_memory_regions(void *, walk_memory_regions_fn);
int page_get_flags(target_ulong address);
void page_set_flags(target_ulong start, target_ulong end, int flags);
int page_check_range(target_ulong start, target_ulong len, int flags);

455
exec.c
View File

@ -141,30 +141,56 @@ typedef struct PhysPageDesc {
ram_addr_t region_offset;
} PhysPageDesc;
#define L2_BITS 10
#if defined(CONFIG_USER_ONLY) && defined(TARGET_VIRT_ADDR_SPACE_BITS)
/* XXX: this is a temporary hack for alpha target.
* In the future, this is to be replaced by a multi-level table
* to actually be able to handle the complete 64 bits address space.
*/
#define L1_BITS (TARGET_VIRT_ADDR_SPACE_BITS - L2_BITS - TARGET_PAGE_BITS)
/* In system mode we want L1_MAP to be based on physical addresses,
while in user mode we want it to be based on virtual addresses. */
#if !defined(CONFIG_USER_ONLY)
# define L1_MAP_ADDR_SPACE_BITS TARGET_PHYS_ADDR_SPACE_BITS
#else
#define L1_BITS (32 - L2_BITS - TARGET_PAGE_BITS)
# define L1_MAP_ADDR_SPACE_BITS TARGET_VIRT_ADDR_SPACE_BITS
#endif
#define L1_SIZE (1 << L1_BITS)
/* Size of the L2 (and L3, etc) page tables. */
#define L2_BITS 10
#define L2_SIZE (1 << L2_BITS)
/* The bits remaining after N lower levels of page tables. */
#define P_L1_BITS_REM \
((TARGET_PHYS_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % L2_BITS)
#define V_L1_BITS_REM \
((L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % L2_BITS)
/* Size of the L1 page table. Avoid silly small sizes. */
#if P_L1_BITS_REM < 4
#define P_L1_BITS (P_L1_BITS_REM + L2_BITS)
#else
#define P_L1_BITS P_L1_BITS_REM
#endif
#if V_L1_BITS_REM < 4
#define V_L1_BITS (V_L1_BITS_REM + L2_BITS)
#else
#define V_L1_BITS V_L1_BITS_REM
#endif
#define P_L1_SIZE ((target_phys_addr_t)1 << P_L1_BITS)
#define V_L1_SIZE ((target_ulong)1 << V_L1_BITS)
#define P_L1_SHIFT (TARGET_PHYS_ADDR_SPACE_BITS - TARGET_PAGE_BITS - P_L1_BITS)
#define V_L1_SHIFT (L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS - V_L1_BITS)
unsigned long qemu_real_host_page_size;
unsigned long qemu_host_page_bits;
unsigned long qemu_host_page_size;
unsigned long qemu_host_page_mask;
/* XXX: for system emulation, it could just be an array */
static PageDesc *l1_map[L1_SIZE];
/* This is a multi-level map on the virtual address space.
The bottom level has pointers to PageDesc. */
static void *l1_map[V_L1_SIZE];
#if !defined(CONFIG_USER_ONLY)
static PhysPageDesc **l1_phys_map;
/* This is a multi-level map on the physical address space.
The bottom level has pointers to PhysPageDesc. */
static void *l1_phys_map[P_L1_SIZE];
static void io_mem_init(void);
@ -239,133 +265,159 @@ static void page_init(void)
while ((1 << qemu_host_page_bits) < qemu_host_page_size)
qemu_host_page_bits++;
qemu_host_page_mask = ~(qemu_host_page_size - 1);
#if !defined(CONFIG_USER_ONLY)
l1_phys_map = qemu_vmalloc(L1_SIZE * sizeof(void *));
memset(l1_phys_map, 0, L1_SIZE * sizeof(void *));
#endif
#if !defined(_WIN32) && defined(CONFIG_USER_ONLY)
{
long long startaddr, endaddr;
FILE *f;
int n;
mmap_lock();
last_brk = (unsigned long)sbrk(0);
f = fopen("/proc/self/maps", "r");
if (f) {
mmap_lock();
do {
n = fscanf (f, "%llx-%llx %*[^\n]\n", &startaddr, &endaddr);
if (n == 2) {
startaddr = MIN(startaddr,
(1ULL << TARGET_PHYS_ADDR_SPACE_BITS) - 1);
endaddr = MIN(endaddr,
(1ULL << TARGET_PHYS_ADDR_SPACE_BITS) - 1);
page_set_flags(startaddr & TARGET_PAGE_MASK,
TARGET_PAGE_ALIGN(endaddr),
PAGE_RESERVED);
unsigned long startaddr, endaddr;
int n;
n = fscanf (f, "%lx-%lx %*[^\n]\n", &startaddr, &endaddr);
if (n == 2 && h2g_valid(startaddr)) {
startaddr = h2g(startaddr) & TARGET_PAGE_MASK;
if (h2g_valid(endaddr)) {
endaddr = h2g(endaddr);
} else {
endaddr = ~0ul;
}
page_set_flags(startaddr, endaddr, PAGE_RESERVED);
}
} while (!feof(f));
fclose(f);
mmap_unlock();
}
mmap_unlock();
}
#endif
}
static inline PageDesc **page_l1_map(target_ulong index)
static PageDesc *page_find_alloc(target_ulong index, int alloc)
{
#if TARGET_LONG_BITS > 32
/* Host memory outside guest VM. For 32-bit targets we have already
excluded high addresses. */
if (index > ((target_ulong)L2_SIZE * L1_SIZE))
return NULL;
#endif
return &l1_map[index >> L2_BITS];
}
static inline PageDesc *page_find_alloc(target_ulong index)
{
PageDesc **lp, *p;
lp = page_l1_map(index);
if (!lp)
return NULL;
p = *lp;
if (!p) {
/* allocate if not found */
#if defined(CONFIG_USER_ONLY)
size_t len = sizeof(PageDesc) * L2_SIZE;
/* Don't use qemu_malloc because it may recurse. */
p = mmap(NULL, len, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
*lp = p;
if (h2g_valid(p)) {
unsigned long addr = h2g(p);
page_set_flags(addr & TARGET_PAGE_MASK,
TARGET_PAGE_ALIGN(addr + len),
PAGE_RESERVED);
}
/* We can't use qemu_malloc because it may recurse into a locked mutex.
Neither can we record the new pages we reserve while allocating a
given page because that may recurse into an unallocated page table
entry. Stuff the allocations we do make into a queue and process
them after having completed one entire page table allocation. */
unsigned long reserve[2 * (V_L1_SHIFT / L2_BITS)];
int reserve_idx = 0;
# define ALLOC(P, SIZE) \
do { \
P = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, \
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); \
if (h2g_valid(P)) { \
reserve[reserve_idx] = h2g(P); \
reserve[reserve_idx + 1] = SIZE; \
reserve_idx += 2; \
} \
} while (0)
#else
p = qemu_mallocz(sizeof(PageDesc) * L2_SIZE);
*lp = p;
# define ALLOC(P, SIZE) \
do { P = qemu_mallocz(SIZE); } while (0)
#endif
PageDesc *pd;
void **lp;
int i;
/* Level 1. Always allocated. */
lp = l1_map + ((index >> V_L1_SHIFT) & (V_L1_SIZE - 1));
/* Level 2..N-1. */
for (i = V_L1_SHIFT / L2_BITS - 1; i > 0; i--) {
void **p = *lp;
if (p == NULL) {
if (!alloc) {
return NULL;
}
ALLOC(p, sizeof(void *) * L2_SIZE);
*lp = p;
}
lp = p + ((index >> (i * L2_BITS)) & (L2_SIZE - 1));
}
return p + (index & (L2_SIZE - 1));
pd = *lp;
if (pd == NULL) {
if (!alloc) {
return NULL;
}
ALLOC(pd, sizeof(PageDesc) * L2_SIZE);
*lp = pd;
}
#undef ALLOC
#if defined(CONFIG_USER_ONLY)
for (i = 0; i < reserve_idx; i += 2) {
unsigned long addr = reserve[i];
unsigned long len = reserve[i + 1];
page_set_flags(addr & TARGET_PAGE_MASK,
TARGET_PAGE_ALIGN(addr + len),
PAGE_RESERVED);
}
#endif
return pd + (index & (L2_SIZE - 1));
}
static inline PageDesc *page_find(target_ulong index)
{
PageDesc **lp, *p;
lp = page_l1_map(index);
if (!lp)
return NULL;
p = *lp;
if (!p) {
return NULL;
}
return p + (index & (L2_SIZE - 1));
return page_find_alloc(index, 0);
}
#if !defined(CONFIG_USER_ONLY)
static PhysPageDesc *phys_page_find_alloc(target_phys_addr_t index, int alloc)
{
void **lp, **p;
PhysPageDesc *pd;
void **lp;
int i;
p = (void **)l1_phys_map;
#if TARGET_PHYS_ADDR_SPACE_BITS > 32
/* Level 1. Always allocated. */
lp = l1_phys_map + ((index >> P_L1_SHIFT) & (P_L1_SIZE - 1));
#if TARGET_PHYS_ADDR_SPACE_BITS > (32 + L1_BITS)
#error unsupported TARGET_PHYS_ADDR_SPACE_BITS
#endif
lp = p + ((index >> (L1_BITS + L2_BITS)) & (L1_SIZE - 1));
p = *lp;
if (!p) {
/* allocate if not found */
if (!alloc)
return NULL;
p = qemu_vmalloc(sizeof(void *) * L1_SIZE);
memset(p, 0, sizeof(void *) * L1_SIZE);
*lp = p;
/* Level 2..N-1. */
for (i = P_L1_SHIFT / L2_BITS - 1; i > 0; i--) {
void **p = *lp;
if (p == NULL) {
if (!alloc) {
return NULL;
}
*lp = p = qemu_mallocz(sizeof(void *) * L2_SIZE);
}
lp = p + ((index >> (i * L2_BITS)) & (L2_SIZE - 1));
}
#endif
lp = p + ((index >> L2_BITS) & (L1_SIZE - 1));
pd = *lp;
if (!pd) {
if (pd == NULL) {
int i;
/* allocate if not found */
if (!alloc)
if (!alloc) {
return NULL;
pd = qemu_vmalloc(sizeof(PhysPageDesc) * L2_SIZE);
*lp = pd;
}
*lp = pd = qemu_malloc(sizeof(PhysPageDesc) * L2_SIZE);
for (i = 0; i < L2_SIZE; i++) {
pd[i].phys_offset = IO_MEM_UNASSIGNED;
pd[i].region_offset = (index + i) << TARGET_PAGE_BITS;
pd[i].phys_offset = IO_MEM_UNASSIGNED;
pd[i].region_offset = (index + i) << TARGET_PAGE_BITS;
}
}
return ((PhysPageDesc *)pd) + (index & (L2_SIZE - 1));
return pd + (index & (L2_SIZE - 1));
}
static inline PhysPageDesc *phys_page_find(target_phys_addr_t index)
@ -573,21 +625,34 @@ static inline void invalidate_page_bitmap(PageDesc *p)
p->code_write_count = 0;
}
/* set to NULL all the 'first_tb' fields in all PageDescs */
/* Set to NULL all the 'first_tb' fields in all PageDescs. */
static void page_flush_tb_1 (int level, void **lp)
{
int i;
if (*lp == NULL) {
return;
}
if (level == 0) {
PageDesc *pd = *lp;
for (i = 0; i < L2_BITS; ++i) {
pd[i].first_tb = NULL;
invalidate_page_bitmap(pd + i);
}
} else {
void **pp = *lp;
for (i = 0; i < L2_BITS; ++i) {
page_flush_tb_1 (level - 1, pp + i);
}
}
}
static void page_flush_tb(void)
{
int i, j;
PageDesc *p;
for(i = 0; i < L1_SIZE; i++) {
p = l1_map[i];
if (p) {
for(j = 0; j < L2_SIZE; j++) {
p->first_tb = NULL;
invalidate_page_bitmap(p);
p++;
}
}
int i;
for (i = 0; i < V_L1_SIZE; i++) {
page_flush_tb_1(V_L1_SHIFT / L2_BITS - 1, l1_map + i);
}
}
@ -1081,7 +1146,7 @@ static inline void tb_alloc_page(TranslationBlock *tb,
TranslationBlock *last_first_tb;
tb->page_addr[n] = page_addr;
p = page_find_alloc(page_addr >> TARGET_PAGE_BITS);
p = page_find_alloc(page_addr >> TARGET_PAGE_BITS, 1);
tb->page_next[n] = p->first_tb;
last_first_tb = p->first_tb;
p->first_tb = (TranslationBlock *)((long)tb | n);
@ -1641,50 +1706,37 @@ static int cpu_notify_migration_log(int enable)
return 0;
}
/*
 * Recursively visit one subtree of the physical page map and report every
 * assigned page to a memory client.
 *
 * client: receives client->set_memory(client, region_offset,
 *         TARGET_PAGE_SIZE, phys_offset) for each page whose phys_offset
 *         is not IO_MEM_UNASSIGNED.
 * level:  number of pointer levels still below *lp.  At level 0, *lp is an
 *         array of L2_SIZE PhysPageDesc; otherwise it is an array of
 *         L2_SIZE pointers to the next level down.
 * lp:     address of the table slot to descend from; an empty (NULL) slot
 *         is skipped.
 */
static void phys_page_for_each_1(CPUPhysMemoryClient *client,
                                 int level, void **lp)
{
    int i;

    if (*lp == NULL) {
        return;
    }

    if (level == 0) {
        PhysPageDesc *pd = *lp;

        /* Tables hold L2_SIZE (1 << L2_BITS) entries; iterate over all of
           them rather than only the first L2_BITS. */
        for (i = 0; i < L2_SIZE; ++i) {
            if (pd[i].phys_offset != IO_MEM_UNASSIGNED) {
                client->set_memory(client, pd[i].region_offset,
                                   TARGET_PAGE_SIZE, pd[i].phys_offset);
            }
        }
    } else {
        void **pp = *lp;

        for (i = 0; i < L2_SIZE; ++i) {
            phys_page_for_each_1(client, level - 1, pp + i);
        }
    }
}
/*
 * Report every assigned physical page to the given memory client by
 * walking each top-level slot of l1_phys_map in turn.
 */
static void phys_page_for_each(CPUPhysMemoryClient *client)
{
    int i;

    for (i = 0; i < P_L1_SIZE; ++i) {
        /* Descend from slot i; indexing with a constant here would visit
           the same subtree P_L1_SIZE times and skip all the others. */
        phys_page_for_each_1(client, P_L1_SHIFT / L2_BITS - 1,
                             l1_phys_map + i);
    }
}
void cpu_register_phys_memory_client(CPUPhysMemoryClient *client)
@ -2148,44 +2200,87 @@ void tlb_flush_page(CPUState *env, target_ulong addr)
* Walks guest process memory "regions" one by one
* and calls callback function 'fn' for each region.
*/
int walk_memory_regions(void *priv,
int (*fn)(void *, unsigned long, unsigned long, unsigned long))
struct walk_memory_regions_data
{
unsigned long start, end;
PageDesc *p = NULL;
int i, j, prot, prot1;
int rc = 0;
walk_memory_regions_fn fn;
void *priv;
unsigned long start;
int prot;
};
start = end = -1;
prot = 0;
for (i = 0; i <= L1_SIZE; i++) {
p = (i < L1_SIZE) ? l1_map[i] : NULL;
for (j = 0; j < L2_SIZE; j++) {
prot1 = (p == NULL) ? 0 : p[j].flags;
/*
* "region" is one continuous chunk of memory
* that has same protection flags set.
*/
if (prot1 != prot) {
end = (i << (32 - L1_BITS)) | (j << TARGET_PAGE_BITS);
if (start != -1) {
rc = (*fn)(priv, start, end, prot);
/* callback can stop iteration by returning != 0 */
if (rc != 0)
return (rc);
}
if (prot1 != 0)
start = end;
else
start = -1;
prot = prot1;
}
if (p == NULL)
break;
static int walk_memory_regions_end(struct walk_memory_regions_data *data,
unsigned long end, int new_prot)
{
if (data->start != -1ul) {
int rc = data->fn(data->priv, data->start, end, data->prot);
if (rc != 0) {
return rc;
}
}
return (rc);
data->start = (new_prot ? end : -1ul);
data->prot = new_prot;
return 0;
}
static int walk_memory_regions_1(struct walk_memory_regions_data *data,
unsigned long base, int level, void **lp)
{
unsigned long pa;
int i, rc;
if (*lp == NULL) {
return walk_memory_regions_end(data, base, 0);
}
if (level == 0) {
PageDesc *pd = *lp;
for (i = 0; i < L2_BITS; ++i) {
int prot = pd[i].flags;
pa = base | (i << TARGET_PAGE_BITS);
if (prot != data->prot) {
rc = walk_memory_regions_end(data, pa, prot);
if (rc != 0) {
return rc;
}
}
}
} else {
void **pp = *lp;
for (i = 0; i < L2_BITS; ++i) {
pa = base | (i << (TARGET_PAGE_BITS + L2_BITS * level));
rc = walk_memory_regions_1(data, pa, level - 1, pp + i);
if (rc != 0) {
return rc;
}
}
}
return 0;
}
/*
 * Walk the whole page map and call 'fn' once for each run of pages that
 * share the same flags.
 *
 * priv: opaque pointer passed through to every callback invocation.
 * fn:   region callback; a non-zero return value stops the walk.
 *
 * Returns 0 after a complete walk, otherwise the non-zero value that the
 * callback returned.
 */
int walk_memory_regions(void *priv, walk_memory_regions_fn fn)
{
    /* No region is open initially: start == -1ul, flags == 0. */
    struct walk_memory_regions_data state = {
        .fn = fn,
        .priv = priv,
        .start = -1ul,
        .prot = 0,
    };
    unsigned long slot;

    for (slot = 0; slot < V_L1_SIZE; slot++) {
        int status = walk_memory_regions_1(&state, slot << V_L1_SHIFT,
                                           V_L1_SHIFT / L2_BITS - 1,
                                           l1_map + slot);
        if (status != 0) {
            return status;
        }
    }

    /* Close a region still open at the end of the map; its end address is
       reported as 0. */
    return walk_memory_regions_end(&state, 0, 0);
}
static int dump_region(void *priv, unsigned long start,