From 10fe570fc16721d78afdba9689720094527c1ba3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 9 Aug 2011 13:02:50 -0400 Subject: [PATCH 01/14] Revert "xen/debug: WARN_ON when identity PFN has no _PAGE_IOMAP flag set." We don' use it anymore and there are more false positives. This reverts commit fc25151d9ac7d809239fe68de0a1490b504bb94a. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Kconfig | 8 -------- arch/x86/xen/mmu.c | 38 -------------------------------------- 2 files changed, 46 deletions(-) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 5cc821cb2e09..ae559fe91c25 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -49,11 +49,3 @@ config XEN_DEBUG_FS help Enable statistics output and various tuning options in debugfs. Enabling this option may incur a significant performance overhead. - -config XEN_DEBUG - bool "Enable Xen debug checks" - depends on XEN - default n - help - Enable various WARN_ON checks in the Xen MMU code. - Enabling this option WILL incur a significant performance overhead. diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f987bde77c49..3c9aecd09ed1 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -495,41 +495,6 @@ static pte_t xen_make_pte(pteval_t pte) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); -#ifdef CONFIG_XEN_DEBUG -pte_t xen_make_pte_debug(pteval_t pte) -{ - phys_addr_t addr = (pte & PTE_PFN_MASK); - phys_addr_t other_addr; - bool io_page = false; - pte_t _pte; - - if (pte & _PAGE_IOMAP) - io_page = true; - - _pte = xen_make_pte(pte); - - if (!addr) - return _pte; - - if (io_page && - (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { - other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT; - WARN_ONCE(addr != other_addr, - "0x%lx is using VM_IO, but it is 0x%lx!\n", - (unsigned long)addr, (unsigned long)other_addr); - } else { - pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP; - other_addr = (_pte.pte & PTE_PFN_MASK); - WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set), - "0x%lx is missing VM_IO (and wasn't fixed)!\n", - (unsigned long)addr); - } - - return _pte; -} -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug); -#endif - static pgd_t xen_make_pgd(pgdval_t pgd) { pgd = pte_pfn_to_mfn(pgd); @@ -1988,9 +1953,6 @@ void __init xen_ident_map_ISA(void) static void __init xen_post_allocator_init(void) { -#ifdef CONFIG_XEN_DEBUG - pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); -#endif pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; From a867db10e89e12a3d97dedafdd411aa1527a6540 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 23 Sep 2011 16:32:47 -0400 Subject: [PATCH 02/14] xen/p2m: Make debug/xen/mmu/p2m visible again. We dropped a lot of the MMU debugfs in favour of using tracing API - but there is one which just provides mostly static information that was made invisible by this change. Bring it back. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 3 --- arch/x86/xen/mmu.c | 14 ------------- arch/x86/xen/p2m.c | 35 ++++++++++++++++++++++++++++++--- 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 64a619d47d34..bc12c12299c3 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -53,9 +53,6 @@ extern int m2p_remove_override(struct page *page, bool clear_pte); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); -#ifdef CONFIG_XEN_DEBUG_FS -extern int p2m_dump_show(struct seq_file *m, void *v); -#endif static inline unsigned long pfn_to_mfn(unsigned long pfn) { unsigned long mfn; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3c9aecd09ed1..4df0444b2cee 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2362,17 +2362,3 @@ out: return err; } EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); - -#ifdef CONFIG_XEN_DEBUG_FS -static int p2m_dump_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, p2m_dump_show, NULL); -} - -static const struct file_operations p2m_dump_fops = { - .open = p2m_dump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 58efeb9d5440..cc2f8dcf8dda 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -782,8 +782,9 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) EXPORT_SYMBOL_GPL(m2p_find_override_pfn); #ifdef CONFIG_XEN_DEBUG_FS - -int p2m_dump_show(struct seq_file *m, void *v) +#include +#include "debugfs.h" +static int p2m_dump_show(struct seq_file *m, void *v) { static const char * const level_name[] = { "top", "middle", "entry", "abnormal" }; @@ -856,4 +857,32 @@ int p2m_dump_show(struct seq_file *m, void *v) #undef TYPE_PFN #undef TYPE_UNKNOWN } -#endif + +static int p2m_dump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, p2m_dump_show, NULL); +} + +static const struct file_operations p2m_dump_fops = { + .open = p2m_dump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *d_mmu_debug; + +static int __init xen_p2m_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_mmu_debug = debugfs_create_dir("mmu", d_xen); + + debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops); + return 0; +} +fs_initcall(xen_p2m_debugfs); +#endif /* CONFIG_XEN_DEBUG_FS */ From 0f4b49eaf25e661fbe63a5370b7781166b34d616 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 23 Sep 2011 17:36:07 -0400 Subject: [PATCH 03/14] xen/p2m: Use SetPagePrivate and its friends for M2P overrides. We use the page->private field and hence should use the proper macros and set proper bits. Also WARN_ON in case somebody tries to overwrite our data. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index cc2f8dcf8dda..6e56b65edafb 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -692,8 +692,9 @@ int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte) "m2p_add_override: pfn %lx not mapped", pfn)) return -EINVAL; } - - page->private = mfn; + WARN_ON(PagePrivate(page)); + SetPagePrivate(page); + set_page_private(page, mfn); page->index = pfn_to_mfn(pfn); if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) @@ -736,7 +737,8 @@ int m2p_remove_override(struct page *page, bool clear_pte) list_del(&page->lru); spin_unlock_irqrestore(&m2p_override_lock, flags); set_phys_to_machine(pfn, page->index); - + WARN_ON(!PagePrivate(page)); + ClearPagePrivate(page); if (clear_pte && !PageHighMem(page)) set_pte_at(&init_mm, address, ptep, pfn_pte(pfn, PAGE_KERNEL)); @@ -758,7 +760,7 @@ struct page *m2p_find_override(unsigned long mfn) spin_lock_irqsave(&m2p_override_lock, flags); list_for_each_entry(p, bucket, lru) { - if (p->private == mfn) { + if (page_private(p) == mfn) { ret = p; break; } From 693394b8c3dcee1a3baa52e30fdc3323d88cd579 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 29 Sep 2011 11:57:55 +0100 Subject: [PATCH 04/14] xen: add an "highmem" parameter to alloc_xenballooned_pages Add an highmem parameter to alloc_xenballooned_pages, to allow callers to request lowmem or highmem pages. Fix the code style of free_xenballooned_pages' prototype. Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/balloon.c | 12 ++++++++---- drivers/xen/gntdev.c | 2 +- include/xen/balloon.h | 5 +++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 5dfd8f8ff07f..cd8b4704ef4e 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -501,20 +501,24 @@ EXPORT_SYMBOL_GPL(balloon_set_new_target); * alloc_xenballooned_pages - get pages that have been ballooned out * @nr_pages: Number of pages to get * @pages: pages returned + * @highmem: highmem or lowmem pages * @return 0 on success, error otherwise */ -int alloc_xenballooned_pages(int nr_pages, struct page** pages) +int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem) { int pgno = 0; struct page* page; mutex_lock(&balloon_mutex); while (pgno < nr_pages) { - page = balloon_retrieve(true); - if (page) { + page = balloon_retrieve(highmem); + if (page && PageHighMem(page) == highmem) { pages[pgno++] = page; } else { enum bp_state st; - st = decrease_reservation(nr_pages - pgno, GFP_HIGHUSER); + if (page) + balloon_append(page); + st = decrease_reservation(nr_pages - pgno, + highmem ? GFP_HIGHUSER : GFP_USER); if (st != BP_DONE) goto out_undo; } diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index f914b26cf0c2..7b9b1d1b75a5 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -123,7 +123,7 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) NULL == add->pages) goto err; - if (alloc_xenballooned_pages(count, add->pages)) + if (alloc_xenballooned_pages(count, add->pages, false /* lowmem */)) goto err; for (i = 0; i < count; i++) { diff --git a/include/xen/balloon.h b/include/xen/balloon.h index 76f7538bb339..d29c153705bc 100644 --- a/include/xen/balloon.h +++ b/include/xen/balloon.h @@ -25,8 +25,9 @@ extern struct balloon_stats balloon_stats; void balloon_set_new_target(unsigned long target); -int alloc_xenballooned_pages(int nr_pages, struct page** pages); -void free_xenballooned_pages(int nr_pages, struct page** pages); +int alloc_xenballooned_pages(int nr_pages, struct page **pages, + bool highmem); +void free_xenballooned_pages(int nr_pages, struct page **pages); struct sys_device; #ifdef CONFIG_XEN_SELFBALLOONING From 0930bba674e248b921ea659b036ff02564e5a5f4 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 29 Sep 2011 11:57:56 +0100 Subject: [PATCH 05/14] xen: modify kernel mappings corresponding to granted pages If we want to use granted pages for AIO, changing the mappings of a user vma and the corresponding p2m is not enough, we also need to update the kernel mappings accordingly. Currently this is only needed for pages that are created for user usages through /dev/xen/gntdev. As in, pages that have been in use by the kernel and use the P2M will not need this special mapping. However there are no guarantees that in the future the kernel won't start accessing pages through the 1:1 even for internal usage. In order to avoid the complexity of dealing with highmem, we allocated the pages lowmem. We issue a HYPERVISOR_grant_table_op right away in m2p_add_override and we remove the mappings using another HYPERVISOR_grant_table_op in m2p_remove_override. Considering that m2p_add_override and m2p_remove_override are called once per page we use multicalls and hypercall batching. Use the kmap_op pointer directly as argument to do the mapping as it is guaranteed to be present up until the unmapping is done. Before issuing any unmapping multicalls, we need to make sure that the mapping has already being done, because we need the kmap->handle to be set correctly. Signed-off-by: Stefano Stabellini [v1: Removed GRANT_FRAME_BIT usage] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 3 +- arch/x86/xen/p2m.c | 76 +++++++++++++++++++++++++---- drivers/block/xen-blkback/blkback.c | 2 +- drivers/xen/gntdev.c | 32 +++++++++++- drivers/xen/grant-table.c | 6 +-- include/xen/grant_table.h | 1 + 6 files changed, 104 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index bc12c12299c3..3138d33b8949 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -12,6 +12,7 @@ #include #include +#include #include /* Xen machine address */ @@ -48,7 +49,7 @@ extern unsigned long set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e); extern int m2p_add_override(unsigned long mfn, struct page *page, - bool clear_pte); + struct gnttab_map_grant_ref *kmap_op); extern int m2p_remove_override(struct page *page, bool clear_pte); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 6e56b65edafb..a8ee9a45c359 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -161,7 +161,9 @@ #include #include #include +#include +#include "multicalls.h" #include "xen-ops.h" static void __init m2p_override_init(void); @@ -676,7 +678,8 @@ static unsigned long mfn_hash(unsigned long mfn) } /* Add an MFN override for a particular page */ -int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte) +int m2p_add_override(unsigned long mfn, struct page *page, + struct gnttab_map_grant_ref *kmap_op) { unsigned long flags; unsigned long pfn; @@ -700,9 +703,20 @@ int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte) if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) return -ENOMEM; - if (clear_pte && !PageHighMem(page)) - /* Just zap old mapping for now */ - pte_clear(&init_mm, address, ptep); + if (kmap_op != NULL) { + if (!PageHighMem(page)) { + struct multicall_space mcs = + xen_mc_entry(sizeof(*kmap_op)); + + MULTI_grant_table_op(mcs.mc, + GNTTABOP_map_grant_ref, kmap_op, 1); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + } + /* let's use dev_bus_addr to record the old mfn instead */ + kmap_op->dev_bus_addr = page->index; + page->index = (unsigned long) kmap_op; + } spin_lock_irqsave(&m2p_override_lock, flags); list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); spin_unlock_irqrestore(&m2p_override_lock, flags); @@ -736,14 +750,56 @@ int m2p_remove_override(struct page *page, bool clear_pte) spin_lock_irqsave(&m2p_override_lock, flags); list_del(&page->lru); spin_unlock_irqrestore(&m2p_override_lock, flags); - set_phys_to_machine(pfn, page->index); WARN_ON(!PagePrivate(page)); ClearPagePrivate(page); - if (clear_pte && !PageHighMem(page)) - set_pte_at(&init_mm, address, ptep, - pfn_pte(pfn, PAGE_KERNEL)); - /* No tlb flush necessary because the caller already - * left the pte unmapped. */ + + if (clear_pte) { + struct gnttab_map_grant_ref *map_op = + (struct gnttab_map_grant_ref *) page->index; + set_phys_to_machine(pfn, map_op->dev_bus_addr); + if (!PageHighMem(page)) { + struct multicall_space mcs; + struct gnttab_unmap_grant_ref *unmap_op; + + /* + * It might be that we queued all the m2p grant table + * hypercalls in a multicall, then m2p_remove_override + * get called before the multicall has actually been + * issued. In this case handle is going to -1 because + * it hasn't been modified yet. + */ + if (map_op->handle == -1) + xen_mc_flush(); + /* + * Now if map_op->handle is negative it means that the + * hypercall actually returned an error. + */ + if (map_op->handle == GNTST_general_error) { + printk(KERN_WARNING "m2p_remove_override: " + "pfn %lx mfn %lx, failed to modify kernel mappings", + pfn, mfn); + return -1; + } + + mcs = xen_mc_entry( + sizeof(struct gnttab_unmap_grant_ref)); + unmap_op = mcs.args; + unmap_op->host_addr = map_op->host_addr; + unmap_op->handle = map_op->handle; + unmap_op->dev_bus_addr = 0; + + MULTI_grant_table_op(mcs.mc, + GNTTABOP_unmap_grant_ref, unmap_op, 1); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + set_pte_at(&init_mm, address, ptep, + pfn_pte(pfn, PAGE_KERNEL)); + __flush_tlb_single(address); + map_op->host_addr = 0; + } + } else + set_phys_to_machine(pfn, page->index); return 0; } diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 2330a9ad5e95..1540792b1e54 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -396,7 +396,7 @@ static int xen_blkbk_map(struct blkif_request *req, continue; ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr), - blkbk->pending_page(pending_req, i), false); + blkbk->pending_page(pending_req, i), NULL); if (ret) { pr_alert(DRV_PFX "Failed to install M2P override for %lx (ret: %d)\n", (unsigned long)map[i].dev_bus_addr, ret); diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 7b9b1d1b75a5..3e3603f35242 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -83,6 +83,7 @@ struct grant_map { struct ioctl_gntdev_grant_ref *grants; struct gnttab_map_grant_ref *map_ops; struct gnttab_unmap_grant_ref *unmap_ops; + struct gnttab_map_grant_ref *kmap_ops; struct page **pages; }; @@ -116,10 +117,12 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL); add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL); add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL); + add->kmap_ops = kzalloc(sizeof(add->kmap_ops[0]) * count, GFP_KERNEL); add->pages = kzalloc(sizeof(add->pages[0]) * count, GFP_KERNEL); if (NULL == add->grants || NULL == add->map_ops || NULL == add->unmap_ops || + NULL == add->kmap_ops || NULL == add->pages) goto err; @@ -129,6 +132,7 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) for (i = 0; i < count; i++) { add->map_ops[i].handle = -1; add->unmap_ops[i].handle = -1; + add->kmap_ops[i].handle = -1; } add->index = 0; @@ -142,6 +146,7 @@ err: kfree(add->grants); kfree(add->map_ops); kfree(add->unmap_ops); + kfree(add->kmap_ops); kfree(add); return NULL; } @@ -243,10 +248,35 @@ static int map_grant_pages(struct grant_map *map) gnttab_set_unmap_op(&map->unmap_ops[i], addr, map->flags, -1 /* handle */); } + } else { + /* + * Setup the map_ops corresponding to the pte entries pointing + * to the kernel linear addresses of the struct pages. + * These ptes are completely different from the user ptes dealt + * with find_grant_ptes. + */ + for (i = 0; i < map->count; i++) { + unsigned level; + unsigned long address = (unsigned long) + pfn_to_kaddr(page_to_pfn(map->pages[i])); + pte_t *ptep; + u64 pte_maddr = 0; + BUG_ON(PageHighMem(map->pages[i])); + + ptep = lookup_address(address, &level); + pte_maddr = arbitrary_virt_to_machine(ptep).maddr; + gnttab_set_map_op(&map->kmap_ops[i], pte_maddr, + map->flags | + GNTMAP_host_map | + GNTMAP_contains_pte, + map->grants[i].ref, + map->grants[i].domid); + } } pr_debug("map %d+%d\n", map->index, map->count); - err = gnttab_map_refs(map->map_ops, map->pages, map->count); + err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL, + map->pages, map->count); if (err) return err; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 4f44b347b24a..8c71ab801756 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -448,7 +448,8 @@ unsigned int gnttab_max_grant_frames(void) EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, - struct page **pages, unsigned int count) + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) { int i, ret; pte_t *pte; @@ -488,8 +489,7 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, */ return -EOPNOTSUPP; } - ret = m2p_add_override(mfn, pages[i], - map_ops[i].flags & GNTMAP_contains_pte); + ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]); if (ret) return ret; } diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index b1fab6b5b3ef..6b99bfbd785d 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -156,6 +156,7 @@ unsigned int gnttab_max_grant_frames(void); #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr)) int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, + struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, struct page **pages, unsigned int count); From 1f1503ba096d3a394d1454dac77467092ca996e6 Mon Sep 17 00:00:00 2001 From: Daniel De Graaf Date: Tue, 11 Oct 2011 15:16:06 -0400 Subject: [PATCH 06/14] xen/gntdev: Fix sleep-inside-spinlock BUG: sleeping function called from invalid context at /local/scratch/dariof/linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3256, name: qemu-dm 1 lock held by qemu-dm/3256: #0: (&(&priv->lock)->rlock){......}, at: [] gntdev_ioctl+0x2bd/0x4d5 Pid: 3256, comm: qemu-dm Tainted: G W 3.1.0-rc8+ #5 Call Trace: [] __might_sleep+0x131/0x135 [] mutex_lock_nested+0x25/0x45 [] free_xenballooned_pages+0x20/0xb1 [] gntdev_put_map+0xa8/0xdb [] ? _raw_spin_lock+0x71/0x7a [] ? gntdev_ioctl+0x2bd/0x4d5 [] gntdev_ioctl+0x31f/0x4d5 [] ? check_events+0x12/0x20 [] do_vfs_ioctl+0x488/0x4d7 [] ? xen_restore_fl_direct_reloc+0x4/0x4 [] ? lock_release+0x21c/0x229 [] ? rcu_read_unlock+0x21/0x32 [] sys_ioctl+0x47/0x6a [] system_call_fastpath+0x16/0x1b gntdev_put_map tries to acquire a mutex when freeing pages back to the xenballoon pool, so it cannot be called with a spinlock held. In gntdev_release, the spinlock is not needed as we are freeing the structure later; in the ioctl, only the list manipulation needs to be under the lock. Reported-and-Tested-By: Dario Faggioli Signed-off-by: Daniel De Graaf Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/gntdev.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 3e3603f35242..880798aae2f2 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -492,13 +492,11 @@ static int gntdev_release(struct inode *inode, struct file *flip) pr_debug("priv %p\n", priv); - spin_lock(&priv->lock); while (!list_empty(&priv->maps)) { map = list_entry(priv->maps.next, struct grant_map, next); list_del(&map->next); gntdev_put_map(map); } - spin_unlock(&priv->lock); if (use_ptemod) mmu_notifier_unregister(&priv->mn, priv->mm); @@ -562,10 +560,11 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); if (map) { list_del(&map->next); - gntdev_put_map(map); err = 0; } spin_unlock(&priv->lock); + if (map) + gntdev_put_map(map); return err; } From 38a1ed4f039db32b418007ac365076cf53647ebd Mon Sep 17 00:00:00 2001 From: Dan Magenheimer Date: Tue, 27 Sep 2011 08:47:58 -0600 Subject: [PATCH 07/14] xen: Fix selfballooning and ensure it doesn't go too far The balloon driver's "current_pages" is very different from totalram_pages. Self-ballooning needs to be driven by the latter. Also, Committed_AS doesn't account for pages used by the kernel so: 1) Add totalreserve_pages to Committed_AS for the normal target. 2) Enforce a floor for when there are little or no user-space threads using memory (e.g. single-user mode) to avoid OOMs. The floor function includes a "min_usable_mb" tuneable in case we discover later that the floor function is still too aggressive in some workloads, though likely it will not be needed. Changes since version 4: - change floor calculation so that it is not as aggressive; this version uses a piecewise linear function similar to minimum_target in the 2.6.18 balloon driver, but modified to add to totalreserve_pages instead of subtract from max_pfn, the 2.6.18 version causes OOMs on recent kernels because the kernel has expanded over time - change safety_margin to min_usable_mb and comment on its use - since committed_as does NOT include kernel space (and other reserved pages), totalreserve_pages is now added to committed_as. The result is less aggressive self-ballooning, but theoretically more appropriate. Changes since version 3: - missing include causes compile problem when CONFIG_FRONTSWAP is disabled - add comments after includes Changes since version 2: - missing include causes compile problem only on 32-bit Changes since version 1: - tuneable safety margin added [v5: avi.miller@oracle.com: still too aggressive, seeing some OOMs] [v4: konrad.wilk@oracle.com: fix compile when CONFIG_FRONTSWAP is disabled] [v3: guru.anbalagane@oracle.com: fix 32-bit compile] [v2: konrad.wilk@oracle.com: make safety margin tuneable] Signed-off-by: Dan Magenheimer [v1: Altered description and added an extra include] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/xen-selfballoon.c | 67 ++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 1b4afd81f872..ff3f2e423af4 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -68,6 +68,8 @@ */ #include +#include +#include #include #include #include @@ -92,6 +94,15 @@ static unsigned int selfballoon_uphysteresis __read_mostly = 1; /* In HZ, controls frequency of worker invocation. */ static unsigned int selfballoon_interval __read_mostly = 5; +/* + * Minimum usable RAM in MB for selfballooning target for balloon. + * If non-zero, it is added to totalreserve_pages and self-ballooning + * will not balloon below the sum. If zero, a piecewise linear function + * is calculated as a minimum and added to totalreserve_pages. Note that + * setting this value indiscriminately may cause OOMs and crashes. + */ +static unsigned int selfballoon_min_usable_mb; + static void selfballoon_process(struct work_struct *work); static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); @@ -188,20 +199,23 @@ static int __init xen_selfballooning_setup(char *s) __setup("selfballooning", xen_selfballooning_setup); #endif /* CONFIG_FRONTSWAP */ +#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) + /* * Use current balloon size, the goal (vm_committed_as), and hysteresis * parameters to set a new target balloon size */ static void selfballoon_process(struct work_struct *work) { - unsigned long cur_pages, goal_pages, tgt_pages; + unsigned long cur_pages, goal_pages, tgt_pages, floor_pages; + unsigned long useful_pages; bool reset_timer = false; if (xen_selfballooning_enabled) { - cur_pages = balloon_stats.current_pages; + cur_pages = totalram_pages; tgt_pages = cur_pages; /* default is no change */ goal_pages = percpu_counter_read_positive(&vm_committed_as) + - balloon_stats.current_pages - totalram_pages; + totalreserve_pages; #ifdef CONFIG_FRONTSWAP /* allow space for frontswap pages to be repatriated */ if (frontswap_selfshrinking && frontswap_enabled) @@ -216,7 +230,26 @@ static void selfballoon_process(struct work_struct *work) ((goal_pages - cur_pages) / selfballoon_uphysteresis); /* else if cur_pages == goal_pages, no change */ - balloon_set_new_target(tgt_pages); + useful_pages = max_pfn - totalreserve_pages; + if (selfballoon_min_usable_mb != 0) + floor_pages = totalreserve_pages + + MB2PAGES(selfballoon_min_usable_mb); + /* piecewise linear function ending in ~3% slope */ + else if (useful_pages < MB2PAGES(16)) + floor_pages = max_pfn; /* not worth ballooning */ + else if (useful_pages < MB2PAGES(64)) + floor_pages = totalreserve_pages + MB2PAGES(16) + + ((useful_pages - MB2PAGES(16)) >> 1); + else if (useful_pages < MB2PAGES(512)) + floor_pages = totalreserve_pages + MB2PAGES(40) + + ((useful_pages - MB2PAGES(40)) >> 3); + else /* useful_pages >= MB2PAGES(512) */ + floor_pages = totalreserve_pages + MB2PAGES(99) + + ((useful_pages - MB2PAGES(99)) >> 5); + if (tgt_pages < floor_pages) + tgt_pages = floor_pages; + balloon_set_new_target(tgt_pages + + balloon_stats.current_pages - totalram_pages); reset_timer = true; } #ifdef CONFIG_FRONTSWAP @@ -339,6 +372,31 @@ static ssize_t store_selfballoon_uphys(struct sys_device *dev, static SYSDEV_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR, show_selfballoon_uphys, store_selfballoon_uphys); +SELFBALLOON_SHOW(selfballoon_min_usable_mb, "%d\n", + selfballoon_min_usable_mb); + +static ssize_t store_selfballoon_min_usable_mb(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = strict_strtoul(buf, 10, &val); + if (err || val == 0) + return -EINVAL; + selfballoon_min_usable_mb = val; + return count; +} + +static SYSDEV_ATTR(selfballoon_min_usable_mb, S_IRUGO | S_IWUSR, + show_selfballoon_min_usable_mb, + store_selfballoon_min_usable_mb); + + #ifdef CONFIG_FRONTSWAP SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking); @@ -420,6 +478,7 @@ static struct attribute *selfballoon_attrs[] = { &attr_selfballoon_interval.attr, &attr_selfballoon_downhysteresis.attr, &attr_selfballoon_uphysteresis.attr, + &attr_selfballoon_min_usable_mb.attr, #ifdef CONFIG_FRONTSWAP &attr_frontswap_selfshrinking.attr, &attr_frontswap_hysteresis.attr, From 9d093e2958baf76154d1008339f594f798a52790 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 29 Sep 2011 13:31:21 -0400 Subject: [PATCH 08/14] xen/events: BUG() when we can't allocate our event->irq array. In case we can't allocate we are doomed. We should BUG_ON instead of trying to dereference it later on. Acked-by: Ian Campbell [v1: Use BUG_ON instead of BUG] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 7523719bf8a4..6b002cca1f5a 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -1670,6 +1670,7 @@ void __init xen_init_IRQ(void) evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), GFP_KERNEL); + BUG_ON(!evtchn_to_irq); for (i = 0; i < NR_EVENT_CHANNELS; i++) evtchn_to_irq[i] = -1; From 9bb9efe4bab8a877cdde5c6bfbfa202645517571 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 29 Sep 2011 13:13:30 -0400 Subject: [PATCH 09/14] xen/events: Don't check the info for NULL as it is already done. The list operation checks whether the 'info' structure that is retrieved from the list is NULL (otherwise it would not been able to retrieve it). This check is not neccessary. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 6b002cca1f5a..503614f2c122 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -779,7 +779,7 @@ int xen_irq_from_pirq(unsigned pirq) mutex_lock(&irq_mapping_update_lock); list_for_each_entry(info, &xen_irq_list_head, list) { - if (info == NULL || info->type != IRQT_PIRQ) + if (info->type != IRQT_PIRQ) continue; irq = info->irq; if (info->u.pirq.pirq == pirq) From e6599225db36bbdc991d1cc8fbfcacb24f86cdb5 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 29 Sep 2011 13:26:45 -0400 Subject: [PATCH 10/14] xen/irq: If we fail during msi_capability_init return proper error code. There are three different modes: PV, HVM, and initial domain 0. In all the cases we would return -1 for failure instead of a proper error code. Fix this by propagating the error code from the generic IRQ code. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 10 +++++++--- drivers/xen/events.c | 7 ++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 1017c7bee388..11a9301d52d4 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -175,8 +175,10 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) "pcifront-msi-x" : "pcifront-msi", DOMID_SELF); - if (irq < 0) + if (irq < 0) { + ret = irq; goto free; + } i++; } kfree(v); @@ -221,8 +223,10 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (msg.data != XEN_PIRQ_MSI_DATA || xen_irq_from_pirq(pirq) < 0) { pirq = xen_allocate_pirq_msi(dev, msidesc); - if (pirq < 0) + if (pirq < 0) { + irq = -ENODEV; goto error; + } xen_msi_compose_msg(dev, pirq, &msg); __write_msi_msg(msidesc, &msg); dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq); @@ -244,7 +248,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) error: dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n"); - return -ENODEV; + return irq; } #ifdef CONFIG_XEN_DOM0 diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 503614f2c122..212a5c871bf4 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -432,7 +432,8 @@ static int __must_check xen_allocate_irq_dynamic(void) irq = irq_alloc_desc_from(first, -1); - xen_irq_init(irq); + if (irq >= 0) + xen_irq_init(irq); return irq; } @@ -713,7 +714,7 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, mutex_lock(&irq_mapping_update_lock); irq = xen_allocate_irq_dynamic(); - if (irq == -1) + if (irq < 0) goto out; irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, @@ -729,7 +730,7 @@ out: error_irq: mutex_unlock(&irq_mapping_update_lock); xen_free_irq(irq); - return -1; + return ret; } #endif From d98b15db376b9cc35f74fd2bd432b9fc287a5999 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 29 Sep 2011 13:16:17 -0400 Subject: [PATCH 11/14] xen/xenbus: Remove the unnecessary check. .. we check whether 'xdev' is NULL - but there is no need for it as the 'dev' check is done before. The 'dev' is embedded in the 'xdev' so having xdev != NULL with dev being being checked is not going to happen. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/xenbus/xenbus_probe_backend.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 60adf919d78d..32417b5064fd 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -104,8 +104,6 @@ static int xenbus_uevent_backend(struct device *dev, xdev = to_xenbus_device(dev); bus = container_of(xdev->dev.bus, struct xen_bus_type, bus); - if (xdev == NULL) - return -ENODEV; if (add_uevent_var(env, "MODALIAS=xen-backend:%s", xdev->devicetype)) return -ENOMEM; From 5e287830136a8edb76e9f9c432b264d99833172f Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 29 Sep 2011 13:06:42 -0400 Subject: [PATCH 12/14] xen/enlighten: Fix compile warnings and set cx to known value. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We get: linux/arch/x86/xen/enlighten.c: In function ‘xen_start_kernel’: linux/arch/x86/xen/enlighten.c:226: warning: ‘cx’ may be used uninitialized in this function linux/arch/x86/xen/enlighten.c:240: note: ‘cx’ was declared here and the cx is really not set but passed in the xen_cpuid instruction which masks the value with returned masked_ecx from cpuid. This can potentially lead to invalid data being stored in cx. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2d69617950f7..da8afd576a6b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -251,6 +251,7 @@ static void __init xen_init_cpuid_mask(void) ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ (1 << X86_FEATURE_ACPI)); /* disable ACPI */ ax = 1; + cx = 0; xen_cpuid(&ax, &bx, &cx, &dx); xsave_mask = From 8404877ee1cfdbc872e153fd89022f9e47f6f5a3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 29 Sep 2011 13:09:34 -0400 Subject: [PATCH 13/14] xen/p2m/debugfs: Fix potential pointer exception. We could be referencing the last + 1 element of level_name[] array which would cause a pointer exception, because of the initial setup of lvl=4. [v1: No need to do this for type_name, pointed out by Ian Campbell] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 58efeb9d5440..2e3bf7a0732b 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -786,7 +786,7 @@ EXPORT_SYMBOL_GPL(m2p_find_override_pfn); int p2m_dump_show(struct seq_file *m, void *v) { static const char * const level_name[] = { "top", "middle", - "entry", "abnormal" }; + "entry", "abnormal", "error"}; static const char * const type_name[] = { "identity", "missing", "pfn", "abnormal"}; #define TYPE_IDENTITY 0 From a491dbef56f2aba42fb292067d4652d246627738 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 3 Oct 2011 12:35:26 -0400 Subject: [PATCH 14/14] xen/p2m/debugfs: Make type_name more obvious. Per Ian Campbell suggestion to defend against future breakage in case we expand the P2M values, incorporate the defines in the string array. Suggested-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 2e3bf7a0732b..795e003517e1 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -787,12 +787,15 @@ int p2m_dump_show(struct seq_file *m, void *v) { static const char * const level_name[] = { "top", "middle", "entry", "abnormal", "error"}; - static const char * const type_name[] = { "identity", "missing", - "pfn", "abnormal"}; #define TYPE_IDENTITY 0 #define TYPE_MISSING 1 #define TYPE_PFN 2 #define TYPE_UNKNOWN 3 + static const char * const type_name[] = { + [TYPE_IDENTITY] = "identity", + [TYPE_MISSING] = "missing", + [TYPE_PFN] = "pfn", + [TYPE_UNKNOWN] = "abnormal"}; unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0; unsigned int uninitialized_var(prev_level); unsigned int uninitialized_var(prev_type);