From fd8b79511349efd1f0decea920f61b93acb34a75 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Tue, 7 Oct 2014 17:00:07 -0400 Subject: [PATCH 1/7] xen/balloon: Don't continue ballooning when BP_ECANCELED is encountered Commit 3dcf63677d4e ("xen/balloon: cancel ballooning if adding new memory failed") makes reserve_additional_memory() return BP_ECANCELED when an error is encountered. This error, however, is ignored by the caller (balloon_process()) since it is overwritten by subsequent call to update_schedule(). This results in continuous attempts to add more memory, all of which are likely to fail again. We should stop trying to schedule next iteration of ballooning when the current one has failed. Signed-off-by: Boris Ostrovsky Reviewed-by: Daniel Kiper Signed-off-by: David Vrabel --- drivers/xen/balloon.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 1e0a317d3dcd..3860d02729dc 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -167,6 +167,9 @@ static struct page *balloon_next_page(struct page *page) static enum bp_state update_schedule(enum bp_state state) { + if (state == BP_ECANCELED) + return BP_ECANCELED; + if (state == BP_DONE) { balloon_stats.schedule_delay = 1; balloon_stats.retry_count = 1; From 239af7c7132a617f9dcd05da1dc92b96bc6d0645 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 14 Oct 2014 11:00:18 +0200 Subject: [PATCH 2/7] x86/xen: avoid writing to freed memory after race in p2m handling In case a race was detected during allocation of a new p2m tree element in alloc_p2m() the new allocated mid_mfn page is freed without updating the pointer to the found value in the tree. This will result in overwriting the just freed page with the mfn of the p2m leaf. Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/p2m.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 9f5983b01ed9..4534320e66e4 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -566,6 +566,7 @@ static bool alloc_p2m(unsigned long pfn) /* Separately check the mid mfn level */ unsigned long missing_mfn; unsigned long mid_mfn_mfn; + unsigned long old_mfn; mid_mfn = alloc_p2m_page(); if (!mid_mfn) @@ -575,10 +576,13 @@ static bool alloc_p2m(unsigned long pfn) missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); mid_mfn_mfn = virt_to_mfn(mid_mfn); - if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) + old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn); + if (old_mfn != missing_mfn) { free_p2m_page(mid_mfn); - else + mid_mfn = mfn_to_virt(old_mfn); + } else { p2m_top_mfn_p[topidx] = mid_mfn; + } } if (p2m_top[topidx][mididx] == p2m_identity || From 2c185687ab016954557aac80074f5d7f7f5d275c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 14 Oct 2014 13:33:46 +0200 Subject: [PATCH 3/7] x86/xen: delay construction of mfn_list_list The 3 level p2m tree for the Xen tools is constructed very early at boot by calling xen_build_mfn_list_list(). Memory needed for this tree is allocated via extend_brk(). As this tree (other than the kernel internal p2m tree) is only needed for domain save/restore, live migration and crash dump analysis it doesn't matter whether it is constructed very early or just some milliseconds later when memory allocation is possible by other means. This patch moves the call of xen_build_mfn_list_list() just after calling xen_pagetable_p2m_copy() simplifying this function, too, as it doesn't have to bother with two parallel trees now. The same applies for some other internal functions. While simplifying code, make early_can_reuse_p2m_middle() static and drop the unused second parameter. p2m_mid_identity_mfn can be removed as well, it isn't used either. Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/enlighten.c | 3 -- arch/x86/xen/mmu.c | 5 +++- arch/x86/xen/p2m.c | 65 +++++++++------------------------------- 3 files changed, 18 insertions(+), 55 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1a3f0445432a..fac5e4f9607c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1636,9 +1636,6 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); - /* Allocate and initialize top and mid mfn levels for p2m structure */ - xen_build_mfn_list_list(); - /* keep using Xen gdt for now; no urgent need to change it */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f62af7647ec9..a8a1a3d08d4d 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1217,10 +1217,13 @@ static void __init xen_pagetable_p2m_copy(void) static void __init xen_pagetable_init(void) { paging_init(); - xen_setup_shared_info(); #ifdef CONFIG_X86_64 xen_pagetable_p2m_copy(); #endif + /* Allocate and initialize top and mid mfn levels for p2m structure */ + xen_build_mfn_list_list(); + + xen_setup_shared_info(); xen_post_allocator_init(); } static void xen_write_cr2(unsigned long cr2) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 4534320e66e4..d1b3da2960ab 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -163,6 +163,7 @@ #include #include #include +#include #include #include @@ -181,21 +182,20 @@ static void __init m2p_override_init(void); unsigned long xen_max_p2m_pfn __read_mostly; +static unsigned long *p2m_mid_missing_mfn; +static unsigned long *p2m_top_mfn; +static unsigned long **p2m_top_mfn_p; + /* Placeholders for holes in the address space */ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE); RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); -RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); /* For each I/O range remapped we may lose up to two leaf pages for the boundary * violations and three mid pages to cover up to 3GB. With @@ -272,11 +272,11 @@ static void p2m_init(unsigned long *p2m) * Build the parallel p2m_top_mfn and p2m_mid_mfn structures * * This is called both at boot time, and after resuming from suspend: - * - At boot time we're called very early, and must use extend_brk() + * - At boot time we're called rather early, and must use alloc_bootmem*() * to allocate memory. * * - After resume we're called from within stop_machine, but the mfn - * tree should alreay be completely allocated. + * tree should already be completely allocated. */ void __ref xen_build_mfn_list_list(void) { @@ -287,20 +287,17 @@ void __ref xen_build_mfn_list_list(void) /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { - p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_missing_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); - p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); - p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_p_init(p2m_top_mfn_p); - p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_init(p2m_top_mfn); } else { /* Reinitialise, mfn's all change after migration */ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); - p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); } for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { @@ -328,10 +325,9 @@ void __ref xen_build_mfn_list_list(void) /* * XXX boot-time only! We should never find * missing parts of the mfn tree after - * runtime. extend_brk() will BUG if we call - * it too late. + * runtime. */ - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + mid_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(mid_mfn_p, p2m_missing); p2m_top_mfn_p[topidx] = mid_mfn_p; @@ -415,7 +411,6 @@ void __init xen_build_dynamic_phys_to_machine(void) m2p_override_init(); } #ifdef CONFIG_X86_64 -#include unsigned long __init xen_revector_p2m_tree(void) { unsigned long va_start; @@ -477,7 +472,6 @@ unsigned long __init xen_revector_p2m_tree(void) copy_page(new, mid_p); p2m_top[topidx][mididx] = &mfn_list[pfn_free]; - p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn_free]); pfn_free += P2M_PER_PAGE; @@ -610,7 +604,6 @@ static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) { unsigned topidx, mididx, idx; unsigned long *p2m; - unsigned long *mid_mfn_p; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); @@ -637,43 +630,21 @@ static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) p2m_top[topidx][mididx] = p2m; - /* For save/restore we need to MFN of the P2M saved */ - - mid_mfn_p = p2m_top_mfn_p[topidx]; - WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), - "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", - topidx, mididx); - mid_mfn_p[mididx] = virt_to_mfn(p2m); - return true; } static bool __init early_alloc_p2m_middle(unsigned long pfn) { unsigned topidx = p2m_top_index(pfn); - unsigned long *mid_mfn_p; unsigned long **mid; mid = p2m_top[topidx]; - mid_mfn_p = p2m_top_mfn_p[topidx]; if (mid == p2m_mid_missing) { mid = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_init(mid, p2m_missing); p2m_top[topidx] = mid; - - BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); - } - /* And the save/restore P2M tables.. */ - if (mid_mfn_p == p2m_mid_missing_mfn) { - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p, p2m_missing); - - p2m_top_mfn_p[topidx] = mid_mfn_p; - p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); - /* Note: we don't set mid_mfn_p[midix] here, - * look in early_alloc_p2m() */ } return true; } @@ -684,14 +655,13 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn) * replace the P2M leaf with a p2m_missing or p2m_identity. * Stick the old page in the new P2M tree location. */ -bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn) +static bool __init early_can_reuse_p2m_middle(unsigned long set_pfn) { unsigned topidx; unsigned mididx; unsigned ident_pfns; unsigned inv_pfns; unsigned long *p2m; - unsigned long *mid_mfn_p; unsigned idx; unsigned long pfn; @@ -737,11 +707,6 @@ bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_ found: /* Found one, replace old with p2m_identity or p2m_missing */ p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing); - /* And the other for save/restore.. */ - mid_mfn_p = p2m_top_mfn_p[topidx]; - /* NOTE: Even if it is a p2m_identity it should still be point to - * a page filled with INVALID_P2M_ENTRY entries. */ - mid_mfn_p[mididx] = virt_to_mfn(p2m_missing); /* Reset where we want to stick the old page in. */ topidx = p2m_top_index(set_pfn); @@ -756,8 +721,6 @@ found: p2m_init(p2m); p2m_top[topidx][mididx] = p2m; - mid_mfn_p = p2m_top_mfn_p[topidx]; - mid_mfn_p[mididx] = virt_to_mfn(p2m); return true; } @@ -767,7 +730,7 @@ bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) if (!early_alloc_p2m_middle(pfn)) return false; - if (early_can_reuse_p2m_middle(pfn, mfn)) + if (early_can_reuse_p2m_middle(pfn)) return __set_phys_to_machine(pfn, mfn); if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/)) From 3a0e94f8ead4a58b9719db0f78e13d02d059604f Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Oct 2014 13:16:06 +0200 Subject: [PATCH 4/7] x86/xen: avoid race in p2m handling When a new p2m leaf is allocated this leaf is linked into the p2m tree via cmpxchg. Unfortunately the compare value for checking the success of the update is read after checking for the need of a new leaf. It is possible that a new leaf has been linked into the tree concurrently in between. This could lead to a leaked memory page and to the loss of some p2m entries. Avoid the race by using the read compare value for checking the need of a new p2m leaf and use ACCESS_ONCE() to get it. There are other places which seem to need ACCESS_ONCE() to ensure proper operation. Change them accordingly. Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/p2m.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index d1b3da2960ab..b456b048eca9 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -532,12 +532,13 @@ static bool alloc_p2m(unsigned long pfn) unsigned topidx, mididx; unsigned long ***top_p, **mid; unsigned long *top_mfn_p, *mid_mfn; + unsigned long *p2m_orig; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); top_p = &p2m_top[topidx]; - mid = *top_p; + mid = ACCESS_ONCE(*top_p); if (mid == p2m_mid_missing) { /* Mid level is missing, allocate a new one */ @@ -552,7 +553,7 @@ static bool alloc_p2m(unsigned long pfn) } top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = p2m_top_mfn_p[topidx]; + mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); @@ -579,11 +580,10 @@ static bool alloc_p2m(unsigned long pfn) } } - if (p2m_top[topidx][mididx] == p2m_identity || - p2m_top[topidx][mididx] == p2m_missing) { + p2m_orig = ACCESS_ONCE(p2m_top[topidx][mididx]); + if (p2m_orig == p2m_identity || p2m_orig == p2m_missing) { /* p2m leaf page is missing */ unsigned long *p2m; - unsigned long *p2m_orig = p2m_top[topidx][mididx]; p2m = alloc_p2m_page(); if (!p2m) From 3251f20b897f955ab7f153181b4ebfbb2317cbb9 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 16 Oct 2014 17:02:15 -0400 Subject: [PATCH 5/7] x86/xen: Fix incorrect per_cpu accessor in xen_clocksource_read() Commit 89cbc76768c2 ("x86: Replace __get_cpu_var uses") replaced __get_cpu_var() with this_cpu_ptr() in xen_clocksource_read() in such a way that instead of accessing a structure pointed to by a per-cpu pointer we are trying to get to a per-cpu structure. __this_cpu_read() of the pointer is the more appropriate accessor. Signed-off-by: Boris Ostrovsky Signed-off-by: David Vrabel --- arch/x86/xen/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index a1d430b112b3..f473d268d387 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -158,7 +158,7 @@ cycle_t xen_clocksource_read(void) cycle_t ret; preempt_disable_notrace(); - src = this_cpu_ptr(&xen_vcpu->time); + src = &__this_cpu_read(xen_vcpu)->time; ret = pvclock_clocksource_read(src); preempt_enable_notrace(); return ret; From 1ea644c8f93f3435760d8b88c25d56475d8b7778 Mon Sep 17 00:00:00 2001 From: Martin Kelly Date: Thu, 16 Oct 2014 20:48:11 -0700 Subject: [PATCH 6/7] x86/xen: panic on bad Xen-provided memory map Panic if Xen provides a memory map with 0 entries. Although this is unlikely, it is better to catch the error at the point of seeing the map than later on as a symptom of some other crash. Signed-off-by: Martin Kelly Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index af7216128d93..29834b3fd87f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -595,6 +595,7 @@ char * __init xen_memory_setup(void) rc = 0; } BUG_ON(rc); + BUG_ON(memmap.nr_entries == 0); /* * Xen won't allow a 1:1 mapping to be created to UNUSABLE From 486edb24952c930966dad125f6727017353e9361 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 4 Aug 2014 18:17:23 -0400 Subject: [PATCH 7/7] xen/pci: Allocate memory for physdev_pci_device_add's optarr physdev_pci_device_add's optarr[] is a zero-sized array and therefore reference to add.optarr[0] is accessing memory that does not belong to the 'add' variable. Signed-off-by: Boris Ostrovsky Reviewed-by: Jan Beulich Signed-off-by: David Vrabel --- drivers/xen/pci.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index dd9c249ea311..95ee4302ffb8 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -41,24 +41,29 @@ static int xen_add_device(struct device *dev) #endif if (pci_seg_supported) { - struct physdev_pci_device_add add = { - .seg = pci_domain_nr(pci_dev->bus), - .bus = pci_dev->bus->number, - .devfn = pci_dev->devfn + struct { + struct physdev_pci_device_add add; + uint32_t pxm; + } add_ext = { + .add.seg = pci_domain_nr(pci_dev->bus), + .add.bus = pci_dev->bus->number, + .add.devfn = pci_dev->devfn }; + struct physdev_pci_device_add *add = &add_ext.add; + #ifdef CONFIG_ACPI acpi_handle handle; #endif #ifdef CONFIG_PCI_IOV if (pci_dev->is_virtfn) { - add.flags = XEN_PCI_DEV_VIRTFN; - add.physfn.bus = physfn->bus->number; - add.physfn.devfn = physfn->devfn; + add->flags = XEN_PCI_DEV_VIRTFN; + add->physfn.bus = physfn->bus->number; + add->physfn.devfn = physfn->devfn; } else #endif if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) - add.flags = XEN_PCI_DEV_EXTFN; + add->flags = XEN_PCI_DEV_EXTFN; #ifdef CONFIG_ACPI handle = ACPI_HANDLE(&pci_dev->dev); @@ -77,8 +82,8 @@ static int xen_add_device(struct device *dev) status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm); if (ACPI_SUCCESS(status)) { - add.optarr[0] = pxm; - add.flags |= XEN_PCI_DEV_PXM; + add->optarr[0] = pxm; + add->flags |= XEN_PCI_DEV_PXM; break; } status = acpi_get_parent(handle, &handle); @@ -86,7 +91,7 @@ static int xen_add_device(struct device *dev) } #endif /* CONFIG_ACPI */ - r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add); + r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, add); if (r != -ENOSYS) return r; pci_seg_supported = false;