From 5b398e416e880159fe55eefd93c6588fa072cd66 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 28 Sep 2016 15:22:30 -0700 Subject: [PATCH 1/4] mm,ksm: fix endless looping in allocating memory when ksm enable I hit the following hung task when runing a OOM LTP test case with 4.1 kernel. Call trace: [] __switch_to+0x74/0x8c [] __schedule+0x23c/0x7bc [] schedule+0x3c/0x94 [] rwsem_down_write_failed+0x214/0x350 [] down_write+0x64/0x80 [] __ksm_exit+0x90/0x19c [] mmput+0x118/0x11c [] do_exit+0x2dc/0xa74 [] do_group_exit+0x4c/0xe4 [] get_signal+0x444/0x5e0 [] do_signal+0x1d8/0x450 [] do_notify_resume+0x70/0x78 The oom victim cannot terminate because it needs to take mmap_sem for write while the lock is held by ksmd for read which loops in the page allocator ksm_do_scan scan_get_next_rmap_item down_read get_next_rmap_item alloc_rmap_item #ksmd will loop permanently. There is no way forward because the oom victim cannot release any memory in 4.1 based kernel. Since 4.6 we have the oom reaper which would solve this problem because it would release the memory asynchronously. Nevertheless we can relax alloc_rmap_item requirements and use __GFP_NORETRY because the allocation failure is acceptable as ksm_do_scan would just retry later after the lock got dropped. Such a patch would be also easy to backport to older stable kernels which do not have oom_reaper. While we are at it add GFP_NOWARN so the admin doesn't have to be alarmed by the allocation failure. Link: http://lkml.kernel.org/r/1474165570-44398-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Suggested-by: Hugh Dickins Suggested-by: Michal Hocko Acked-by: Michal Hocko Acked-by: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/ksm.c b/mm/ksm.c index 73d43bafd9fb..5048083b60f2 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -283,7 +283,8 @@ static inline struct rmap_item *alloc_rmap_item(void) { struct rmap_item *rmap_item; - rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | + __GFP_NORETRY | __GFP_NOWARN); if (rmap_item) ksm_rmap_items++; return rmap_item; From 2481366afd71a0c0b7cd725e6750c04cf589673b Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Wed, 28 Sep 2016 15:22:33 -0700 Subject: [PATCH 2/4] dma-mapping.h: preserve unmap info for CONFIG_DMA_API_DEBUG When CONFIG_DMA_API_DEBUG is enabled we need to preserve unmapping address even if "unmap" is a no-op for our architecutre because we need debug_dma_unmap_page() to correctly cleanup all of the debug bookkeeping. Failing to do so results in a false positive warnings about previously mapped areas never being unmapped. Link: http://lkml.kernel.org/r/1474387125-3713-1-git-send-email-andrew.smirnov@gmail.com Signed-off-by: Andrey Smirnov Reviewed-by: Robin Murphy Cc: Joerg Roedel Cc: Will Deacon Cc: Zhen Lei Cc: "Luis R. Rodriguez" Cc: Christian Borntraeger Cc: Geliang Tang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dma-mapping.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 66533e18276c..dc69df04abc1 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -718,7 +718,7 @@ static inline int dma_mmap_wc(struct device *dev, #define dma_mmap_writecombine dma_mmap_wc #endif -#ifdef CONFIG_NEED_DMA_MAP_STATE +#if defined(CONFIG_NEED_DMA_MAP_STATE) || defined(CONFIG_DMA_API_DEBUG) #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME #define DEFINE_DMA_UNMAP_LEN(LEN_NAME) __u32 LEN_NAME #define dma_unmap_addr(PTR, ADDR_NAME) ((PTR)->ADDR_NAME) From e436fd61a8f62cb7a16310a42b95ab076ff72eff Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 28 Sep 2016 15:22:36 -0700 Subject: [PATCH 3/4] scripts/recordmcount.c: account for .softirqentry.text be7635e7287e ("arch, ftrace: for KASAN put hard/soft IRQ entries into separate sections") added .softirqentry.text section, but it was not added to recordmcount. So functions in the section are untracable. Add the section to scripts/recordmcount.c and scripts/recordmcount.pl. Fixes: be7635e7287e ("arch, ftrace: for KASAN put hard/soft IRQ entries into separate sections") Link: http://lkml.kernel.org/r/1474902626-73468-1-git-send-email-dvyukov@google.com Signed-off-by: Dmitry Vyukov Acked-by: Steve Rostedt Cc: [4.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/recordmcount.c | 1 + scripts/recordmcount.pl | 1 + 2 files changed, 2 insertions(+) diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 42396a74405d..a68f03133df9 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -363,6 +363,7 @@ is_mcounted_section_name(char const *const txtname) strcmp(".sched.text", txtname) == 0 || strcmp(".spinlock.text", txtname) == 0 || strcmp(".irqentry.text", txtname) == 0 || + strcmp(".softirqentry.text", txtname) == 0 || strcmp(".kprobes.text", txtname) == 0 || strcmp(".text.unlikely", txtname) == 0; } diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 96e2486a6fc4..2d48011bc362 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -134,6 +134,7 @@ my %text_sections = ( ".sched.text" => 1, ".spinlock.text" => 1, ".irqentry.text" => 1, + ".softirqentry.text" => 1, ".kprobes.text" => 1, ".text.unlikely" => 1, ); From 231e97e2b8ec9a1556ced5d8a89cda03a480b179 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Wed, 28 Sep 2016 15:22:38 -0700 Subject: [PATCH 4/4] mem-hotplug: use nodes that contain memory as mask in new_node_page() 9bb627be47a5 ("mem-hotplug: don't clear the only node in new_node_page()") prevents allocating from an empty nodemask, but as David points out, it is still wrong. As node_online_map may include memoryless nodes, only allocating from these nodes is meaningless. This patch uses node_states[N_MEMORY] mask to prevent the above case. Fixes: 9bb627be47a5 ("mem-hotplug: don't clear the only node in new_node_page()") Fixes: 394e31d2ceb4 ("mem-hotplug: alloc new page from a nearest neighbor node when mem-offline") Link: http://lkml.kernel.org/r/1474447117.28370.6.camel@TP420 Signed-off-by: Li Zhong Suggested-by: David Rientjes Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: John Allen Cc: Xishi Qiu Cc: Joonsoo Kim Cc: Naoya Horiguchi Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b58906b6215c..9d29ba0f7192 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1555,8 +1555,8 @@ static struct page *new_node_page(struct page *page, unsigned long private, { gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; int nid = page_to_nid(page); - nodemask_t nmask = node_online_map; - struct page *new_page; + nodemask_t nmask = node_states[N_MEMORY]; + struct page *new_page = NULL; /* * TODO: allocate a destination hugepage from a nearest neighbor node, @@ -1567,14 +1567,14 @@ static struct page *new_node_page(struct page *page, unsigned long private, return alloc_huge_page_node(page_hstate(compound_head(page)), next_node_in(nid, nmask)); - if (nid != next_node_in(nid, nmask)) - node_clear(nid, nmask); + node_clear(nid, nmask); if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) gfp_mask |= __GFP_HIGHMEM; - new_page = __alloc_pages_nodemask(gfp_mask, 0, + if (!nodes_empty(nmask)) + new_page = __alloc_pages_nodemask(gfp_mask, 0, node_zonelist(nid, gfp_mask), &nmask); if (!new_page) new_page = __alloc_pages(gfp_mask, 0,