From c47d7f56e914900410f65835933f9fc4374d0a2b Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 14 Dec 2017 15:32:24 -0800 Subject: [PATCH 01/17] include/linux/idr.h: add #include The was removed from radix-tree.h by commit f5bba9d11a25 ("include/linux/radix-tree.h: remove unneeded #include "). Since that commit, tools/testing/radix-tree/ couldn't pass compilation due to tools/testing/radix-tree/idr.c:17: undefined reference to WARN_ON_ONCE. This patch adds the bug.h header to idr.h to solve the issue. Link: http://lkml.kernel.org/r/1511963726-34070-2-git-send-email-wei.w.wang@intel.com Fixes: f5bba9d11a2 ("include/linux/radix-tree.h: remove unneeded #include ") Signed-off-by: Wei Wang Cc: Matthew Wilcox Cc: Jan Kara Cc: Eric Biggers Cc: Tejun Heo Cc: Masahiro Yamada Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/idr.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/idr.h b/include/linux/idr.h index 7c3a365f7e12..fa14f834e4ed 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -15,6 +15,7 @@ #include #include #include +#include struct idr { struct radix_tree_root idr_rt; From 338f1d9d1b829fec494d053f62820a2ee625b1ec Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 14 Dec 2017 15:32:28 -0800 Subject: [PATCH 02/17] lib/rbtree,drm/mm: add rbtree_replace_node_cached() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a variant of rbtree_replace_node() that maintains the leftmost cache of struct rbtree_root_cached when replacing nodes within the rbtree. As drm_mm is the only rb_replace_node() being used on an interval tree, the mistake looks fairly self-contained. Furthermore the only user of drm_mm_replace_node() is its testsuite... Testcase: igt/drm_mm/replace Link: http://lkml.kernel.org/r/20171122100729.3742-1-chris@chris-wilson.co.uk Link: https://patchwork.freedesktop.org/patch/msgid/20171109212435.9265-1-chris@chris-wilson.co.uk Fixes: f808c13fd373 ("lib/interval_tree: fast overlap detection") Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Acked-by: Davidlohr Bueso Cc: Jérôme Glisse Cc: Joonas Lahtinen Cc: Daniel Vetter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_mm.c | 8 +++++--- include/linux/rbtree.h | 2 ++ lib/rbtree.c | 10 ++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c index 61a1c8ea74bc..c3c79ee6119e 100644 --- a/drivers/gpu/drm/drm_mm.c +++ b/drivers/gpu/drm/drm_mm.c @@ -575,21 +575,23 @@ EXPORT_SYMBOL(drm_mm_remove_node); */ void drm_mm_replace_node(struct drm_mm_node *old, struct drm_mm_node *new) { + struct drm_mm *mm = old->mm; + DRM_MM_BUG_ON(!old->allocated); *new = *old; list_replace(&old->node_list, &new->node_list); - rb_replace_node(&old->rb, &new->rb, &old->mm->interval_tree.rb_root); + rb_replace_node_cached(&old->rb, &new->rb, &mm->interval_tree); if (drm_mm_hole_follows(old)) { list_replace(&old->hole_stack, &new->hole_stack); rb_replace_node(&old->rb_hole_size, &new->rb_hole_size, - &old->mm->holes_size); + &mm->holes_size); rb_replace_node(&old->rb_hole_addr, &new->rb_hole_addr, - &old->mm->holes_addr); + &mm->holes_addr); } old->allocated = false; diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index d574361943ea..fcbeed4053ef 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -99,6 +99,8 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); +extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, + struct rb_root_cached *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) diff --git a/lib/rbtree.c b/lib/rbtree.c index ba4a9d165f1b..d3ff682fd4b8 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -603,6 +603,16 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, } EXPORT_SYMBOL(rb_replace_node); +void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, + struct rb_root_cached *root) +{ + rb_replace_node(victim, new, &root->rb_root); + + if (root->rb_leftmost == victim) + root->rb_leftmost = new; +} +EXPORT_SYMBOL(rb_replace_node_cached); + void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root) { From 13ab183d138f607d885e995d625e58d47678bf97 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 14 Dec 2017 15:32:31 -0800 Subject: [PATCH 03/17] mm/kmemleak.c: make cond_resched() rate-limiting more efficient Commit bde5f6bc68db ("kmemleak: add scheduling point to kmemleak_scan()") tries to rate-limit the frequency of cond_resched() calls, but does it in a way which might incur an expensive division operation in the inner loop. Simplify this. Fixes: bde5f6bc68db5 ("kmemleak: add scheduling point to kmemleak_scan()") Suggested-by: Linus Torvalds Cc: Yisheng Xie Cc: Catalin Marinas Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 3d4781756d50..d73c14294f3a 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1523,7 +1523,7 @@ static void kmemleak_scan(void) if (page_count(page) == 0) continue; scan_block(page, page + 1, NULL); - if (!(pfn % (MAX_SCAN_SIZE / sizeof(*page)))) + if (!(pfn & 63)) cond_resched(); } } From 146734b091430c80d80bb96b1139a96fb4bc830e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Dec 2017 15:32:34 -0800 Subject: [PATCH 04/17] string.h: workaround for increased stack usage The hardened strlen() function causes rather large stack usage in at least one file in the kernel, in particular when CONFIG_KASAN is enabled: drivers/media/usb/em28xx/em28xx-dvb.c: In function 'em28xx_dvb_init': drivers/media/usb/em28xx/em28xx-dvb.c:2062:1: error: the frame size of 3256 bytes is larger than 204 bytes [-Werror=frame-larger-than=] Analyzing this problem led to the discovery that gcc fails to merge the stack slots for the i2c_board_info[] structures after we strlcpy() into them, due to the 'noreturn' attribute on the source string length check. I reported this as a gcc bug, but it is unlikely to get fixed for gcc-8, since it is relatively easy to work around, and it gets triggered rarely. An earlier workaround I did added an empty inline assembly statement before the call to fortify_panic(), which works surprisingly well, but is really ugly and unintuitive. This is a new approach to the same problem, this time addressing it by not calling the 'extern __real_strnlen()' function for string constants where __builtin_strlen() is a compile-time constant and therefore known to be safe. We do this by checking if the last character in the string is a compile-time constant '\0'. If it is, we can assume that strlen() of the string is also constant. As a side-effect, this should also improve the object code output for any other call of strlen() on a string constant. [akpm@linux-foundation.org: add comment] Link: http://lkml.kernel.org/r/20171205215143.3085755-1-arnd@arndb.de Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82365 Link: https://patchwork.kernel.org/patch/9980413/ Link: https://patchwork.kernel.org/patch/9974047/ Fixes: 6974f0c4555 ("include/linux/string.h: add the option of fortified string.h functions") Signed-off-by: Arnd Bergmann Cc: Kees Cook Cc: Mauro Carvalho Chehab Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Daniel Micay Cc: Greg Kroah-Hartman Cc: Martin Wilck Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/string.h b/include/linux/string.h index 410ecf17de3c..cfd83eb2f926 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -259,7 +259,10 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) { __kernel_size_t ret; size_t p_size = __builtin_object_size(p, 0); - if (p_size == (size_t)-1) + + /* Work around gcc excess stack consumption issue */ + if (p_size == (size_t)-1 || + (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0')) return __builtin_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) From 302ec300ef8a545a7fc7f667e5fd743b091c2eeb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 14 Dec 2017 15:32:38 -0800 Subject: [PATCH 05/17] autofs: fix careless error in recent commit Commit ecc0c469f277 ("autofs: don't fail mount for transient error") was meant to replace an 'if' with a 'switch', but instead added the 'switch' leaving the case in place. Link: http://lkml.kernel.org/r/87zi6wstmw.fsf@notabene.neil.brown.name Fixes: ecc0c469f277 ("autofs: don't fail mount for transient error") Reported-by: Ben Hutchings Signed-off-by: NeilBrown Cc: Ian Kent Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 8fc41705c7cd..961a12dc6dc8 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -170,7 +170,6 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); - if (autofs4_write(sbi, pipe, &pkt, pktsz)) switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { case 0: break; From 3756f6401c302617c5e091081ca4d26ab604bec5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Dec 2017 15:32:41 -0800 Subject: [PATCH 06/17] exec: avoid gcc-8 warning for get_task_comm gcc-8 warns about using strncpy() with the source size as the limit: fs/exec.c:1223:32: error: argument to 'sizeof' in 'strncpy' call is the same expression as the source; did you mean to use the size of the destination? [-Werror=sizeof-pointer-memaccess] This is indeed slightly suspicious, as it protects us from source arguments without NUL-termination, but does not guarantee that the destination is terminated. This keeps the strncpy() to ensure we have properly padded target buffer, but ensures that we use the correct length, by passing the actual length of the destination buffer as well as adding a build-time check to ensure it is exactly TASK_COMM_LEN. There are only 23 callsites which I all reviewed to ensure this is currently the case. We could get away with doing only the check or passing the right length, but it doesn't hurt to do both. Link: http://lkml.kernel.org/r/20171205151724.1764896-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Suggested-by: Kees Cook Acked-by: Kees Cook Acked-by: Ingo Molnar Cc: Alexander Viro Cc: Peter Zijlstra Cc: Serge Hallyn Cc: James Morris Cc: Aleksa Sarai Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 7 +++---- include/linux/sched.h | 6 +++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 6be2aa0ab26f..156f56acfe8e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1216,15 +1216,14 @@ killed: return -EAGAIN; } -char *get_task_comm(char *buf, struct task_struct *tsk) +char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { - /* buf must be at least sizeof(tsk->comm) in size */ task_lock(tsk); - strncpy(buf, tsk->comm, sizeof(tsk->comm)); + strncpy(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } -EXPORT_SYMBOL_GPL(get_task_comm); +EXPORT_SYMBOL_GPL(__get_task_comm); /* * These functions flushes out all traces of the currently running executable diff --git a/include/linux/sched.h b/include/linux/sched.h index 21991d668d35..5124ba709830 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1503,7 +1503,11 @@ static inline void set_task_comm(struct task_struct *tsk, const char *from) __set_task_comm(tsk, from, false); } -extern char *get_task_comm(char *to, struct task_struct *tsk); +extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); +#define get_task_comm(buf, tsk) ({ \ + BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ + __get_task_comm(buf, sizeof(buf), tsk); \ +}) #ifdef CONFIG_SMP void scheduler_ipi(void); From 51f73fffbf30b335d036ab356b67b05e16e26585 Mon Sep 17 00:00:00 2001 From: Srividya Desireddy Date: Thu, 14 Dec 2017 15:32:45 -0800 Subject: [PATCH 07/17] Documentation/vm/zswap.txt: update with same-value filled page feature Update zswap document with details on same-value filled pages identification feature. The usage of zswap.same_filled_pages_enabled module parameter is explained. Link: http://lkml.kernel.org/r/20171206114852epcms5p6973b02a9f455d5d3c765eafda0fe2631@epcms5p6 Signed-off-by: Srividya Desireddy Acked-by: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/zswap.txt | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/Documentation/vm/zswap.txt b/Documentation/vm/zswap.txt index 89fff7d611cc..0b3a1148f9f0 100644 --- a/Documentation/vm/zswap.txt +++ b/Documentation/vm/zswap.txt @@ -98,5 +98,25 @@ request is made for a page in an old zpool, it is uncompressed using its original compressor. Once all pages are removed from an old zpool, the zpool and its compressor are freed. +Some of the pages in zswap are same-value filled pages (i.e. contents of the +page have same value or repetitive pattern). These pages include zero-filled +pages and they are handled differently. During store operation, a page is +checked if it is a same-value filled page before compressing it. If true, the +compressed length of the page is set to zero and the pattern or same-filled +value is stored. + +Same-value filled pages identification feature is enabled by default and can be +disabled at boot time by setting the "same_filled_pages_enabled" attribute to 0, +e.g. zswap.same_filled_pages_enabled=0. It can also be enabled and disabled at +runtime using the sysfs "same_filled_pages_enabled" attribute, e.g. + +echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled + +When zswap same-filled page identification is disabled at runtime, it will stop +checking for the same-value filled pages during store operation. However, the +existing pages which are marked as same-value filled pages remain stored +unchanged in zswap until they are either loaded or invalidated. + A debugfs interface is provided for various statistic about pool size, number -of pages stored, and various counters for the reasons pages are rejected. +of pages stored, same-value filled pages and various counters for the reasons +pages are rejected. From 4cc90b4cc3d4955f79eae4f7f9d64e67e17b468e Mon Sep 17 00:00:00 2001 From: "Liu, Changcheng" Date: Thu, 14 Dec 2017 15:32:48 -0800 Subject: [PATCH 08/17] scripts/faddr2line: fix CROSS_COMPILE unset error faddr2line hit var unbound error when CROSS_COMPILE isn't set since nounset option is set in bash script. Link: http://lkml.kernel.org/r/20171206013022.GA83929@sofia Fixes: 95a879825419 ("scripts/faddr2line: extend usage on generic arch") Signed-off-by: Liu Changcheng Reported-by: Richard Weinberger Reviewed-by: Richard Weinberger Cc: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Philippe Ombredanne Cc: NeilBrown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/faddr2line | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/faddr2line b/scripts/faddr2line index 39e07d8574dd..7721d5b2b0c0 100755 --- a/scripts/faddr2line +++ b/scripts/faddr2line @@ -44,10 +44,10 @@ set -o errexit set -o nounset -READELF="${CROSS_COMPILE}readelf" -ADDR2LINE="${CROSS_COMPILE}addr2line" -SIZE="${CROSS_COMPILE}size" -NM="${CROSS_COMPILE}nm" +READELF="${CROSS_COMPILE:-}readelf" +ADDR2LINE="${CROSS_COMPILE:-}addr2line" +SIZE="${CROSS_COMPILE:-}size" +NM="${CROSS_COMPILE:-}nm" command -v awk >/dev/null 2>&1 || die "awk isn't installed" command -v ${READELF} >/dev/null 2>&1 || die "readelf isn't installed" From 183f24aa5b76e37da690b2def41cc70f0792ce09 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 14 Dec 2017 15:32:52 -0800 Subject: [PATCH 09/17] mm/memory.c: mark wp_huge_pmd() inline to prevent build failure With gcc 4.1.2: mm/memory.o: In function `wp_huge_pmd': memory.c:(.text+0x9b4): undefined reference to `do_huge_pmd_wp_page' Interestingly, wp_huge_pmd() is emitted in the assembler output, but never called. Apparently replacing the call to pmd_write() in __handle_mm_fault() by a call to the more complex pmd_access_permitted() reduced the ability of the compiler to remove unused code. Fix this by marking wp_huge_pmd() inline, like was done in commit 91a90140f998 ("mm/memory.c: mark create_huge_pmd() inline to prevent build failure") for a similar problem. [akpm@linux-foundation.org: add comment] Link: http://lkml.kernel.org/r/1512335500-10889-1-git-send-email-geert@linux-m68k.org Fixes: c7da82b894e9eef6 ("mm: replace pmd_write with pmd_access_permitted in fault + gup paths") Signed-off-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 5eb3d2524bdc..cfaba6287702 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3831,7 +3831,8 @@ static inline int create_huge_pmd(struct vm_fault *vmf) return VM_FAULT_FALLBACK; } -static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) +/* `inline' is required to avoid gcc 4.1.2 build error */ +static inline int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); From c24ad77d962c31af92f2b731dad2104cbf3fbb03 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Thu, 14 Dec 2017 15:32:55 -0800 Subject: [PATCH 10/17] mm/page_alloc.c: avoid excessive IRQ disabled times in free_unref_page_list() Since commit 9cca35d42eb6 ("mm, page_alloc: enable/disable IRQs once when freeing a list of pages") we see excessive IRQ disabled times of up to 25ms on an embedded ARM system (tracing overhead included). This is due to graphics buffers being freed back to the system via release_pages(). Graphics buffers can be huge, so it's not hard to hit cases where the list of pages to free has 2048 entries. Disabling IRQs while freeing all those pages is clearly not a good idea. Introduce a batch limit, which allows IRQ servicing once every few pages. The batch count is the same as used in other parts of the MM subsystem when dealing with IRQ disabled regions. Link: http://lkml.kernel.org/r/20171207170314.4419-1-l.stach@pengutronix.de Fixes: 9cca35d42eb6 ("mm, page_alloc: enable/disable IRQs once when freeing a list of pages") Signed-off-by: Lucas Stach Acked-by: Mel Gorman Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 73f5d4556b3d..7e5e775e97f4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2684,6 +2684,7 @@ void free_unref_page_list(struct list_head *list) { struct page *page, *next; unsigned long flags, pfn; + int batch_count = 0; /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { @@ -2700,6 +2701,16 @@ void free_unref_page_list(struct list_head *list) set_page_private(page, 0); trace_mm_page_free_batched(page); free_unref_page_commit(page, pfn); + + /* + * Guard against excessive IRQ disabled times when we get + * a large list of pages to free. + */ + if (++batch_count == SWAP_CLUSTER_MAX) { + local_irq_restore(flags); + batch_count = 0; + local_irq_save(flags); + } } local_irq_restore(flags); } From 85c3e4a5a185f22649c6bf33bdce7bb1ac890921 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 14 Dec 2017 15:32:58 -0800 Subject: [PATCH 11/17] mm/slab.c: do not hash pointers when debugging slab If CONFIG_DEBUG_SLAB/CONFIG_DEBUG_SLAB_LEAK are enabled, the slab code prints extra debug information when e.g. corruption is detected. This includes pointers, which are not very useful when hashed. Fix this by using %px to print unhashed pointers instead where it makes sense, and by removing the printing of a last user pointer referring to code. [geert+renesas@glider.be: v2] Link: http://lkml.kernel.org/r/1513179267-2509-1-git-send-email-geert+renesas@glider.be Link: http://lkml.kernel.org/r/1512641861-5113-1-git-send-email-geert+renesas@glider.be Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p") Signed-off-by: Geert Uytterhoeven Acked-by: Christoph Lameter Acked-by: Linus Torvalds Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: "Tobin C . Harding" Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 183e996dde5f..4e51ef954026 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1584,11 +1584,8 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) *dbg_redzone2(cachep, objp)); } - if (cachep->flags & SLAB_STORE_USER) { - pr_err("Last user: [<%p>](%pSR)\n", - *dbg_userword(cachep, objp), - *dbg_userword(cachep, objp)); - } + if (cachep->flags & SLAB_STORE_USER) + pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp)); realobj = (char *)objp + obj_offset(cachep); size = cachep->object_size; for (i = 0; i < size && lines; i += 16, lines--) { @@ -1621,7 +1618,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Mismatch ! */ /* Print header */ if (lines == 0) { - pr_err("Slab corruption (%s): %s start=%p, len=%d\n", + pr_err("Slab corruption (%s): %s start=%px, len=%d\n", print_tainted(), cachep->name, realobj, size); print_objinfo(cachep, objp, 0); @@ -1650,13 +1647,13 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) if (objnr) { objp = index_to_obj(cachep, page, objnr - 1); realobj = (char *)objp + obj_offset(cachep); - pr_err("Prev obj: start=%p, len=%d\n", realobj, size); + pr_err("Prev obj: start=%px, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { objp = index_to_obj(cachep, page, objnr + 1); realobj = (char *)objp + obj_offset(cachep); - pr_err("Next obj: start=%p, len=%d\n", realobj, size); + pr_err("Next obj: start=%px, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } } @@ -2608,7 +2605,7 @@ static void slab_put_obj(struct kmem_cache *cachep, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { - pr_err("slab: double free detected in cache '%s', objp %p\n", + pr_err("slab: double free detected in cache '%s', objp %px\n", cachep->name, objp); BUG(); } @@ -2772,7 +2769,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) else slab_error(cache, "memory outside object was overwritten"); - pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", obj, redzone1, redzone2); } @@ -3078,7 +3075,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { slab_error(cachep, "double free, or memory outside object was overwritten"); - pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); } @@ -3091,7 +3088,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, cachep->ctor(objp); if (ARCH_SLAB_MINALIGN && ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { - pr_err("0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", + pr_err("0x%px: not aligned to ARCH_SLAB_MINALIGN=%d\n", objp, (int)ARCH_SLAB_MINALIGN); } return objp; @@ -4283,7 +4280,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) return; } #endif - seq_printf(m, "%p", (void *)address); + seq_printf(m, "%px", (void *)address); } static int leaks_show(struct seq_file *m, void *p) From 689d77f001cd22da31cc943170e1f6f2e8197035 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 14 Dec 2017 15:33:02 -0800 Subject: [PATCH 12/17] kcov: fix comparison callback signature Fix a silly copy-paste bug. We truncated u32 args to u16. Link: http://lkml.kernel.org/r/20171207101134.107168-1-dvyukov@google.com Fixes: ded97d2c2b2c ("kcov: support comparison operands collection") Signed-off-by: Dmitry Vyukov Cc: syzkaller@googlegroups.com Cc: Alexander Potapenko Cc: Vegard Nossum Cc: Quentin Casasnovas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index 15f33faf4013..7594c033d98a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -157,7 +157,7 @@ void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); -void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2) +void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2) { write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); } @@ -183,7 +183,7 @@ void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); -void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2) +void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2) { write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, _RET_IP_); From 0b265c3b3b721dca03e82719ac0e15bc2c89aa3a Mon Sep 17 00:00:00 2001 From: "Liu, Changcheng" Date: Thu, 14 Dec 2017 15:33:05 -0800 Subject: [PATCH 13/17] tools/slabinfo-gnuplot: force to use bash shell On some linux distributions, the default link of sh is dash which deoesn't support split array like "${var//,/ }" It's better to force to use bash shell directly. Link: http://lkml.kernel.org/r/20171208093751.GA175471@sofia Signed-off-by: Liu Changcheng Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/slabinfo-gnuplot.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/vm/slabinfo-gnuplot.sh b/tools/vm/slabinfo-gnuplot.sh index 35b039864b77..0cf28aa6f21c 100644 --- a/tools/vm/slabinfo-gnuplot.sh +++ b/tools/vm/slabinfo-gnuplot.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # Sergey Senozhatsky, 2015 # sergey.senozhatsky.work@gmail.com From 1f704fd0d14043e76e80f6b8b2251b9b2cedcca6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 14 Dec 2017 15:33:08 -0800 Subject: [PATCH 14/17] mm/frame_vector.c: release a semaphore in 'get_vaddr_frames()' A semaphore is acquired before this check, so we must release it before leaving. Link: http://lkml.kernel.org/r/20171211211009.4971-1-christophe.jaillet@wanadoo.fr Fixes: b7f0554a56f2 ("mm: fail get_vaddr_frames() for filesystem-dax mappings") Signed-off-by: Christophe JAILLET Acked-by: Michal Hocko Cc: Dan Williams Cc: Christian Borntraeger Cc: David Sterba Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frame_vector.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 297c7238f7d4..c64dca6e27c2 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -62,8 +62,10 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, * get_user_pages_longterm() and disallow it for filesystem-dax * mappings. */ - if (vma_is_fsdax(vma)) - return -EOPNOTSUPP; + if (vma_is_fsdax(vma)) { + ret = -EOPNOTSUPP; + goto out; + } if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; From bdcf0a423ea1c40bbb40e7ee483b50fc8aa3d758 Mon Sep 17 00:00:00 2001 From: Thiago Rafael Becker Date: Thu, 14 Dec 2017 15:33:12 -0800 Subject: [PATCH 15/17] kernel: make groups_sort calling a responsibility group_info allocators In testing, we found that nfsd threads may call set_groups in parallel for the same entry cached in auth.unix.gid, racing in the call of groups_sort, corrupting the groups for that entry and leading to permission denials for the client. This patch: - Make groups_sort globally visible. - Move the call to groups_sort to the modifiers of group_info - Remove the call to groups_sort from set_groups Link: http://lkml.kernel.org/r/20171211151420.18655-1-thiago.becker@gmail.com Signed-off-by: Thiago Rafael Becker Reviewed-by: Matthew Wilcox Reviewed-by: NeilBrown Acked-by: "J. Bruce Fields" Cc: Al Viro Cc: Martin Schwidefsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/compat_linux.c | 1 + fs/nfsd/auth.c | 3 +++ include/linux/cred.h | 1 + kernel/groups.c | 5 +++-- kernel/uid16.c | 1 + net/sunrpc/auth_gss/gss_rpc_xdr.c | 1 + net/sunrpc/auth_gss/svcauth_gss.c | 1 + net/sunrpc/svcauth_unix.c | 2 ++ 8 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index f04db3779b34..59eea9c65d3e 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -263,6 +263,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setgroups16, int, gidsetsize, u16 __user *, grouplis return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 697f8ae7792d..f650e475d8f0 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -60,6 +60,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) gi->gid[i] = exp->ex_anon_gid; else gi->gid[i] = rqgi->gid[i]; + + /* Each thread allocates its own gi, no race */ + groups_sort(gi); } } else { gi = get_group_info(rqgi); diff --git a/include/linux/cred.h b/include/linux/cred.h index 099058e1178b..631286535d0f 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -83,6 +83,7 @@ extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); extern bool may_setgroups(void); +extern void groups_sort(struct group_info *); /* * The security context of a task diff --git a/kernel/groups.c b/kernel/groups.c index e357bc800111..daae2f2dc6d4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b) return gid_gt(a, b) - gid_lt(a, b); } -static void groups_sort(struct group_info *group_info) +void groups_sort(struct group_info *group_info) { sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), gid_cmp, NULL); } +EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) @@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp) void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); - groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; } @@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/uid16.c b/kernel/uid16.c index ce74a4901d2b..ef1da2a5f9bd 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index c4778cae58ef..444380f968f1 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -231,6 +231,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr, goto out_free_groups; creds->cr_group_info->gid[i] = kgid; } + groups_sort(creds->cr_group_info); return 0; out_free_groups: diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 5dd4e6c9fef2..26531193fce4 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -481,6 +481,7 @@ static int rsc_parse(struct cache_detail *cd, goto out; rsci.cred.cr_group_info->gid[i] = kgid; } + groups_sort(rsci.cred.cr_group_info); /* mech name */ len = qword_get(&mesg, buf, mlen); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 740b67d5a733..af7f28fb8102 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -520,6 +520,7 @@ static int unix_gid_parse(struct cache_detail *cd, ug.gi->gid[i] = kgid; } + groups_sort(ug.gi); ugp = unix_gid_lookup(cd, uid); if (ugp) { struct cache_head *ch; @@ -819,6 +820,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv)); cred->cr_group_info->gid[i] = kgid; } + groups_sort(cred->cr_group_info); if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { *authp = rpc_autherr_badverf; return SVC_DENIED; From 4837fe37adff1d159904f0c013471b1ecbcb455e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 14 Dec 2017 15:33:15 -0800 Subject: [PATCH 16/17] mm, oom_reaper: fix memory corruption David Rientjes has reported the following memory corruption while the oom reaper tries to unmap the victims address space BUG: Bad page map in process oom_reaper pte:6353826300000000 pmd:00000000 addr:00007f50cab1d000 vm_flags:08100073 anon_vma:ffff9eea335603f0 mapping: (null) index:7f50cab1d file: (null) fault: (null) mmap: (null) readpage: (null) CPU: 2 PID: 1001 Comm: oom_reaper Call Trace: unmap_page_range+0x1068/0x1130 __oom_reap_task_mm+0xd5/0x16b oom_reaper+0xff/0x14c kthread+0xc1/0xe0 Tetsuo Handa has noticed that the synchronization inside exit_mmap is insufficient. We only synchronize with the oom reaper if tsk_is_oom_victim which is not true if the final __mmput is called from a different context than the oom victim exit path. This can trivially happen from context of any task which has grabbed mm reference (e.g. to read /proc// file which requires mm etc.). The race would look like this oom_reaper oom_victim task mmget_not_zero do_exit mmput __oom_reap_task_mm mmput __mmput exit_mmap remove_vma unmap_page_range Fix this issue by providing a new mm_is_oom_victim() helper which operates on the mm struct rather than a task. Any context which operates on a remote mm struct should use this helper in place of tsk_is_oom_victim. The flag is set in mark_oom_victim and never cleared so it is stable in the exit_mmap path. Debugged by Tetsuo Handa. Link: http://lkml.kernel.org/r/20171210095130.17110-1-mhocko@kernel.org Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") Signed-off-by: Michal Hocko Reported-by: David Rientjes Acked-by: David Rientjes Cc: Tetsuo Handa Cc: Andrea Argangeli Cc: [4.14] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 9 +++++++++ include/linux/sched/coredump.h | 1 + mm/mmap.c | 10 +++++----- mm/oom_kill.c | 4 +++- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 01c91d874a57..5bad038ac012 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -66,6 +66,15 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) return tsk->signal->oom_mm; } +/* + * Use this helper if tsk->mm != mm and the victim mm needs a special + * handling. This is guaranteed to stay true after once set. + */ +static inline bool mm_is_oom_victim(struct mm_struct *mm) +{ + return test_bit(MMF_OOM_VICTIM, &mm->flags); +} + /* * Checks whether a page fault on the given mm is still reliable. * This is no longer true if the oom reaper started to reap the diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 9c8847395b5e..ec912d01126f 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -70,6 +70,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_OOM_VICTIM 25 /* mm is the oom victim */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ diff --git a/mm/mmap.c b/mm/mmap.c index a4d546821214..9efdc021ad22 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3019,20 +3019,20 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); - set_bit(MMF_OOM_SKIP, &mm->flags); - if (unlikely(tsk_is_oom_victim(current))) { + if (unlikely(mm_is_oom_victim(mm))) { /* * Wait for oom_reap_task() to stop working on this * mm. Because MMF_OOM_SKIP is already set before * calling down_read(), oom_reap_task() will not run * on this "mm" post up_write(). * - * tsk_is_oom_victim() cannot be set from under us - * either because current->mm is already set to NULL + * mm_is_oom_victim() cannot be set from under us + * either because victim->mm is already set to NULL * under task_lock before calling mmput and oom_mm is - * set not NULL by the OOM killer only if current->mm + * set not NULL by the OOM killer only if victim->mm * is found not NULL while holding the task_lock. */ + set_bit(MMF_OOM_SKIP, &mm->flags); down_write(&mm->mmap_sem); up_write(&mm->mmap_sem); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c957be32b27a..29f855551efe 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -683,8 +683,10 @@ static void mark_oom_victim(struct task_struct *tsk) return; /* oom_mm is bound to the signal struct life time. */ - if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) + if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) { mmgrab(tsk->signal->oom_mm); + set_bit(MMF_OOM_VICTIM, &mm->flags); + } /* * Make sure that the task is woken up from uninterruptible sleep From 7c2c11b208be09c156573fc0076b7b3646e05219 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Thu, 14 Dec 2017 15:33:19 -0800 Subject: [PATCH 17/17] arch: define weak abort() gcc toggle -fisolate-erroneous-paths-dereference (default at -O2 onwards) isolates faulty code paths such as null pointer access, divide by zero etc. If gcc port doesnt implement __builtin_trap, an abort() is generated which causes kernel link error. In this case, gcc is generating abort due to 'divide by zero' in lib/mpi/mpih-div.c. Currently 'frv' and 'arc' are failing. Previously other arch was also broken like m32r was fixed by commit d22e3d69ee1a ("m32r: fix build failure"). Let's define this weak function which is common for all arch and fix the problem permanently. We can even remove the arch specific 'abort' after this is done. Link: http://lkml.kernel.org/r/1513118956-8718-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Cc: Alexey Brodkin Cc: Vineet Gupta Cc: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/exit.c b/kernel/exit.c index 6b4298a41167..df0c91d5606c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1755,3 +1755,11 @@ Efault: return -EFAULT; } #endif + +__weak void abort(void) +{ + BUG(); + + /* if that doesn't kill us, halt */ + panic("Oops failed to kill thread"); +}