From 6b69cbd8b1fba2641bab1a68bb964fbab4277736 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Fri, 17 Aug 2018 15:43:29 -0700 Subject: [PATCH 001/111] 9p: remove Ron Minnich from MAINTAINERS Ron Minnich has left Sandia in 2011, and has not been involved in any 9p commit in recent years. Also add a CREDITS entry to record his contributions. Link: http://lkml.kernel.org/r/1534486244-1055-1-git-send-email-asmadeus@codewreck.org Signed-off-by: Dominique Martinet Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Ronald G. Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- CREDITS | 5 +++++ MAINTAINERS | 1 - 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index 989cda91c427..5befd2d714d0 100644 --- a/CREDITS +++ b/CREDITS @@ -2571,6 +2571,11 @@ S: Helstorfer Str. 7 S: D-30625 Hannover S: Germany +N: Ron Minnich +E: rminnich@sandia.gov +E: rminnich@gmail.com +D: 9p filesystem development + N: Corey Minyard E: minyard@wf-rch.cirr.com E: minyard@mvista.com diff --git a/MAINTAINERS b/MAINTAINERS index 3d08725527aa..6f6b1943061e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -199,7 +199,6 @@ F: drivers/net/ethernet/8390/ 9P FILE SYSTEM M: Eric Van Hensbergen -M: Ron Minnich M: Latchesar Ionkov L: v9fs-developer@lists.sourceforge.net W: http://swik.net/v9fs From 6ed191ca9045e595d02cdf206fd7d494a1573bdd Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Fri, 17 Aug 2018 15:43:33 -0700 Subject: [PATCH 002/111] 9p: add Dominique Martinet to MAINTAINERS Link: http://lkml.kernel.org/r/1533869305-29325-1-git-send-email-asmadeus@codewreck.org Signed-off-by: Dominique Martinet Acked-by: Andrew Morton Cc: Eric Van Hensbergen Cc: Latchesar Ionkov Cc: Ron Minnich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6f6b1943061e..6faf6d3d499e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -200,10 +200,12 @@ 
F: drivers/net/ethernet/8390/ 9P FILE SYSTEM M: Eric Van Hensbergen M: Latchesar Ionkov +M: Dominique Martinet L: v9fs-developer@lists.sourceforge.net W: http://swik.net/v9fs Q: http://patchwork.kernel.org/project/v9fs-devel/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git +T: git git://github.com/martinetd/linux.git S: Maintained F: Documentation/filesystems/9p.txt F: fs/9p/ From e36488c83b6d871b35dd555afb2d434bd61687cf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Aug 2018 15:43:36 -0700 Subject: [PATCH 003/111] bitfield: avoid gcc-8 -Wint-in-bool-context warning Passing an enum into FIELD_GET() produces a long but harmless warning on newer compilers: from include/linux/linkage.h:7, from include/linux/kernel.h:7, from include/linux/skbuff.h:17, from include/linux/if_ether.h:23, from include/linux/etherdevice.h:25, from drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c:63: drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c: In function 'iwl_mvm_rx_mpdu_mq': include/linux/bitfield.h:56:20: error: enum constant in boolean context [-Werror=int-in-bool-context] BUILD_BUG_ON_MSG(!(_mask), _pfx "mask is zero"); \ ^ ... include/linux/bitfield.h:103:3: note: in expansion of macro '__BF_FIELD_CHECK' __BF_FIELD_CHECK(_mask, _reg, 0U, "FIELD_GET: "); \ ^~~~~~~~~~~~~~~~ drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c:1025:21: note: in expansion of macro 'FIELD_GET' le16_encode_bits(FIELD_GET(IWL_RX_HE_PHY_SIBG_SYM_OR_USER_NUM_MASK, The problem here is that the caller has no idea how the macro gets expanded, leading to a false-positive. It can be trivially avoided by doing a comparison against zero. This only recently started appearing as the iwlwifi driver was patched to use FIELD_GET. 
Link: http://lkml.kernel.org/r/20180813220950.194841-1-arnd@arndb.de Fixes: 514c30696fbc ("iwlwifi: add support for IEEE802.11ax") Signed-off-by: Arnd Bergmann Cc: Masahiro Yamada Cc: Johannes Berg Cc: Jakub Kicinski Cc: Andy Shevchenko Cc: Al Viro Cc: David Laight Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitfield.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index 65a6981eef7b..3f1ef4450a7c 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -53,7 +53,7 @@ ({ \ BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask), \ _pfx "mask is not constant"); \ - BUILD_BUG_ON_MSG(!(_mask), _pfx "mask is zero"); \ + BUILD_BUG_ON_MSG((_mask) == 0, _pfx "mask is zero"); \ BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \ ~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \ _pfx "value too large for the field"); \ From e1fb4a0864958fac2fb1b23f9f4562a9f90e3e8f Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 17 Aug 2018 15:43:40 -0700 Subject: [PATCH 004/111] dax: remove VM_MIXEDMAP for fsdax and device dax This patch is reworked from an earlier patch that Dan has posted: https://patchwork.kernel.org/patch/10131727/ VM_MIXEDMAP is used by dax to direct mm paths like vm_normal_page() that the memory page it is dealing with is not typical memory from the linear map. The get_user_pages_fast() path, since it does not resolve the vma, is already using {pte,pmd}_devmap() as a stand-in for VM_MIXEDMAP, so we use that as a VM_MIXEDMAP replacement in some locations. In the cases where there is no pte to consult we fallback to using vma_is_dax() to detect the VM_MIXEDMAP special case. Now that we have explicit driver pfn_t-flag opt-in/opt-out for get_user_pages() support for DAX we can stop setting VM_MIXEDMAP. This also means we no longer need to worry about safely manipulating vm_flags in a future where we support dynamically changing the dax mode of a file. 
DAX should also now be supported with madvise_behavior(), vma_merge(), and copy_page_range(). This patch has been tested against ndctl unit test. It has also been tested against xfstests commit: 625515d using fake pmem created by memmap and no additional issues have been observed. Link: http://lkml.kernel.org/r/152847720311.55924.16999195879201817653.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dave Jiang Acked-by: Dan Williams Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 2 +- fs/ext2/file.c | 1 - fs/ext4/file.c | 2 +- fs/xfs/xfs_file.c | 2 +- mm/hmm.c | 6 ++++-- mm/huge_memory.c | 4 ++-- mm/ksm.c | 3 +++ mm/memory.c | 6 ++++++ mm/migrate.c | 3 ++- mm/mlock.c | 3 ++- mm/mmap.c | 9 +++++---- 11 files changed, 27 insertions(+), 14 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 108c37fca782..0a2acd7993f0 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -474,7 +474,7 @@ static int dax_mmap(struct file *filp, struct vm_area_struct *vma) return rc; vma->vm_ops = &dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vma->vm_flags |= VM_HUGEPAGE; return 0; } diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 047c327a6b23..28b2609f25c1 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -126,7 +126,6 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &ext2_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP; return 0; } #else diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7f8023340eb8..69d65d49837b 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -374,7 +374,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vma->vm_flags |= VM_HUGEPAGE; } else { vma->vm_ops = &ext4_file_vm_ops; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 
181e9084519b..5eaef2c17293 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1169,7 +1169,7 @@ xfs_file_mmap( file_accessed(filp); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(file_inode(filp))) - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vma->vm_flags |= VM_HUGEPAGE; return 0; } diff --git a/mm/hmm.c b/mm/hmm.c index de7b6bf77201..f40e8add84b5 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -676,7 +676,8 @@ int hmm_vma_get_pfns(struct hmm_range *range) return -EINVAL; /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || + vma_is_dax(vma)) { hmm_pfns_special(range); return -EINVAL; } @@ -849,7 +850,8 @@ int hmm_vma_fault(struct hmm_range *range, bool block) return -EINVAL; /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || + vma_is_dax(vma)) { hmm_pfns_special(range); return -EINVAL; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a9e1e093df51..1ce44e87f494 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -762,11 +762,11 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit. 
*/ - BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && + !pfn_t_devmap(pfn)); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - BUG_ON(!pfn_t_devmap(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; diff --git a/mm/ksm.c b/mm/ksm.c index a6d43cf9a982..9b855a8b0f2d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2430,6 +2430,9 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, VM_HUGETLB | VM_MIXEDMAP)) return 0; /* just ignore the advice */ + if (vma_is_dax(vma)) + return 0; + #ifdef VM_SAO if (*vm_flags & VM_SAO) return 0; diff --git a/mm/memory.c b/mm/memory.c index 348279ff6e51..7c3bd119fcca 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -859,6 +859,10 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return NULL; } } + + if (pte_devmap(pte)) + return NULL; + print_bad_pte(vma, addr, pte, NULL); return NULL; } @@ -923,6 +927,8 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, } } + if (pmd_devmap(pmd)) + return NULL; if (is_zero_pfn(pfn)) return NULL; if (unlikely(pfn > highest_memmap_pfn)) diff --git a/mm/migrate.c b/mm/migrate.c index 8c0af0f7cab1..4a83268e23c2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2951,7 +2951,8 @@ int migrate_vma(const struct migrate_vma_ops *ops, /* Sanity check the arguments */ start &= PAGE_MASK; end &= PAGE_MASK; - if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) + if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || + vma_is_dax(vma)) return -EINVAL; if (start < vma->vm_start || start >= vma->vm_end) return -EINVAL; diff --git a/mm/mlock.c b/mm/mlock.c index 74e5a6547c3d..41cc47e28ad6 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -527,7 +527,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, vm_flags_t 
old_flags = vma->vm_flags; if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || - is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || + vma_is_dax(vma)) /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; diff --git a/mm/mmap.c b/mm/mmap.c index 17bbf4d3e24f..8d6449e74431 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1796,11 +1796,12 @@ out: vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { - if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current->mm))) - mm->locked_vm += (len >> PAGE_SHIFT); - else + if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current->mm)) vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + else + mm->locked_vm += (len >> PAGE_SHIFT); } if (file) From 2c1bb29aa6e7b0e52d84bd06bc199b0a5076a781 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Aug 2018 15:43:44 -0700 Subject: [PATCH 005/111] firewire: use 64-bit time_t based interfaces 32-bit CLOCK_REALTIME timestamps overflow in year 2038, so all such interfaces are deprecated now. For the FW_CDEV_IOC_GET_CYCLE_TIMER2 ioctl, we already support 64-bit timestamps, but the implementation still uses timespec. This changes the code to use timespec64 instead with the appropriate accessor functions. 
Link: http://lkml.kernel.org/r/20180711124456.1023039-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Stefan Richter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firewire/core-cdev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index f0587273940e..d8e185582642 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -1205,7 +1205,7 @@ static int ioctl_get_cycle_timer2(struct client *client, union ioctl_arg *arg) { struct fw_cdev_get_cycle_timer2 *a = &arg->get_cycle_timer2; struct fw_card *card = client->device->card; - struct timespec ts = {0, 0}; + struct timespec64 ts = {0, 0}; u32 cycle_time; int ret = 0; @@ -1214,9 +1214,9 @@ static int ioctl_get_cycle_timer2(struct client *client, union ioctl_arg *arg) cycle_time = card->driver->read_csr(card, CSR_CYCLE_TIME); switch (a->clk_id) { - case CLOCK_REALTIME: getnstimeofday(&ts); break; - case CLOCK_MONOTONIC: ktime_get_ts(&ts); break; - case CLOCK_MONOTONIC_RAW: getrawmonotonic(&ts); break; + case CLOCK_REALTIME: ktime_get_real_ts64(&ts); break; + case CLOCK_MONOTONIC: ktime_get_ts64(&ts); break; + case CLOCK_MONOTONIC_RAW: ktime_get_raw_ts64(&ts); break; default: ret = -EINVAL; } From a3fda0ffeaf0114328024aee4a9ec3b08af4b077 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Aug 2018 15:43:47 -0700 Subject: [PATCH 006/111] fs/ufs: use ktime_get_real_seconds for sb and cg timestamps get_seconds() is deprecated because of the 32-bit overflow and will be removed. All callers in ufs also truncate to a 32-bit number, so nothing changes during the conversion, but this should be harmless as the superblock and cylinder group timestamps are not visible to user space, except for checking the fs-dirty state, which works fine across the overflow. 
This moves the call to get_seconds() into a new inline function, with a comment explaining the constraints, while converting it to ktime_get_real_seconds(). Link: http://lkml.kernel.org/r/20180718115017.742609-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Thomas Gleixner Cc: Al Viro Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/balloc.c | 4 ++-- fs/ufs/ialloc.c | 2 +- fs/ufs/super.c | 4 ++-- fs/ufs/util.h | 14 ++++++++++++++ 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index e727ee07dbe4..075d3d9114c8 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -547,7 +547,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, /* * Block can be extended */ - ucg->cg_time = cpu_to_fs32(sb, get_seconds()); + ucg->cg_time = ufs_get_seconds(sb); for (i = newcount; i < (uspi->s_fpb - fragoff); i++) if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) break; @@ -639,7 +639,7 @@ cg_found: if (!ufs_cg_chkmagic(sb, ucg)) ufs_panic (sb, "ufs_alloc_fragments", "internal error, bad magic number on cg %u", cgno); - ucg->cg_time = cpu_to_fs32(sb, get_seconds()); + ucg->cg_time = ufs_get_seconds(sb); if (count == uspi->s_fpb) { result = ufs_alloccg_block (inode, ucpi, goal, err); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 02c0a4be4212..969fd60436d3 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -89,7 +89,7 @@ void ufs_free_inode (struct inode * inode) if (!ufs_cg_chkmagic(sb, ucg)) ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number"); - ucg->cg_time = cpu_to_fs32(sb, get_seconds()); + ucg->cg_time = ufs_get_seconds(sb); is_directory = S_ISDIR(inode->i_mode); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 488088141451..a4e07e910f1b 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -698,7 +698,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait) usb1 = ubh_get_usb_first(uspi); usb3 = ubh_get_usb_third(uspi); - 
usb1->fs_time = cpu_to_fs32(sb, get_seconds()); + usb1->fs_time = ufs_get_seconds(sb); if ((flags & UFS_ST_MASK) == UFS_ST_SUN || (flags & UFS_ST_MASK) == UFS_ST_SUNOS || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) @@ -1342,7 +1342,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) */ if (*mount_flags & SB_RDONLY) { ufs_put_super_internal(sb); - usb1->fs_time = cpu_to_fs32(sb, get_seconds()); + usb1->fs_time = ufs_get_seconds(sb); if ((flags & UFS_ST_MASK) == UFS_ST_SUN || (flags & UFS_ST_MASK) == UFS_ST_SUNOS || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 1907be6d5808..1fd3011ea623 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -590,3 +590,17 @@ static inline int ufs_is_data_ptr_zero(struct ufs_sb_private_info *uspi, else return *(__fs32 *)p == 0; } + +static inline __fs32 ufs_get_seconds(struct super_block *sbp) +{ + time64_t now = ktime_get_real_seconds(); + + /* Signed 32-bit interpretation wraps around in 2038, which + * happens in ufs1 inode stamps but not ufs2 using 64-bits + * stamps. For superblock and blockgroup, let's assume + * unsigned 32-bit stamps, which are good until y2106. + * Wrap around rather than clamp here to make the dirty + * file system detection work in the superblock stamp. + */ + return cpu_to_fs32(sbp, lower_32_bits(now)); +} From bcf451ecfc8d45618d13c9e4abcbbd770af20cc9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Aug 2018 15:43:50 -0700 Subject: [PATCH 007/111] fs/ntfs: use timespec64 directly for timestamp conversion Now that the VFS has been converted from timespec to timespec64 timestamps, only the conversion to/from ntfs timestamps uses 32-bit seconds. This changes that last missing piece to get the ntfs implementation y2038 safe on 32-bit architectures. 
Link: http://lkml.kernel.org/r/20180718115017.742609-2-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Anton Altaparmakov Cc: Al Viro Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/inode.c | 12 ++++++------ fs/ntfs/time.h | 27 +++++++++++++++------------ 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index decaf75d1cd5..bd3221cbdd95 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -667,18 +667,18 @@ static int ntfs_read_locked_inode(struct inode *vi) * mtime is the last change of the data within the file. Not changed * when only metadata is changed, e.g. a rename doesn't affect mtime. */ - vi->i_mtime = timespec_to_timespec64(ntfs2utc(si->last_data_change_time)); + vi->i_mtime = ntfs2utc(si->last_data_change_time); /* * ctime is the last change of the metadata of the file. This obviously * always changes, when mtime is changed. ctime can be changed on its * own, mtime is then not changed, e.g. when a file is renamed. */ - vi->i_ctime = timespec_to_timespec64(ntfs2utc(si->last_mft_change_time)); + vi->i_ctime = ntfs2utc(si->last_mft_change_time); /* * Last access to the data within the file. Not changed during a rename * for example but changed whenever the file is written to. */ - vi->i_atime = timespec_to_timespec64(ntfs2utc(si->last_access_time)); + vi->i_atime = ntfs2utc(si->last_access_time); /* Find the attribute list attribute if present. */ ntfs_attr_reinit_search_ctx(ctx); @@ -2997,7 +2997,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si = (STANDARD_INFORMATION*)((u8*)ctx->attr + le16_to_cpu(ctx->attr->data.resident.value_offset)); /* Update the access times if they have changed. 
*/ - nt = utc2ntfs(timespec64_to_timespec(vi->i_mtime)); + nt = utc2ntfs(vi->i_mtime); if (si->last_data_change_time != nt) { ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, (long long) @@ -3006,7 +3006,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si->last_data_change_time = nt; modified = true; } - nt = utc2ntfs(timespec64_to_timespec(vi->i_ctime)); + nt = utc2ntfs(vi->i_ctime); if (si->last_mft_change_time != nt) { ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, (long long) @@ -3015,7 +3015,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si->last_mft_change_time = nt; modified = true; } - nt = utc2ntfs(timespec64_to_timespec(vi->i_atime)); + nt = utc2ntfs(vi->i_atime); if (si->last_access_time != nt) { ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h index 01233989d5d1..24cd719f1fd2 100644 --- a/fs/ntfs/time.h +++ b/fs/ntfs/time.h @@ -36,16 +36,16 @@ * Convert the Linux UTC time @ts to its corresponding NTFS time and return * that in little endian format. * - * Linux stores time in a struct timespec consisting of a time_t (long at - * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second - * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of - * 1-nano-second intervals since the value of tv_sec. + * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec + * and a long tv_nsec where tv_sec is the number of 1-second intervals since + * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second + * intervals since the value of tv_sec. * * NTFS uses Microsoft's standard time format which is stored in a s64 and is * measured as the number of 100-nano-second intervals since 1st January 1601, * 00:00:00 UTC. 
*/ -static inline sle64 utc2ntfs(const struct timespec ts) +static inline sle64 utc2ntfs(const struct timespec64 ts) { /* * Convert the seconds to 100ns intervals, add the nano-seconds @@ -63,7 +63,10 @@ static inline sle64 utc2ntfs(const struct timespec ts) */ static inline sle64 get_current_ntfs_time(void) { - return utc2ntfs(current_kernel_time()); + struct timespec64 ts; + + ktime_get_coarse_real_ts64(&ts); + return utc2ntfs(ts); } /** @@ -73,18 +76,18 @@ static inline sle64 get_current_ntfs_time(void) * Convert the little endian NTFS time @time to its corresponding Linux UTC * time and return that in cpu format. * - * Linux stores time in a struct timespec consisting of a time_t (long at - * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second - * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of - * 1-nano-second intervals since the value of tv_sec. + * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec + * and a long tv_nsec where tv_sec is the number of 1-second intervals since + * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second + * intervals since the value of tv_sec. * * NTFS uses Microsoft's standard time format which is stored in a s64 and is * measured as the number of 100 nano-second intervals since 1st January 1601, * 00:00:00 UTC. */ -static inline struct timespec ntfs2utc(const sle64 time) +static inline struct timespec64 ntfs2utc(const sle64 time) { - struct timespec ts; + struct timespec64 ts; /* Subtract the NTFS time offset. */ u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET); From f08957d0ffe91f346c47cef95139c54aa7275cfe Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Aug 2018 15:43:54 -0700 Subject: [PATCH 008/111] fs/hpfs: extend gmt_to_local() conversion to 64-bit times The VFS timestamps are all 64-bit now, the only missing piece for hpfs is the internal conversion function. 
One interesting bit about hpfs is that it can already deal with moving the 136 year window of its timestamps to support a much wider range than other file systems with 32-bit timestamps. It also treats the timestamps as 'unsigned' on 64-bit architectures (but signed on 32-bit, because time_t always wraps around to negative numbers in 2038). Changing the conversion to use time64_t makes 32-bit architectures behave the same way as 64-bit. For completeness, this also adds a clamp_t call for each conversion, so we don't wrap the timestamps but instead stay within the [0..U32_MAX] range of the on-disk timestamps. Link: http://lkml.kernel.org/r/20180718115017.742609-3-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Mikulas Patocka Cc: Al Viro Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hpfs/hpfs_fn.h | 13 ++++++++++--- fs/hpfs/namei.c | 12 ++++++------ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 2a153aed4c19..ab2e7cc2ff33 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -334,16 +334,23 @@ long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg); * local time (HPFS) to GMT (Unix) */ -static inline time_t local_to_gmt(struct super_block *s, time32_t t) +static inline time64_t local_to_gmt(struct super_block *s, time32_t t) { extern struct timezone sys_tz; return t + sys_tz.tz_minuteswest * 60 + hpfs_sb(s)->sb_timeshift; } -static inline time32_t gmt_to_local(struct super_block *s, time_t t) +static inline time32_t gmt_to_local(struct super_block *s, time64_t t) { extern struct timezone sys_tz; - return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; + t = t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; + + return clamp_t(time64_t, t, 0, U32_MAX); +} + +static inline time32_t local_get_seconds(struct super_block *s) +{ + return gmt_to_local(s, ktime_get_real_seconds()); } /* diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 
a3615e4c730d..082b7c76dd0c 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -11,7 +11,7 @@ static void hpfs_update_directory_times(struct inode *dir) { - time_t t = get_seconds(); + time64_t t = local_to_gmt(dir->i_sb, local_get_seconds(dir->i_sb)); if (t == dir->i_mtime.tv_sec && t == dir->i_ctime.tv_sec) return; @@ -50,7 +50,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) /*dee.archive = 0;*/ dee.hidden = name[0] == '.'; dee.fnode = cpu_to_le32(fno); - dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb)); result = new_inode(dir->i_sb); if (!result) goto bail2; @@ -91,7 +91,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) dnode->root_dnode = 1; dnode->up = cpu_to_le32(fno); de = hpfs_add_de(dir->i_sb, dnode, "\001\001", 2, 0); - de->creation_date = de->write_date = de->read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + de->creation_date = de->write_date = de->read_date = cpu_to_le32(local_get_seconds(dir->i_sb)); if (!(mode & 0222)) de->read_only = 1; de->first = de->directory = 1; /*de->hidden = de->system = 0;*/ @@ -151,7 +151,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, b dee.archive = 1; dee.hidden = name[0] == '.'; dee.fnode = cpu_to_le32(fno); - dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb)); result = new_inode(dir->i_sb); if (!result) @@ -238,7 +238,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de dee.archive = 1; dee.hidden = name[0] == '.'; dee.fnode = cpu_to_le32(fno); - dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + 
dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb)); result = new_inode(dir->i_sb); if (!result) @@ -314,7 +314,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy dee.archive = 1; dee.hidden = name[0] == '.'; dee.fnode = cpu_to_le32(fno); - dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb)); result = new_inode(dir->i_sb); if (!result) From fde5e903fb5893415fa49d280e998be226d0898f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Aug 2018 15:43:57 -0700 Subject: [PATCH 009/111] scripts/spdxcheck.py: work with current HEAD LICENSES/ directory Depending on how old your -next tree is, it may not have a master that has the LICENSES directory. Change the lookup to HEAD and find whatever LICENSE directory files are used in that branch. Miscellanea: - Remove the checkpatch test as it will have its own SPDX license identifier. 
Link: http://lkml.kernel.org/r/7eeefc862194930c773e662cb2152e178441d3b8.camel@perches.com Signed-off-by: Joe Perches Reviewed-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spdxcheck.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/spdxcheck.py b/scripts/spdxcheck.py index 7deaef297f52..a6041f29b18e 100755 --- a/scripts/spdxcheck.py +++ b/scripts/spdxcheck.py @@ -32,7 +32,7 @@ def read_spdxdata(repo): # The subdirectories of LICENSES in the kernel source license_dirs = [ "preferred", "other", "exceptions" ] - lictree = repo.heads.master.commit.tree['LICENSES'] + lictree = repo.head.commit.tree['LICENSES'] spdx = SPDXdata() @@ -199,8 +199,6 @@ def scan_git_tree(tree): continue if el.path.find("license-rules.rst") >= 0: continue - if el.path == 'scripts/checkpatch.pl': - continue if not os.path.isfile(el.path): continue parser.parse_lines(open(el.path), args.maxlines, el.path) From bed95c43c15eb6b1ccc5b09e5ae08cac726c456d Mon Sep 17 00:00:00 2001 From: Jeremy Cline Date: Fri, 17 Aug 2018 15:44:01 -0700 Subject: [PATCH 010/111] scripts: add Python 3 compatibility to spdxcheck.py "dict.has_key(key)" on dictionaries has been replaced with "key in dict". Additionally, when run under Python 3 some files don't decode with the default encoding (tested with UTF-8). To handle that, don't open the file in text mode and decode text line-by-line, ignoring encoding errors. This remains compatible with Python 2 and should have no functional change. 
Link: http://lkml.kernel.org/r/20180717190635.29467-1-jcline@redhat.com Signed-off-by: Jeremy Cline Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spdxcheck.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/spdxcheck.py b/scripts/spdxcheck.py index a6041f29b18e..839e190bbd7a 100755 --- a/scripts/spdxcheck.py +++ b/scripts/spdxcheck.py @@ -4,6 +4,7 @@ from argparse import ArgumentParser from ply import lex, yacc +import locale import traceback import sys import git @@ -102,7 +103,7 @@ class id_parser(object): raise ParserException(tok, 'Invalid License ID') self.lastid = id elif tok.type == 'EXC': - if not self.spdx.exceptions.has_key(id): + if id not in self.spdx.exceptions: raise ParserException(tok, 'Invalid Exception ID') if self.lastid not in self.spdx.exceptions[id]: raise ParserException(tok, 'Exception not valid for license %s' %self.lastid) @@ -167,6 +168,7 @@ class id_parser(object): self.curline = 0 try: for line in fd: + line = line.decode(locale.getpreferredencoding(False), errors='ignore') self.curline += 1 if self.curline > maxlines: break @@ -201,7 +203,8 @@ def scan_git_tree(tree): continue if not os.path.isfile(el.path): continue - parser.parse_lines(open(el.path), args.maxlines, el.path) + with open(el.path, 'rb') as fd: + parser.parse_lines(fd, args.maxlines, el.path) def scan_git_subtree(tree, path): for p in path.strip('/').split('/'): From a10dcebacdb0cf6eb29c211e99cf190cd131a16a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 17 Aug 2018 15:44:04 -0700 Subject: [PATCH 011/111] fs/ntfs/aops.c: don't disable interrupts during kmap_atomic() ntfs_end_buffer_async_read() disables interrupts around kmap_atomic(). This is a leftover from the old kmap_atomic() implementation which relied on fixed mapping slots, so the caller had to make sure that the same slot could not be reused from an interrupting context. 
kmap_atomic() was changed to dynamic slots long ago and commit 1ec9c5ddc17a ("include/linux/highmem.h: remove the second argument of k[un]map_atomic()") removed the slot assignments, but the callers were not checked for now redundant interrupt disabling. Remove the conditional interrupt disable. Link: http://lkml.kernel.org/r/20180611144913.gln5mklhqcrfsoom@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Cc: Anton Altaparmakov Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/aops.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 3a2e509c77c5..01c770979921 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -93,13 +93,11 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) ofs = 0; if (file_ofs < init_size) ofs = init_size - file_ofs; - local_irq_save(flags); kaddr = kmap_atomic(page); memset(kaddr + bh_offset(bh) + ofs, 0, bh->b_size - ofs); flush_dcache_page(page); kunmap_atomic(kaddr); - local_irq_restore(flags); } } else { clear_buffer_uptodate(bh); @@ -146,13 +144,11 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) recs = PAGE_SIZE / rec_size; /* Should have been verified before we got here... */ BUG_ON(!recs); - local_irq_save(flags); kaddr = kmap_atomic(page); for (i = 0; i < recs; i++) post_read_mst_fixup((NTFS_RECORD*)(kaddr + i * rec_size), rec_size); kunmap_atomic(kaddr); - local_irq_restore(flags); flush_dcache_page(page); if (likely(page_uptodate && !PageError(page))) SetPageUptodate(page); From ac4ecf968acb9e54c335f99d842d56d6b90e28fb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Aug 2018 15:44:07 -0700 Subject: [PATCH 012/111] ntfs: aops: remove VLA usage In the quest to remove all stack VLA usage from the kernel[1], this uses the maximum size needed on the stack and adds a sanity check for robustness: index.block_size cannot be larger than PAGE_SIZE nor less than NTFS_BLOCK_SIZE.
[1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com Link: http://lkml.kernel.org/r/20180626172909.41453-2-keescook@chromium.org Signed-off-by: Kees Cook Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/aops.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 01c770979921..8946130c87ad 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -922,7 +922,7 @@ static int ntfs_write_mst_block(struct page *page, ntfs_volume *vol = ni->vol; u8 *kaddr; unsigned int rec_size = ni->itype.index.block_size; - ntfs_inode *locked_nis[PAGE_SIZE / rec_size]; + ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE]; struct buffer_head *bh, *head, *tbh, *rec_start_bh; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; runlist_element *rl; @@ -931,6 +931,9 @@ static int ntfs_write_mst_block(struct page *page, bool sync, is_mft, page_is_dirty, rec_is_dirty; unsigned char bh_size_bits; + if (WARN_ON(rec_size < NTFS_BLOCK_SIZE)) + return -EINVAL; + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " "0x%lx.", vi->i_ino, ni->type, page->index); BUG_ON(!NInoNonResident(ni)); From 2c27ce915078a5822aefb5db7bc2481664b26044 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Aug 2018 15:44:11 -0700 Subject: [PATCH 013/111] ntfs: decompress: remove VLA usage In the quest to remove all stack VLA usage from the kernel[1], this moves the stack buffer used during decompression to be allocated externally. The existing "dest_max_index" used in the VLA is bounded by cb_max_page. cb_max_page is bounded by max_page, and max_page is bounded by nr_pages. Since nr_pages is used for the "pages" allocation, it can similarly be used for the "completed_pages" allocation and passed into the decompression function. The error paths are updated to free the new allocation. 
[1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com Link: http://lkml.kernel.org/r/20180626172909.41453-3-keescook@chromium.org Signed-off-by: Kees Cook Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/compress.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index fbd0090d7d0c..df7c32b5fac7 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -128,6 +128,7 @@ static inline void handle_bounds_compressed_page(struct page *page, /** * ntfs_decompress - decompress a compression block into an array of pages * @dest_pages: destination array of pages + * @completed_pages: scratch space to track completed pages * @dest_index: current index into @dest_pages (IN/OUT) * @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT) * @dest_max_index: maximum index into @dest_pages (IN) @@ -162,10 +163,10 @@ static inline void handle_bounds_compressed_page(struct page *page, * Note to hackers: This function may not sleep until it has finished accessing * the compression block @cb_start as it is a per-CPU buffer. */ -static int ntfs_decompress(struct page *dest_pages[], int *dest_index, - int *dest_ofs, const int dest_max_index, const int dest_max_ofs, - const int xpage, char *xpage_done, u8 *const cb_start, - const u32 cb_size, const loff_t i_size, +static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], + int *dest_index, int *dest_ofs, const int dest_max_index, + const int dest_max_ofs, const int xpage, char *xpage_done, + u8 *const cb_start, const u32 cb_size, const loff_t i_size, const s64 initialized_size) { /* @@ -190,9 +191,6 @@ static int ntfs_decompress(struct page *dest_pages[], int *dest_index, /* Variables for tag and token parsing. */ u8 tag; /* Current tag. */ int token; /* Loop counter for the eight tokens in tag. 
*/ - - /* Need this because we can't sleep, so need two stages. */ - int completed_pages[dest_max_index - *dest_index + 1]; int nr_completed_pages = 0; /* Default error code. */ @@ -516,6 +514,7 @@ int ntfs_read_compressed_block(struct page *page) unsigned int cb_clusters, cb_max_ofs; int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0; struct page **pages; + int *completed_pages; unsigned char xpage_done = 0; ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = " @@ -528,14 +527,16 @@ int ntfs_read_compressed_block(struct page *page) BUG_ON(ni->name_len); pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); + completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS); /* Allocate memory to store the buffer heads we need. */ bhs_size = cb_size / block_size * sizeof(struct buffer_head *); bhs = kmalloc(bhs_size, GFP_NOFS); - if (unlikely(!pages || !bhs)) { + if (unlikely(!pages || !bhs || !completed_pages)) { kfree(bhs); kfree(pages); + kfree(completed_pages); unlock_page(page); ntfs_error(vol->sb, "Failed to allocate internal buffers."); return -ENOMEM; @@ -562,6 +563,7 @@ int ntfs_read_compressed_block(struct page *page) if (xpage >= max_page) { kfree(bhs); kfree(pages); + kfree(completed_pages); zero_user(page, 0, PAGE_SIZE); ntfs_debug("Compressed read outside i_size - truncated?"); SetPageUptodate(page); @@ -854,10 +856,10 @@ lock_retry_remap: unsigned int prev_cur_page = cur_page; ntfs_debug("Found compressed compression block."); - err = ntfs_decompress(pages, &cur_page, &cur_ofs, - cb_max_page, cb_max_ofs, xpage, &xpage_done, - cb_pos, cb_size - (cb_pos - cb), i_size, - initialized_size); + err = ntfs_decompress(pages, completed_pages, &cur_page, + &cur_ofs, cb_max_page, cb_max_ofs, xpage, + &xpage_done, cb_pos, cb_size - (cb_pos - cb), + i_size, initialized_size); /* * We can sleep from now on, lock already dropped by * ntfs_decompress(). 
@@ -912,6 +914,7 @@ lock_retry_remap: /* We no longer need the list of pages. */ kfree(pages); + kfree(completed_pages); /* If we have completed the requested page, we return success. */ if (likely(xpage_done)) @@ -956,5 +959,6 @@ err_out: } } kfree(pages); + kfree(completed_pages); return -EIO; } From ab62ef82ea49b8814f4b0e2fe61426acda793fb9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Aug 2018 15:44:14 -0700 Subject: [PATCH 014/111] ntfs: mft: remove VLA usage In the quest to remove all stack VLA usage from the kernel[1], this allocates the maximum size stack buffer. Existing checks already require that blocksize >= NTFS_BLOCK_SIZE and mft_record_size <= PAGE_SIZE, so max_bhs can be at most PAGE_SIZE / NTFS_BLOCK_SIZE. Sanity checks are added for robustness. [1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com Link: http://lkml.kernel.org/r/20180626172909.41453-4-keescook@chromium.org Signed-off-by: Kees Cook Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/mft.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 32c523cf5a2d..fb14d17666c8 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -35,6 +35,8 @@ #include "mft.h" #include "ntfs.h" +#define MAX_BHS (PAGE_SIZE / NTFS_BLOCK_SIZE) + /** * map_mft_record_page - map the page in which a specific mft record resides * @ni: ntfs inode whose mft record page to map @@ -469,7 +471,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, struct page *page; unsigned int blocksize = vol->sb->s_blocksize; int max_bhs = vol->mft_record_size / blocksize; - struct buffer_head *bhs[max_bhs]; + struct buffer_head *bhs[MAX_BHS]; struct buffer_head *bh, *head; u8 *kmirr; runlist_element *rl; @@ -479,6 +481,8 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, ntfs_debug("Entering for inode 0x%lx.", mft_no); BUG_ON(!max_bhs); + 
if (WARN_ON(max_bhs > MAX_BHS)) + return -EINVAL; if (unlikely(!vol->mftmirr_ino)) { /* This could happen during umount... */ err = ntfs_sync_mft_mirror_umount(vol, mft_no, m); @@ -674,7 +678,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) unsigned int blocksize = vol->sb->s_blocksize; unsigned char blocksize_bits = vol->sb->s_blocksize_bits; int max_bhs = vol->mft_record_size / blocksize; - struct buffer_head *bhs[max_bhs]; + struct buffer_head *bhs[MAX_BHS]; struct buffer_head *bh, *head; runlist_element *rl; unsigned int block_start, block_end, m_start, m_end; @@ -684,6 +688,10 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) BUG_ON(NInoAttr(ni)); BUG_ON(!max_bhs); BUG_ON(!PageLocked(page)); + if (WARN_ON(max_bhs > MAX_BHS)) { + err = -EINVAL; + goto err_out; + } /* * If the ntfs_inode is clean no need to do anything. If it is dirty, * mark it as clean now so that it can be redirtied later on if needed. From 82f7c5103dcdd714486a64cfc5783cc4608d124c Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 17 Aug 2018 15:44:17 -0700 Subject: [PATCH 015/111] sh: make use of for_each_node_by_type() Instead of open-coding the loop, let's use canned macro. Also make sure we are not leaking "cpus" node reference. 
Link: http://lkml.kernel.org/r/20180624224252.GA220395@dtor-ws Signed-off-by: Dmitry Torokhov Reviewed-by: Andrew Morton Cc: Yoshinori Sato Cc: Rich Felker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/boards/of-generic.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/sh/boards/of-generic.c b/arch/sh/boards/of-generic.c index 46b2481eec90..26789ad28193 100644 --- a/arch/sh/boards/of-generic.c +++ b/arch/sh/boards/of-generic.c @@ -56,15 +56,15 @@ const struct of_cpu_method __cpu_method_of_table_sentinel static void sh_of_smp_probe(void) { - struct device_node *np = 0; - const char *method = 0; + struct device_node *np; + const char *method = NULL; const struct of_cpu_method *m = __cpu_method_of_table; pr_info("SH generic board support: scanning for cpus\n"); init_cpu_possible(cpumask_of(0)); - while ((np = of_find_node_by_type(np, "cpu"))) { + for_each_node_by_type(np, "cpu") { const __be32 *cell = of_get_property(np, "reg", NULL); u64 id = -1; if (cell) id = of_read_number(cell, of_n_addr_cells(np)); @@ -80,6 +80,7 @@ static void sh_of_smp_probe(void) if (!method) { np = of_find_node_by_name(NULL, "cpus"); of_property_read_string(np, "enable-method", &method); + of_node_put(np); } pr_info("CPU enable method: %s\n", method); From 8d00d0c00c0720c43b0eec0e86a6e916192f35d0 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 17 Aug 2018 15:44:21 -0700 Subject: [PATCH 016/111] sh: prefer _THIS_IP_ to current_text_addr As part of the effort to reduce the code duplication between _THIS_IP_ and current_text_addr(), let's consolidate callers of current_text_addr() to use _THIS_IP_. 
Link: http://lkml.kernel.org/r/20180801185331.39535-1-ndesaulniers@google.com Signed-off-by: Nick Desaulniers Cc: Yoshinori Sato Cc: Rich Felker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/asm/kexec.h | 3 ++- arch/sh/kernel/dwarf.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/sh/include/asm/kexec.h b/arch/sh/include/asm/kexec.h index fd5f331a3912..927d80ba2332 100644 --- a/arch/sh/include/asm/kexec.h +++ b/arch/sh/include/asm/kexec.h @@ -4,6 +4,7 @@ #include #include +#include /* * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. @@ -61,7 +62,7 @@ static inline void crash_setup_regs(struct pt_regs *newregs, __asm__ __volatile__ ("stc gbr, %0" : "=r" (newregs->gbr)); __asm__ __volatile__ ("stc sr, %0" : "=r" (newregs->sr)); - newregs->pc = (unsigned long)current_text_addr(); + newregs->pc = _THIS_IP_; } } #else diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c index 1a2526676a87..bb511e2d9d68 100644 --- a/arch/sh/kernel/dwarf.c +++ b/arch/sh/kernel/dwarf.c @@ -599,7 +599,7 @@ struct dwarf_frame *dwarf_unwind_stack(unsigned long pc, * time this function makes its first function call. */ if (!pc || !prev) - pc = (unsigned long)current_text_addr(); + pc = _THIS_IP_; #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* From 93f5920d8607c5e3f2d3b159377a7e7d7875ffdd Mon Sep 17 00:00:00 2001 From: Jun Piao Date: Fri, 17 Aug 2018 15:44:24 -0700 Subject: [PATCH 017/111] ocfs2: return -EROFS when filesystem becomes read-only We should return -EROFS rather than other errno if filesystem becomes read-only. 
[akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/5B191B26.9010501@huawei.com Signed-off-by: Jun Piao Reviewed-by: Yiwen Jiang Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 43 +++++++++++++++++++----------------------- fs/ocfs2/localalloc.c | 9 ++++----- fs/ocfs2/quota_local.c | 15 +++++++-------- 3 files changed, 30 insertions(+), 37 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 0f157bbd3e0f..676714fef869 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1481,19 +1481,17 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { - ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty extent list (next_free_rec == 0)\n", - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); - status = -EIO; + status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty extent list (next_free_rec == 0)\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); goto bail; } i = le16_to_cpu(el->l_next_free_rec) - 1; blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (!blkno) { - ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has extent list where extent # %d has no physical block start\n", - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); - status = -EIO; + status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has extent list where extent # %d has no physical block start\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); goto bail; } @@ -3214,11 +3212,10 @@ rightmost_no_delete: goto rightmost_no_delete; if (le16_to_cpu(el->l_next_free_rec) == 0) { - ret = -EIO; - ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty extent block at %llu\n", - 
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), - (unsigned long long)le64_to_cpu(eb->h_blkno)); + ret = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty extent block at %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)le64_to_cpu(eb->h_blkno)); goto out; } @@ -4411,12 +4408,11 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, le16_to_cpu(new_el->l_count)) { bh = path_leaf_bh(left_path); eb = (struct ocfs2_extent_block *)bh->b_data; - ocfs2_error(sb, - "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n", - (unsigned long long)le64_to_cpu(eb->h_blkno), - le16_to_cpu(new_el->l_next_free_rec), - le16_to_cpu(new_el->l_count)); - status = -EINVAL; + status = ocfs2_error(sb, + "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec), + le16_to_cpu(new_el->l_count)); goto free_left_path; } rec = &new_el->l_recs[ @@ -4466,11 +4462,10 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { bh = path_leaf_bh(right_path); eb = (struct ocfs2_extent_block *)bh->b_data; - ocfs2_error(sb, - "Extent block #%llu has an invalid l_next_free_rec of %d\n", - (unsigned long long)le64_to_cpu(eb->h_blkno), - le16_to_cpu(new_el->l_next_free_rec)); - status = -EINVAL; + status = ocfs2_error(sb, + "Extent block #%llu has an invalid l_next_free_rec of %d\n", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec)); goto free_right_path; } rec = &new_el->l_recs[1]; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index fe0d1f9571bb..7642b6712c39 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -663,11 +663,10 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, #ifdef 
CONFIG_OCFS2_DEBUG_FS if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { - ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n", - (unsigned long long)le64_to_cpu(alloc->i_blkno), - le32_to_cpu(alloc->id1.bitmap1.i_used), - ocfs2_local_alloc_count_bits(alloc)); - status = -EIO; + status = ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n", + (unsigned long long)le64_to_cpu(alloc->i_blkno), + le32_to_cpu(alloc->id1.bitmap1.i_used), + ocfs2_local_alloc_count_bits(alloc)); goto bail; } #endif diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 16c42ed0dca8..b1a8b046f4c2 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -137,14 +137,13 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, int rc = 0; struct buffer_head *tmp = *bh; - if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { - ocfs2_error(inode->i_sb, - "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)v_block, - (unsigned long long)i_size_read(inode)); - return -EIO; - } + if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) + return ocfs2_error(inode->i_sb, + "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_block, + (unsigned long long)i_size_read(inode)); + rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, ocfs2_validate_quota_block); if (rc) From 229ba1f82abe4c942e2ab5862daafdfe471fedd8 Mon Sep 17 00:00:00 2001 From: wangyan Date: Fri, 17 Aug 2018 15:44:27 -0700 Subject: [PATCH 018/111] ocfs2: clean up some unnecessary code Several functions have some unnecessary code, clean up these code. 
Link: http://lkml.kernel.org/r/5B14DF72.5020800@huawei.com Signed-off-by: Yan Wang Reviewed-by: Jun Piao Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 17 ++++------------- fs/ocfs2/cluster/tcp.c | 2 -- fs/ocfs2/inode.c | 5 +---- 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 676714fef869..a342f008e42f 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -932,13 +932,11 @@ static int ocfs2_validate_extent_block(struct super_block *sb, goto bail; } - if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { + if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) rc = ocfs2_error(sb, "Extent block #%llu has an invalid h_fs_generation of #%u\n", (unsigned long long)bh->b_blocknr, le32_to_cpu(eb->h_fs_generation)); - goto bail; - } bail: return rc; } @@ -1596,10 +1594,8 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, * the new data. 
*/ ret = ocfs2_add_branch(handle, et, bh, last_eb_bh, meta_ac); - if (ret < 0) { + if (ret < 0) mlog_errno(ret); - goto out; - } out: if (final_depth) @@ -5518,10 +5514,8 @@ static int ocfs2_truncate_rec(handle_t *handle, ocfs2_journal_dirty(handle, path_leaf_bh(path)); ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); - if (ret) { + if (ret) mlog_errno(ret); - goto out; - } out: ocfs2_free_path(left_path); @@ -5654,10 +5648,8 @@ int ocfs2_remove_extent(handle_t *handle, ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, cpos, len); - if (ret) { + if (ret) mlog_errno(ret); - goto out; - } } out: @@ -5702,7 +5694,6 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode, if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); - goto out; } } diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 1296f78ae966..7d9eea7d4a87 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -872,8 +872,6 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, "for type %u key %08x\n", msg_type, key); } write_unlock(&o2net_handler_lock); - if (ret) - goto out; out: if (ret) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index ddc3e9470c87..79279240fb6e 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -637,10 +637,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, handle = NULL; status = ocfs2_commit_truncate(osb, inode, fe_bh); - if (status < 0) { + if (status < 0) mlog_errno(status); - goto out; - } } out: @@ -1499,7 +1497,6 @@ static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, (unsigned long long)bh->b_blocknr, le32_to_cpu(di->i_fs_generation)); rc = -OCFS2_FILECHECK_ERR_GENERATION; - goto bail; } bail: From 480bd56485b77c36e17a411921266c6f06623d98 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 17 Aug 2018 15:44:31 -0700 Subject: [PATCH 019/111] ocfs2: make several functions and variables static (and some const) There are a variety of functions and variables that are 
local to the source and do not need to be in global scope, so make them static. Also make a couple of char arrays static const. Cleans up sparse warnings: symbol 'o2hb_heartbeat_mode_desc' was not declared. Should it be static? symbol 'o2hb_heartbeat_mode' was not declared. Should it be static? symbol 'o2hb_dependent_users' was not declared. Should it be static? symbol 'o2hb_region_dec_user' was not declared. Should it be static? symbol 'o2nm_fence_method_desc' was not declared. Should it be static? symbol 'lockdep_keys' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20180628131659.12133-1-colin.king@canonical.com Signed-off-by: Colin Ian King Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 12 ++++++------ fs/ocfs2/cluster/nodemanager.c | 6 +++--- fs/ocfs2/dlmglue.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ea8c551bcd7e..9b2ed62dd638 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -127,13 +127,13 @@ enum o2hb_heartbeat_modes { O2HB_HEARTBEAT_NUM_MODES, }; -char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { - "local", /* O2HB_HEARTBEAT_LOCAL */ - "global", /* O2HB_HEARTBEAT_GLOBAL */ +static const char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { + "local", /* O2HB_HEARTBEAT_LOCAL */ + "global", /* O2HB_HEARTBEAT_GLOBAL */ }; unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; -unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; +static unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; /* * o2hb_dependent_users tracks the number of registered callbacks that depend @@ -141,7 +141,7 @@ unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; * However only o2dlm depends on the heartbeat. It does not want the heartbeat * to stop while a dlm domain is still active. 
*/ -unsigned int o2hb_dependent_users; +static unsigned int o2hb_dependent_users; /* * In global heartbeat mode, all regions are pinned if there are one or more @@ -2486,7 +2486,7 @@ unlock: return ret; } -void o2hb_region_dec_user(const char *region_uuid) +static void o2hb_region_dec_user(const char *region_uuid) { spin_lock(&o2hb_live_lock); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index da64c3a20eeb..0e4166cc23a0 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -35,9 +35,9 @@ * cluster references throughout where nodes are looked up */ struct o2nm_cluster *o2nm_single_cluster = NULL; -char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { - "reset", /* O2NM_FENCE_RESET */ - "panic", /* O2NM_FENCE_PANIC */ +static const char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { + "reset", /* O2NM_FENCE_RESET */ + "panic", /* O2NM_FENCE_PANIC */ }; static inline void o2nm_lock_subsystem(void); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 0ff424c6d17c..8e712b614e6e 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -96,7 +96,7 @@ struct ocfs2_unblock_ctl { }; /* Lockdep class keys */ -struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES]; +static struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES]; static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, int new_level); From 6cd00a01f0c1ae6a852b09c59b8dd55cc6c35d1d Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 17 Aug 2018 15:44:34 -0700 Subject: [PATCH 020/111] fs/dcache.c: fix kmemcheck splat at take_dentry_name_snapshot() Since only dentry->d_name.len + 1 bytes out of DNAME_INLINE_LEN bytes are initialized at __d_alloc(), we can't copy the whole size unconditionally. 
WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff8fa27465ac50) 636f6e66696766732e746d70000000000010000000000000020000000188ffff i i i i i i i i i i i i i u u u u u u u u u u i i i i i u u u u ^ RIP: 0010:take_dentry_name_snapshot+0x28/0x50 RSP: 0018:ffffa83000f5bdf8 EFLAGS: 00010246 RAX: 0000000000000020 RBX: ffff8fa274b20550 RCX: 0000000000000002 RDX: ffffa83000f5be40 RSI: ffff8fa27465ac50 RDI: ffffa83000f5be60 RBP: ffffa83000f5bdf8 R08: ffffa83000f5be48 R09: 0000000000000001 R10: ffff8fa27465ac00 R11: ffff8fa27465acc0 R12: ffff8fa27465ac00 R13: ffff8fa27465acc0 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f79737ac8c0(0000) GS:ffffffff8fc30000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff8fa274c0b000 CR3: 0000000134aa7002 CR4: 00000000000606f0 take_dentry_name_snapshot+0x28/0x50 vfs_rename+0x128/0x870 SyS_rename+0x3b2/0x3d0 entry_SYSCALL_64_fastpath+0x1a/0xa4 0xffffffffffffffff Link: http://lkml.kernel.org/r/201709131912.GBG39012.QMJLOVFSFFOOtH@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Cc: Vegard Nossum Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/dcache.c b/fs/dcache.c index 8d2ec4898c2b..2e7e8d85e9b4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -292,7 +292,8 @@ void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry spin_unlock(&dentry->d_lock); name->name = p->name; } else { - memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN); + memcpy(name->inline_name, dentry->d_iname, + dentry->d_name.len + 1); spin_unlock(&dentry->d_lock); name->name = name->inline_name; } From 4cdfffc8722e99be8d400d8fa1fcd615d078ad43 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 17 Aug 2018 15:44:37 -0700 Subject: [PATCH 021/111] vfs: discard ATTR_ATTR_FLAG This flag was introduced in 2.1.37pre1 and the only place it was tested was removed in 2.1.43pre1.
The flag was never set. Let's discard it properly. Link: http://lkml.kernel.org/r/877en0hewz.fsf@notabene.neil.brown.name Signed-off-by: NeilBrown Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hostfs/hostfs.h | 2 +- include/linux/fs.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index cb8374af08a6..33b8423ef0c9 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -19,7 +19,7 @@ #define HOSTFS_ATTR_ATIME_SET 128 #define HOSTFS_ATTR_MTIME_SET 256 -/* These two are unused by hostfs. */ +/* This one is unused by hostfs. */ #define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */ #define HOSTFS_ATTR_ATTR_FLAG 1024 diff --git a/include/linux/fs.h b/include/linux/fs.h index 1ec33fd0423f..9d319f1f66f6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -179,7 +179,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define ATTR_ATIME_SET (1 << 7) #define ATTR_MTIME_SET (1 << 8) #define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ -#define ATTR_ATTR_FLAG (1 << 10) #define ATTR_KILL_SUID (1 << 11) #define ATTR_KILL_SGID (1 << 12) #define ATTR_FILE (1 << 13) From 1f4aace60b0edc2d885aaa263abf4df42c8c65a8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 17 Aug 2018 15:44:41 -0700 Subject: [PATCH 022/111] fs/seq_file.c: simplify seq_file iteration code and interface The documentation for seq_file suggests that it is necessary to be able to move the iterator to a given offset, however that is not the case. If the iterator is stored in the private data and is stable from one read() syscall to the next, it is only necessary to support first/next interactions. Implementing this in a client is a little clumsy. - if ->start() is given a pos of zero, it should go to start of sequence. 
- if ->start() is given the same pos that was given to the most recent next() or start(), it should restore the iterator to state just before that last call - if ->start is given another number, it should set the iterator one beyond the start just before the last ->start or ->next call. Also, the documentation says that the implementation can interpret the pos however it likes (other than zero meaning start), but seq_file increments the pos sometimes which does impose on the implementation. This patch simplifies the interface for first/next iteration and simplifies the code, while maintaining complete backward compatibility. Now: - if ->start() is given a pos of zero, it should return an iterator placed at the start of the sequence - if ->start() is given a non-zero pos, it should return the iterator in the same state it was after the last ->start or ->next. This is particularly useful for iterators which walk the multiple chains in a hash table, e.g. using rhashtable_walk*. See fs/gfs2/glock.c and drivers/staging/lustre/lustre/llite/vvp_dev.c A large part of achieving this is to *always* call ->next after ->show has successfully stored all of an entry in the buffer. Never just increment the index instead. Also: - always pass &m->index to ->start() and ->next(), never a temp variable - don't clear ->from when ->count is zero, as ->from is dead when ->count is zero. Some ->next functions do not increment *pos when they return NULL. To maintain compatibility with this, we still need to increment m->index in one place, if ->next didn't increment it. Note that such ->next functions are buggy and should be fixed. A simple demonstration is dd if=/proc/swaps bs=1000 skip=1 Choose any block size larger than the size of /proc/swaps. This will always show the whole last line of /proc/swaps. This patch doesn't work around buggy next() functions for this case.
[neilb@suse.com: ensure ->from is valid] Link: http://lkml.kernel.org/r/87601ryb8a.fsf@notabene.neil.brown.name Signed-off-by: NeilBrown Acked-by: Jonathan Corbet [docs] Tested-by: Jann Horn Cc: Alexander Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/seq_file.txt | 63 +++++++++++++++++--------- fs/seq_file.c | 54 +++++++++------------- 2 files changed, 63 insertions(+), 54 deletions(-) diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt index 9de4303201e1..d412b236a9d6 100644 --- a/Documentation/filesystems/seq_file.txt +++ b/Documentation/filesystems/seq_file.txt @@ -66,23 +66,39 @@ kernel 3.10. Current versions require the following update The iterator interface -Modules implementing a virtual file with seq_file must implement a simple -iterator object that allows stepping through the data of interest. -Iterators must be able to move to a specific position - like the file they -implement - but the interpretation of that position is up to the iterator -itself. A seq_file implementation that is formatting firewall rules, for -example, could interpret position N as the Nth rule in the chain. -Positioning can thus be done in whatever way makes the most sense for the -generator of the data, which need not be aware of how a position translates -to an offset in the virtual file. The one obvious exception is that a -position of zero should indicate the beginning of the file. +Modules implementing a virtual file with seq_file must implement an +iterator object that allows stepping through the data of interest +during a "session" (roughly one read() system call). If the iterator +is able to move to a specific position - like the file they implement, +though with freedom to map the position number to a sequence location +in whatever way is convenient - the iterator need only exist +transiently during a session. 
If the iterator cannot easily find a +numerical position but works well with a first/next interface, the +iterator can be stored in the private data area and continue from one +session to the next. + +A seq_file implementation that is formatting firewall rules from a +table, for example, could provide a simple iterator that interprets +position N as the Nth rule in the chain. A seq_file implementation +that presents the content of a, potentially volatile, linked list +might record a pointer into that list, providing that can be done +without risk of the current location being removed. + +Positioning can thus be done in whatever way makes the most sense for +the generator of the data, which need not be aware of how a position +translates to an offset in the virtual file. The one obvious exception +is that a position of zero should indicate the beginning of the file. The /proc/sequence iterator just uses the count of the next number it will output as its position. -Four functions must be implemented to make the iterator work. The first, -called start() takes a position as an argument and returns an iterator -which will start reading at that position. For our simple sequence example, +Four functions must be implemented to make the iterator work. The +first, called start(), starts a session and takes a position as an +argument, returning an iterator which will start reading at that +position. The pos passed to start() will always be either zero, or +the most recent pos used in the previous session. + +For our simple sequence example, the start() function looks like: static void *ct_seq_start(struct seq_file *s, loff_t *pos) @@ -101,11 +117,12 @@ implementations; in most cases the start() function should check for a "past end of file" condition and return NULL if need be. For more complicated applications, the private field of the seq_file -structure can be used. 
There is also a special value which can be returned -by the start() function called SEQ_START_TOKEN; it can be used if you wish -to instruct your show() function (described below) to print a header at the -top of the output. SEQ_START_TOKEN should only be used if the offset is -zero, however. +structure can be used to hold state from session to session. There is +also a special value which can be returned by the start() function +called SEQ_START_TOKEN; it can be used if you wish to instruct your +show() function (described below) to print a header at the top of the +output. SEQ_START_TOKEN should only be used if the offset is zero, +however. The next function to implement is called, amazingly, next(); its job is to move the iterator forward to the next position in the sequence. The @@ -121,9 +138,13 @@ complete. Here's the example version: return spos; } -The stop() function is called when iteration is complete; its job, of -course, is to clean up. If dynamic memory is allocated for the iterator, -stop() is the place to free it. +The stop() function closes a session; its job, of course, is to clean +up. If dynamic memory is allocated for the iterator, stop() is the +place to free it; if a lock was taken by start(), stop() must release +that lock. The value that *pos was set to by the last next() call +before stop() is remembered, and used for the first start() call of +the next session unless lseek() has been called on the file; in that +case next start() will be asked to start at position zero. 
static void ct_seq_stop(struct seq_file *s, void *v) { diff --git a/fs/seq_file.c b/fs/seq_file.c index 4cc090b50cc5..1dea7a8a5255 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -90,23 +90,22 @@ EXPORT_SYMBOL(seq_open); static int traverse(struct seq_file *m, loff_t offset) { - loff_t pos = 0, index; + loff_t pos = 0; int error = 0; void *p; m->version = 0; - index = 0; + m->index = 0; m->count = m->from = 0; - if (!offset) { - m->index = index; + if (!offset) return 0; - } + if (!m->buf) { m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) return -ENOMEM; } - p = m->op->start(m, &index); + p = m->op->start(m, &m->index); while (p) { error = PTR_ERR(p); if (IS_ERR(p)) @@ -123,20 +122,15 @@ static int traverse(struct seq_file *m, loff_t offset) if (pos + m->count > offset) { m->from = offset - pos; m->count -= m->from; - m->index = index; break; } pos += m->count; m->count = 0; - if (pos == offset) { - index++; - m->index = index; + p = m->op->next(m, p, &m->index); + if (pos == offset) break; - } - p = m->op->next(m, p, &index); } m->op->stop(m, p); - m->index = index; return error; Eoverflow: @@ -160,7 +154,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct seq_file *m = file->private_data; size_t copied = 0; - loff_t pos; size_t n; void *p; int err = 0; @@ -223,16 +216,12 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) size -= n; buf += n; copied += n; - if (!m->count) { - m->from = 0; - m->index++; - } if (!size) goto Done; } /* we need at least one record in buffer */ - pos = m->index; - p = m->op->start(m, &pos); + m->from = 0; + p = m->op->start(m, &m->index); while (1) { err = PTR_ERR(p); if (!p || IS_ERR(p)) @@ -243,8 +232,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (unlikely(err)) m->count = 0; if (unlikely(!m->count)) { - p = m->op->next(m, p, &pos); - m->index = pos; + p = m->op->next(m, p, &m->index); continue; } if 
(m->count < m->size) @@ -256,29 +244,33 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (!m->buf) goto Enomem; m->version = 0; - pos = m->index; - p = m->op->start(m, &pos); + p = m->op->start(m, &m->index); } m->op->stop(m, p); m->count = 0; goto Done; Fill: /* they want more? let's try to get some more */ - while (m->count < size) { + while (1) { size_t offs = m->count; - loff_t next = pos; - p = m->op->next(m, p, &next); + loff_t pos = m->index; + + p = m->op->next(m, p, &m->index); + if (pos == m->index) + /* Buggy ->next function */ + m->index++; if (!p || IS_ERR(p)) { err = PTR_ERR(p); break; } + if (m->count >= size) + break; err = m->op->show(m, p); if (seq_has_overflowed(m) || err) { m->count = offs; if (likely(err <= 0)) break; } - pos = next; } m->op->stop(m, p); n = min(m->count, size); @@ -287,11 +279,7 @@ Fill: goto Efault; copied += n; m->count -= n; - if (m->count) - m->from = n; - else - pos++; - m->index = pos; + m->from = n; Done: if (!copied) copied = err; From 0882ff9190e3bc51e2d78c3aadd7c690eeaa91d5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 17 Aug 2018 15:44:44 -0700 Subject: [PATCH 023/111] mm, slub: restore the original intention of prefetch_freepointer() In SLUB, prefetch_freepointer() is used when allocating an object from cache's freelist, to make sure the next object in the list is cache-hot, since it's probable it will be allocated soon. Commit 2482ddec670f ("mm: add SLUB free list pointer obfuscation") has unintentionally changed the prefetch in a way where the prefetch is turned to a real fetch, and only the next->next pointer is prefetched. In case there is not a stream of allocations that would benefit from prefetching, the extra real fetch might add a useless cache miss to the allocation. Restore the previous behavior. 
Link: http://lkml.kernel.org/r/20180809085245.22448-1-vbabka@suse.cz Fixes: 2482ddec670f ("mm: add SLUB free list pointer obfuscation") Signed-off-by: Vlastimil Babka Acked-by: Kees Cook Cc: Daniel Micay Cc: Eric Dumazet Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 51258eff4178..ce2b9e5cea77 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -271,8 +271,7 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) static void prefetch_freepointer(const struct kmem_cache *s, void *object) { - if (object) - prefetch(freelist_dereference(s, object + s->offset)); + prefetch(object + s->offset); } static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) From 50a7ca3c6fc86955f99fc432fc8a186b968b365b Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Fri, 17 Aug 2018 15:44:47 -0700 Subject: [PATCH 024/111] mm: convert return type of handle_mm_fault() caller to vm_fault_t Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. Ref-> commit 1c8f422059ae ("mm: change return type to vm_fault_t") In this patch all the callers of handle_mm_fault() are changed to return vm_fault_t type. Link: http://lkml.kernel.org/r/20180617084810.GA6730@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Cc: Matthew Wilcox Cc: Richard Henderson Cc: Tony Luck Cc: Matt Turner Cc: Vineet Gupta Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Richard Kuo Cc: Geert Uytterhoeven Cc: Michal Simek Cc: James Hogan Cc: Ley Foon Tan Cc: Jonas Bonn Cc: James E.J. Bottomley Cc: Benjamin Herrenschmidt Cc: Palmer Dabbelt Cc: Yoshinori Sato Cc: David S.
Miller Cc: Richard Weinberger Cc: Guan Xuetao Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: "Levin, Alexander (Sasha Levin)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/fault.c | 3 ++- arch/arc/mm/fault.c | 4 +++- arch/arm/mm/fault.c | 7 ++++--- arch/arm64/mm/fault.c | 6 +++--- arch/hexagon/mm/vm_fault.c | 2 +- arch/ia64/mm/fault.c | 2 +- arch/m68k/mm/fault.c | 4 ++-- arch/microblaze/mm/fault.c | 2 +- arch/mips/mm/fault.c | 2 +- arch/nds32/mm/fault.c | 2 +- arch/nios2/mm/fault.c | 2 +- arch/openrisc/mm/fault.c | 2 +- arch/parisc/mm/fault.c | 2 +- arch/powerpc/include/asm/copro.h | 4 +++- arch/powerpc/mm/copro_fault.c | 2 +- arch/powerpc/mm/fault.c | 7 ++++--- arch/powerpc/platforms/cell/spufs/fault.c | 2 +- arch/riscv/mm/fault.c | 3 ++- arch/s390/mm/fault.c | 13 ++++++++----- arch/sh/mm/fault.c | 4 ++-- arch/sparc/mm/fault_32.c | 3 ++- arch/sparc/mm/fault_64.c | 3 ++- arch/um/kernel/trap.c | 2 +- arch/unicore32/mm/fault.c | 9 +++++---- arch/x86/mm/fault.c | 5 +++-- arch/xtensa/mm/fault.c | 2 +- drivers/iommu/amd_iommu_v2.c | 2 +- drivers/iommu/intel-svm.c | 4 +++- drivers/misc/cxl/fault.c | 2 +- drivers/misc/ocxl/link.c | 3 ++- mm/hmm.c | 8 ++++---- mm/ksm.c | 2 +- 32 files changed, 69 insertions(+), 51 deletions(-) diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index de2bd217adad..d73dc473fbb9 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -87,7 +87,8 @@ do_page_fault(unsigned long address, unsigned long mmcsr, struct vm_area_struct * vma; struct mm_struct *mm = current->mm; const struct exception_table_entry *fixup; - int fault, si_code = SEGV_MAPERR; + int si_code = SEGV_MAPERR; + vm_fault_t fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* As of EV6, a load into $31/$f31 is a prefetch, and never faults diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index b884bbd6f354..db6913094be3 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -15,6 +15,7 @@ #include 
#include #include +#include #include #include @@ -66,7 +67,8 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; siginfo_t info; - int fault, ret; + int ret; + vm_fault_t fault; int write = regs->ecr_cause & ECR_C_PROTV_STORE; /* ST/EX */ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 84becc911ee3..3232afb6fdc0 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -224,12 +224,12 @@ static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma) return vma->vm_flags & mask ? false : true; } -static int __kprobes +static vm_fault_t __kprobes __do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, unsigned int flags, struct task_struct *tsk) { struct vm_area_struct *vma; - int fault; + vm_fault_t fault; vma = find_vma(mm, addr); fault = VM_FAULT_BADMAP; @@ -264,7 +264,8 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk; struct mm_struct *mm; - int fault, sig, code; + int sig, code; + vm_fault_t fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; if (notify_page_fault(regs, fsr)) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 9943690a3924..50b30ff30de4 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -379,12 +379,12 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re #define VM_FAULT_BADMAP 0x010000 #define VM_FAULT_BADACCESS 0x020000 -static int __do_page_fault(struct mm_struct *mm, unsigned long addr, +static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int mm_flags, unsigned long vm_flags, struct task_struct *tsk) { struct vm_area_struct *vma; - int fault; + vm_fault_t fault; vma = find_vma(mm, addr); fault = VM_FAULT_BADMAP; @@ -427,7 +427,7 @@ static int __kprobes do_page_fault(unsigned 
long addr, unsigned int esr, struct task_struct *tsk; struct mm_struct *mm; struct siginfo si; - int fault, major = 0; + vm_fault_t fault, major = 0; unsigned long vm_flags = VM_READ | VM_WRITE; unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 933bbcef5363..eb263e61daf4 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -52,7 +52,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) struct mm_struct *mm = current->mm; int si_signo; int si_code = SEGV_MAPERR; - int fault; + vm_fault_t fault; const struct exception_table_entry *fixup; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 817fa120645f..a9d55ad8d67b 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -86,7 +86,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re struct vm_area_struct *vma, *prev_vma; struct mm_struct *mm = current->mm; unsigned long mask; - int fault; + vm_fault_t fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT) diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index f2ff3779875a..9b6163c05a75 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -70,7 +70,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, { struct mm_struct *mm = current->mm; struct vm_area_struct * vma; - int fault; + vm_fault_t fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; pr_debug("do page fault:\nregs->sr=%#x, regs->pc=%#lx, address=%#lx, %ld, %p\n", @@ -136,7 +136,7 @@ good_area: */ fault = handle_mm_fault(vma, address, flags); - pr_debug("handle_mm_fault returns %d\n", fault); + pr_debug("handle_mm_fault returns %x\n", fault); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return 0; diff 
--git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index af607447c683..202ad6a494f5 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -90,7 +90,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, struct mm_struct *mm = current->mm; int code = SEGV_MAPERR; int is_write = error_code & ESR_S; - int fault; + vm_fault_t fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; regs->ear = address; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 5f71f2b903b7..73d8a0f0b810 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -43,7 +43,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, struct mm_struct *mm = tsk->mm; const int field = sizeof(unsigned long) * 2; int si_code; - int fault; + vm_fault_t fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); diff --git a/arch/nds32/mm/fault.c b/arch/nds32/mm/fault.c index 9bdb7c3ecbb6..b740534b152c 100644 --- a/arch/nds32/mm/fault.c +++ b/arch/nds32/mm/fault.c @@ -73,7 +73,7 @@ void do_page_fault(unsigned long entry, unsigned long addr, struct mm_struct *mm; struct vm_area_struct *vma; int si_code; - int fault; + vm_fault_t fault; unsigned int mask = VM_READ | VM_WRITE | VM_EXEC; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index b804dd06ea1c..24fd84cf6006 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -47,7 +47,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause, struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; int code = SEGV_MAPERR; - int fault; + vm_fault_t fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; cause >>= 2; diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 9f011d16cc46..dc4dbafc1d83 100644 --- a/arch/openrisc/mm/fault.c 
+++ b/arch/openrisc/mm/fault.c @@ -53,7 +53,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address, struct mm_struct *mm; struct vm_area_struct *vma; int si_code; - int fault; + vm_fault_t fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index a80117980fc2..c8e8b7c05558 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -262,7 +262,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, struct task_struct *tsk; struct mm_struct *mm; unsigned long acc_type; - int fault = 0; + vm_fault_t fault = 0; unsigned int flags; if (faulthandler_disabled()) diff --git a/arch/powerpc/include/asm/copro.h b/arch/powerpc/include/asm/copro.h index ce216df31381..48616fe7ea75 100644 --- a/arch/powerpc/include/asm/copro.h +++ b/arch/powerpc/include/asm/copro.h @@ -10,13 +10,15 @@ #ifndef _ASM_POWERPC_COPRO_H #define _ASM_POWERPC_COPRO_H +#include + struct copro_slb { u64 esid, vsid; }; int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, - unsigned long dsisr, unsigned *flt); + unsigned long dsisr, vm_fault_t *flt); int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb); diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 7d0945bd3a61..c8da352e8686 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -34,7 +34,7 @@ * to handle fortunately. 
*/ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, - unsigned long dsisr, unsigned *flt) + unsigned long dsisr, vm_fault_t *flt) { struct vm_area_struct *vma; unsigned long is_write; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index b1ca7a0974e3..7c061313cc6f 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -156,7 +156,7 @@ static noinline int bad_access(struct pt_regs *regs, unsigned long address) } static int do_sigbus(struct pt_regs *regs, unsigned long address, - unsigned int fault) + vm_fault_t fault) { siginfo_t info; unsigned int lsb = 0; @@ -187,7 +187,8 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address, return 0; } -static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) +static int mm_fault_error(struct pt_regs *regs, unsigned long addr, + vm_fault_t fault) { /* * Kernel page fault interrupted by SIGKILL. We have no reason to @@ -415,7 +416,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, int is_exec = TRAP(regs) == 0x400; int is_user = user_mode(regs); int is_write = page_fault_is_write(error_code); - int fault, major = 0; + vm_fault_t fault, major = 0; bool must_retry = false; if (notify_page_fault(regs)) diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c index 1e002e94d0f6..83cf58daaa79 100644 --- a/arch/powerpc/platforms/cell/spufs/fault.c +++ b/arch/powerpc/platforms/cell/spufs/fault.c @@ -111,7 +111,7 @@ int spufs_handle_class1(struct spu_context *ctx) { u64 ea, dsisr, access; unsigned long flags; - unsigned flt = 0; + vm_fault_t flt = 0; int ret; /* diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 148c98ca9b45..88401d5125bc 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -41,7 +41,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs) struct mm_struct *mm; unsigned long addr, cause; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | 
FAULT_FLAG_KILLABLE; - int fault, code = SEGV_MAPERR; + int code = SEGV_MAPERR; + vm_fault_t fault; cause = regs->scause; addr = regs->sbadaddr; diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 4cc3f06b0ab3..72af23bacbb5 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -341,7 +341,8 @@ static noinline int signal_return(struct pt_regs *regs) return -EACCES; } -static noinline void do_fault_error(struct pt_regs *regs, int access, int fault) +static noinline void do_fault_error(struct pt_regs *regs, int access, + vm_fault_t fault) { int si_code; @@ -401,7 +402,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, int fault) * 11 Page translation -> Not present (nullification) * 3b Region third trans. -> Not present (nullification) */ -static inline int do_exception(struct pt_regs *regs, int access) +static inline vm_fault_t do_exception(struct pt_regs *regs, int access) { struct gmap *gmap; struct task_struct *tsk; @@ -411,7 +412,7 @@ static inline int do_exception(struct pt_regs *regs, int access) unsigned long trans_exc_code; unsigned long address; unsigned int flags; - int fault; + vm_fault_t fault; tsk = current; /* @@ -564,7 +565,8 @@ out: void do_protection_exception(struct pt_regs *regs) { unsigned long trans_exc_code; - int access, fault; + int access; + vm_fault_t fault; trans_exc_code = regs->int_parm_long; /* @@ -599,7 +601,8 @@ NOKPROBE_SYMBOL(do_protection_exception); void do_dat_exception(struct pt_regs *regs) { - int access, fault; + int access; + vm_fault_t fault; access = VM_READ | VM_EXEC | VM_WRITE; fault = do_exception(regs, access); diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index b8e7bb84b6b1..6defd2c6d9b1 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -313,7 +313,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) static noinline int mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, unsigned int fault) 
+ unsigned long address, vm_fault_t fault) { /* * Pagefault was interrupted by SIGKILL. We have no reason to @@ -396,7 +396,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct * vma; - int fault; + vm_fault_t fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 9f75b6444bf1..b0440b0edd97 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -166,7 +166,8 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, unsigned int fixup; unsigned long g2; int from_user = !(regs->psr & PSR_PS); - int fault, code; + int code; + vm_fault_t fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; if (text_fault) diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 63166fcf9e25..8f8a604c1300 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -278,7 +278,8 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned int insn = 0; - int si_code, fault_code, fault; + int si_code, fault_code; + vm_fault_t fault; unsigned long address, mm_rss; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index ec9a42c14c56..cced82946042 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -72,7 +72,7 @@ good_area: } do { - int fault; + vm_fault_t fault; fault = handle_mm_fault(vma, address, flags); diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c index 381473412937..8f12a5b50a42 100644 --- a/arch/unicore32/mm/fault.c +++ b/arch/unicore32/mm/fault.c @@ -168,11 +168,11 @@ static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma) return vma->vm_flags & mask ? 
false : true; } -static int __do_pf(struct mm_struct *mm, unsigned long addr, unsigned int fsr, - unsigned int flags, struct task_struct *tsk) +static vm_fault_t __do_pf(struct mm_struct *mm, unsigned long addr, + unsigned int fsr, unsigned int flags, struct task_struct *tsk) { struct vm_area_struct *vma; - int fault; + vm_fault_t fault; vma = find_vma(mm, addr); fault = VM_FAULT_BADMAP; @@ -209,7 +209,8 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk; struct mm_struct *mm; - int fault, sig, code; + int sig, code; + vm_fault_t fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index db1c042e9853..b9123c497e0a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -16,6 +16,7 @@ #include /* prefetchw */ #include /* exception_enter(), ... */ #include /* faulthandler_disabled() */ +#include #include /* boot_cpu_has, ... */ #include /* dotraplinkage, ... 
*/ @@ -999,7 +1000,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, u32 *pkey, unsigned int fault) + unsigned long address, u32 *pkey, vm_fault_t fault) { if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { no_context(regs, error_code, address, 0, 0); @@ -1213,7 +1214,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; - int fault, major = 0; + vm_fault_t fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; u32 pkey; diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index c111a833205a..2ab0e0dcd166 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -42,7 +42,7 @@ void do_page_fault(struct pt_regs *regs) int code; int is_write, is_exec; - int fault; + vm_fault_t fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; code = SEGV_MAPERR; diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 1d0b53a04a08..58da65df03f5 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -508,7 +508,7 @@ static void do_fault(struct work_struct *work) { struct fault *fault = container_of(work, struct fault, work); struct vm_area_struct *vma; - int ret = VM_FAULT_ERROR; + vm_fault_t ret = VM_FAULT_ERROR; unsigned int flags = 0; struct mm_struct *mm; u64 address; diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 45f6e581cd56..7d65aab36a96 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #define PASID_ENTRY_P BIT_ULL(0) @@ -594,7 +595,8 @@ static irqreturn_t prq_event_thread(int irq, void *d) struct vm_area_struct *vma; struct page_req_dsc *req; struct qi_desc resp; - int ret, result; + int result; + 
vm_fault_t ret; u64 address; handled = 1; diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index 70dbb6de102c..93ecc67a0f3b 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -134,7 +134,7 @@ static int cxl_handle_segment_miss(struct cxl_context *ctx, int cxl_handle_mm_fault(struct mm_struct *mm, u64 dsisr, u64 dar) { - unsigned flt = 0; + vm_fault_t flt = 0; int result; unsigned long access, flags, inv_flags = 0; diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c index 88876ae8f330..ffc731b0731a 100644 --- a/drivers/misc/ocxl/link.c +++ b/drivers/misc/ocxl/link.c @@ -2,6 +2,7 @@ // Copyright 2017 IBM Corp. #include #include +#include #include #include #include @@ -126,7 +127,7 @@ static void ack_irq(struct spa *spa, enum xsl_response r) static void xsl_fault_handler_bh(struct work_struct *fault_work) { - unsigned int flt = 0; + vm_fault_t flt = 0; unsigned long access, flags, inv_flags = 0; enum xsl_response r; struct xsl_fault *fault = container_of(fault_work, struct xsl_fault, diff --git a/mm/hmm.c b/mm/hmm.c index f40e8add84b5..caf9df27599e 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -299,14 +299,14 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - int r; + vm_fault_t ret; flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; flags |= write_fault ? 
FAULT_FLAG_WRITE : 0; - r = handle_mm_fault(vma, addr, flags); - if (r & VM_FAULT_RETRY) + ret = handle_mm_fault(vma, addr, flags); + if (ret & VM_FAULT_RETRY) return -EBUSY; - if (r & VM_FAULT_ERROR) { + if (ret & VM_FAULT_ERROR) { *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } diff --git a/mm/ksm.c b/mm/ksm.c index 9b855a8b0f2d..2621be57bd95 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -470,7 +470,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm) static int break_ksm(struct vm_area_struct *vma, unsigned long addr) { struct page *page; - int ret = 0; + vm_fault_t ret = 0; do { cond_resched(); From 720e14ebec642bc56c44e5e60a2d595900e5bbf0 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 17 Aug 2018 15:44:52 -0700 Subject: [PATCH 025/111] mm: skip invalid pages block at a time in zero_resv_unavail() The role of zero_resv_unavail() is to make sure that every struct page that is allocated but is not backed by memory that is accessible by kernel is zeroed and not in some uninitialized state. Since struct pages are allocated in blocks (2M pages in x86 case), we can skip pageblock_nr_pages at a time, when the first one is found to be invalid. This optimization may help since now on x86 every hole in e820 maps is marked as reserved in memblock, and thus will go through this function. This function is called before sched_clock() is initialized, so I used my x86 early boot clock patches to measure the performance improvement. With 1T hole on i7-8700 currently we would take 0.606918s of boot time, but with this optimization 0.001103s.
Link: http://lkml.kernel.org/r/20180615155733.1175-1-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Oscar Salvador Reviewed-by: Naoya Horiguchi Cc: Pasha Tatashin Cc: Steven Sistare Cc: Daniel Jordan Cc: Michal Hocko Cc: Matthew Wilcox Cc: Ingo Molnar Cc: Dan Williams Cc: "Huang, Ying" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0922ef5d2e46..33f6745bb649 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6405,8 +6405,11 @@ void __paginginit zero_resv_unavail(void) pgcnt = 0; for_each_resv_unavail_range(i, &start, &end) { for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { + pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) + + pageblock_nr_pages - 1; continue; + } mm_zero_struct_page(pfn_to_page(pfn)); pgcnt++; } From fadae2953072e9005c5f1d64e1049edb043494dc Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 17 Aug 2018 15:44:55 -0700 Subject: [PATCH 026/111] thp: use mm_counter_file to determine which rss counter to update Since commit eca56ff906bd ("mm, shmem: add internal shmem resident memory accounting"), MM_SHMEMPAGES is added to separate the shmem accounting from regular files. So, all shmem pages should be accounted to MM_SHMEMPAGES instead of MM_FILEPAGES. And, normal 4K shmem pages have been accounted to MM_SHMEMPAGES, so shmem thp pages should not be treated differently. Account them to MM_SHMEMPAGES via mm_counter_file() since shmem pages are swap backed to keep consistent with normal 4K shmem pages. This will not change the rss counter of processes since shmem pages are still a part of it. The /proc/pid/status and /proc/pid/statm counters will however be more accurate wrt shmem usage, as originally intended.
And as eca56ff906bd ("mm, shmem: add internal shmem resident memory accounting") mentioned, oom also could report more accurate "shmem-rss". Link: http://lkml.kernel.org/r/1529442518-17398-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- mm/memory.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1ce44e87f494..064a9d78879d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1740,7 +1740,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, } else { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); - add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); + add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR); } spin_unlock(ptl); @@ -2090,7 +2090,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, SetPageReferenced(page); page_remove_rmap(page, true); put_page(page); - add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); + add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); return; } else if (is_huge_zero_pmd(*pmd)) { /* diff --git a/mm/memory.c b/mm/memory.c index 7c3bd119fcca..d7b5b22a1a0a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3400,7 +3400,7 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page) if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR); + add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); page_add_file_rmap(page, true); /* * deposit and withdraw with pmd lock held From 7f1d23e607185e07e5df2a806a989c24627aef41 Mon Sep 17 00:00:00 2001 From: Christian Hansen Date: Fri, 17 Aug 2018 15:44:59 -0700 Subject: [PATCH 027/111] tools/vm/page-types.c: include shared map counts Add a new flag that will read kpagecount for each PFN and print out the 
number of times the page is mapped along with the flags in the listing view. This information is useful in understanding and optimizing memory usage. Identifying pages which are not shared allows us to focus on adjusting the memory layout or access patterns for the sole owning process. Knowing the number of processes that share a page tells us how many other times we must make the same adjustments or how many processes to potentially disable. Truncated sample output: voffset map-cnt offset len flags 561a3591e 1 15fe8 1 ___U_lA____Ma_b___________________________ 561a3591f 1 2b103 1 ___U_lA____Ma_b___________________________ 561a36ca4 1 2cc78 1 ___U_lA____Ma_b___________________________ 7f588bb4e 14 2273c 1 __RU_lA____M______________________________ [akpm@linux-foundation.org: coding-style fixes] [chansen3@cisco.com: add documentation, tweak whitespace] Link: http://lkml.kernel.org/r/20180705181204.5529-1-chansen3@cisco.com Link: http://lkml.kernel.org/r/20180612153205.12879-1-chansen3@cisco.com Signed-off-by: Christian Hansen Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/pagemap.rst | 3 + tools/vm/page-types.c | 73 +++++++++++++++++++----- 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index 577af85beb41..3f7bade2c231 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -44,6 +44,9 @@ There are four components to pagemap: * ``/proc/kpagecount``. This file contains a 64-bit count of the number of times each page is mapped, indexed by PFN. +The page-types tool in the tools/vm directory can be used to query the +number of times a page is mapped. + * ``/proc/kpageflags``. This file contains a 64-bit set of flags for each page, indexed by PFN. 
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index cce853dca691..14ebd1695223 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -75,6 +75,7 @@ #define KPF_BYTES 8 #define PROC_KPAGEFLAGS "/proc/kpageflags" +#define PROC_KPAGECOUNT "/proc/kpagecount" #define PROC_KPAGECGROUP "/proc/kpagecgroup" /* [32-] kernel hacking assistances */ @@ -173,6 +174,7 @@ static pid_t opt_pid; /* process to walk */ const char *opt_file; /* file or directory path */ static uint64_t opt_cgroup; /* cgroup inode */ static int opt_list_cgroup;/* list page cgroup */ +static int opt_list_mapcnt;/* list page map count */ static const char *opt_kpageflags;/* kpageflags file to parse */ #define MAX_ADDR_RANGES 1024 @@ -194,6 +196,7 @@ static int page_size; static int pagemap_fd; static int kpageflags_fd; +static int kpagecount_fd = -1; static int kpagecgroup_fd = -1; static int opt_hwpoison; @@ -298,6 +301,15 @@ static unsigned long kpagecgroup_read(uint64_t *buf, return do_u64_read(kpagecgroup_fd, opt_kpageflags, buf, index, pages); } +static unsigned long kpagecount_read(uint64_t *buf, + unsigned long index, + unsigned long pages) +{ + return kpagecount_fd < 0 ? 
pages : + do_u64_read(kpagecount_fd, PROC_KPAGECOUNT, + buf, index, pages); +} + static unsigned long pagemap_read(uint64_t *buf, unsigned long index, unsigned long pages) @@ -370,16 +382,18 @@ static char *page_flag_longname(uint64_t flags) */ static void show_page_range(unsigned long voffset, unsigned long offset, - unsigned long size, uint64_t flags, uint64_t cgroup) + unsigned long size, uint64_t flags, + uint64_t cgroup, uint64_t mapcnt) { static uint64_t flags0; static uint64_t cgroup0; + static uint64_t mapcnt0; static unsigned long voff; static unsigned long index; static unsigned long count; - if (flags == flags0 && cgroup == cgroup0 && offset == index + count && - size && voffset == voff + count) { + if (flags == flags0 && cgroup == cgroup0 && mapcnt == mapcnt0 && + offset == index + count && size && voffset == voff + count) { count += size; return; } @@ -391,12 +405,15 @@ static void show_page_range(unsigned long voffset, unsigned long offset, printf("%lu\t", voff); if (opt_list_cgroup) printf("@%llu\t", (unsigned long long)cgroup0); + if (opt_list_mapcnt) + printf("%lu\t", mapcnt0); printf("%lx\t%lx\t%s\n", index, count, page_flag_name(flags0)); } flags0 = flags; - cgroup0= cgroup; + cgroup0 = cgroup; + mapcnt0 = mapcnt; index = offset; voff = voffset; count = size; @@ -404,11 +421,11 @@ static void show_page_range(unsigned long voffset, unsigned long offset, static void flush_page_range(void) { - show_page_range(0, 0, 0, 0, 0); + show_page_range(0, 0, 0, 0, 0, 0); } static void show_page(unsigned long voffset, unsigned long offset, - uint64_t flags, uint64_t cgroup) + uint64_t flags, uint64_t cgroup, uint64_t mapcnt) { if (opt_pid) printf("%lx\t", voffset); @@ -416,6 +433,9 @@ static void show_page(unsigned long voffset, unsigned long offset, printf("%lu\t", voffset); if (opt_list_cgroup) printf("@%llu\t", (unsigned long long)cgroup); + if (opt_list_mapcnt) + printf("%lu\t", mapcnt); + printf("%lx\t%s\n", offset, page_flag_name(flags)); } @@ -599,7 
+619,8 @@ static size_t hash_slot(uint64_t flags) } static void add_page(unsigned long voffset, unsigned long offset, - uint64_t flags, uint64_t cgroup, uint64_t pme) + uint64_t flags, uint64_t cgroup, uint64_t mapcnt, + uint64_t pme) { flags = kpageflags_flags(flags, pme); @@ -615,9 +636,9 @@ static void add_page(unsigned long voffset, unsigned long offset, unpoison_page(offset); if (opt_list == 1) - show_page_range(voffset, offset, 1, flags, cgroup); + show_page_range(voffset, offset, 1, flags, cgroup, mapcnt); else if (opt_list == 2) - show_page(voffset, offset, flags, cgroup); + show_page(voffset, offset, flags, cgroup, mapcnt); nr_pages[hash_slot(flags)]++; total_pages++; @@ -631,6 +652,7 @@ static void walk_pfn(unsigned long voffset, { uint64_t buf[KPAGEFLAGS_BATCH]; uint64_t cgi[KPAGEFLAGS_BATCH]; + uint64_t cnt[KPAGEFLAGS_BATCH]; unsigned long batch; unsigned long pages; unsigned long i; @@ -654,8 +676,12 @@ static void walk_pfn(unsigned long voffset, if (kpagecgroup_read(cgi, index, pages) != pages) fatal("kpagecgroup returned fewer pages than expected"); + if (kpagecount_read(cnt, index, batch) != pages) + fatal("kpagecount returned fewer pages than expected"); + for (i = 0; i < pages; i++) - add_page(voffset + i, index + i, buf[i], cgi[i], pme); + add_page(voffset + i, index + i, + buf[i], cgi[i], cnt[i], pme); index += pages; count -= pages; @@ -673,9 +699,10 @@ static void walk_swap(unsigned long voffset, uint64_t pme) return; if (opt_list == 1) - show_page_range(voffset, pagemap_swap_offset(pme), 1, flags, 0); + show_page_range(voffset, pagemap_swap_offset(pme), + 1, flags, 0, 0); else if (opt_list == 2) - show_page(voffset, pagemap_swap_offset(pme), flags, 0); + show_page(voffset, pagemap_swap_offset(pme), flags, 0, 0); nr_pages[hash_slot(flags)]++; total_pages++; @@ -789,6 +816,7 @@ static void usage(void) " -l|--list Show page details in ranges\n" " -L|--list-each Show page details one by one\n" " -C|--list-cgroup Show cgroup inode for pages\n" +" 
-M|--list-mapcnt Show page map count\n" " -N|--no-summary Don't show summary info\n" " -X|--hwpoison hwpoison pages\n" " -x|--unpoison unpoison pages\n" @@ -925,6 +953,7 @@ static void walk_file(const char *name, const struct stat *st) uint8_t vec[PAGEMAP_BATCH]; uint64_t buf[PAGEMAP_BATCH], flags; uint64_t cgroup = 0; + uint64_t mapcnt = 0; unsigned long nr_pages, pfn, i; off_t off, end = st->st_size; int fd; @@ -984,13 +1013,15 @@ got_sigbus: continue; if (!kpagecgroup_read(&cgroup, pfn, 1)) fatal("kpagecgroup_read failed"); + if (!kpagecount_read(&mapcnt, pfn, 1)) + fatal("kpagecount_read failed"); if (first && opt_list) { first = 0; flush_page_range(); show_file(name, st); } add_page(off / page_size + i, pfn, - flags, cgroup, buf[i]); + flags, cgroup, mapcnt, buf[i]); } } @@ -1193,6 +1224,7 @@ static const struct option opts[] = { { "list" , 0, NULL, 'l' }, { "list-each" , 0, NULL, 'L' }, { "list-cgroup", 0, NULL, 'C' }, + { "list-mapcnt", 0, NULL, 'M' }, { "no-summary", 0, NULL, 'N' }, { "hwpoison" , 0, NULL, 'X' }, { "unpoison" , 0, NULL, 'x' }, @@ -1208,7 +1240,8 @@ int main(int argc, char *argv[]) page_size = getpagesize(); while ((c = getopt_long(argc, argv, - "rp:f:a:b:d:c:ClLNXxF:h", opts, NULL)) != -1) { + "rp:f:a:b:d:c:ClLMNXxF:h", + opts, NULL)) != -1) { switch (c) { case 'r': opt_raw = 1; @@ -1240,6 +1273,9 @@ int main(int argc, char *argv[]) case 'L': opt_list = 2; break; + case 'M': + opt_list_mapcnt = 1; + break; case 'N': opt_no_summary = 1; break; @@ -1269,12 +1305,18 @@ int main(int argc, char *argv[]) if (opt_cgroup || opt_list_cgroup) kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY); + if (opt_list && opt_list_mapcnt) + kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY); + if (opt_list && opt_pid) printf("voffset\t"); if (opt_list && opt_file) printf("foffset\t"); if (opt_list && opt_list_cgroup) printf("cgroup\t"); + if (opt_list && opt_list_mapcnt) + printf("map-cnt\t"); + if (opt_list == 1) printf("offset\tlen\tflags\n"); if 
(opt_list == 2) @@ -1296,5 +1338,8 @@ int main(int argc, char *argv[]) show_summary(); + if (opt_list_mapcnt) + close(kpagecount_fd); + return 0; } From 59ae96ffc3a6731c3b85f9925e07e893d392e814 Mon Sep 17 00:00:00 2001 From: Christian Hansen Date: Fri, 17 Aug 2018 15:45:02 -0700 Subject: [PATCH 028/111] tools/vm/page-types.c: add support for idle page tracking Add a flag which causes page-types to use the kernel's idle page tracking to mark pages idle. As the tool already prints the idle flag if set, subsequent runs will show which pages have been accessed since last run. [akpm@linux-foundation.org: simplify mark_page_idle()] [chansen3@cisco.com: reorganize mark_page_idle() logic, add docs] Link: http://lkml.kernel.org/r/20180706172237.21691-1-chansen3@cisco.com Link: http://lkml.kernel.org/r/20180612153223.13174-1-chansen3@cisco.com Signed-off-by: Christian Hansen Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/mm/idle_page_tracking.rst | 5 ++ tools/vm/page-types.c | 47 ++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/idle_page_tracking.rst b/Documentation/admin-guide/mm/idle_page_tracking.rst index 6f7b7ca1add3..df9394fb39c2 100644 --- a/Documentation/admin-guide/mm/idle_page_tracking.rst +++ b/Documentation/admin-guide/mm/idle_page_tracking.rst @@ -65,6 +65,11 @@ workload one should: are not reclaimable, he or she can filter them out using ``/proc/kpageflags``. +The page-types tool in the tools/vm directory can be used to assist in this. +If the tool is run initially with the appropriate option, it will mark all the +queried pages as idle. Subsequent runs of the tool can then show which pages have +their idle flag cleared in the interim. + See :ref:`Documentation/admin-guide/mm/pagemap.rst ` for more information about ``/proc/pid/pagemap``, ``/proc/kpageflags``, and ``/proc/kpagecgroup``.
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 14ebd1695223..30cb0a0713ff 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -78,6 +78,8 @@ #define PROC_KPAGECOUNT "/proc/kpagecount" #define PROC_KPAGECGROUP "/proc/kpagecgroup" +#define SYS_KERNEL_MM_PAGE_IDLE "/sys/kernel/mm/page_idle/bitmap" + /* [32-] kernel hacking assistances */ #define KPF_RESERVED 32 #define KPF_MLOCKED 33 @@ -169,6 +171,7 @@ static const char * const debugfs_known_mountpoints[] = { static int opt_raw; /* for kernel developers */ static int opt_list; /* list pages (in ranges) */ +static int opt_mark_idle; /* set accessed bit */ static int opt_no_summary; /* don't show summary */ static pid_t opt_pid; /* process to walk */ const char *opt_file; /* file or directory path */ @@ -198,6 +201,7 @@ static int pagemap_fd; static int kpageflags_fd; static int kpagecount_fd = -1; static int kpagecgroup_fd = -1; +static int page_idle_fd = -1; static int opt_hwpoison; static int opt_unpoison; @@ -587,6 +591,30 @@ static int unpoison_page(unsigned long offset) return 0; } +static int mark_page_idle(unsigned long offset) +{ + static unsigned long off; + static uint64_t buf; + int len; + + if ((offset / 64 == off / 64) || buf == 0) { + buf |= 1UL << (offset % 64); + off = offset; + return 0; + } + + len = pwrite(page_idle_fd, &buf, 8, 8 * (off / 64)); + if (len < 0) { + perror("mark page idle"); + return len; + } + + buf = 1UL << (offset % 64); + off = offset; + + return 0; +} + /* * page frame walker */ @@ -635,6 +663,9 @@ static void add_page(unsigned long voffset, unsigned long offset, if (opt_unpoison) unpoison_page(offset); + if (opt_mark_idle) + mark_page_idle(offset); + if (opt_list == 1) show_page_range(voffset, offset, 1, flags, cgroup, mapcnt); else if (opt_list == 2) @@ -783,6 +814,9 @@ static void walk_addr_ranges(void) else walk_task(opt_offset[i], opt_size[i]); + if (opt_mark_idle) + mark_page_idle(0); + close(kpageflags_fd); } @@ -813,6 +847,7 @@ static void 
usage(void) " -c|--cgroup path|@inode Walk pages within memory cgroup\n" " -p|--pid pid Walk process address space\n" " -f|--file filename Walk file address space\n" +" -i|--mark-idle Mark pages idle\n" " -l|--list Show page details in ranges\n" " -L|--list-each Show page details one by one\n" " -C|--list-cgroup Show cgroup inode for pages\n" @@ -1221,6 +1256,7 @@ static const struct option opts[] = { { "bits" , 1, NULL, 'b' }, { "cgroup" , 1, NULL, 'c' }, { "describe" , 1, NULL, 'd' }, + { "mark-idle" , 0, NULL, 'i' }, { "list" , 0, NULL, 'l' }, { "list-each" , 0, NULL, 'L' }, { "list-cgroup", 0, NULL, 'C' }, @@ -1240,7 +1276,7 @@ int main(int argc, char *argv[]) page_size = getpagesize(); while ((c = getopt_long(argc, argv, - "rp:f:a:b:d:c:ClLMNXxF:h", + "rp:f:a:b:d:c:CilLMNXxF:h", opts, NULL)) != -1) { switch (c) { case 'r': @@ -1267,6 +1303,9 @@ int main(int argc, char *argv[]) case 'd': describe_flags(optarg); exit(0); + case 'i': + opt_mark_idle = 1; + break; case 'l': opt_list = 1; break; @@ -1308,6 +1347,9 @@ int main(int argc, char *argv[]) if (opt_list && opt_list_mapcnt) kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY); + if (opt_mark_idle && opt_file) + page_idle_fd = checked_open(SYS_KERNEL_MM_PAGE_IDLE, O_RDWR); + if (opt_list && opt_pid) printf("voffset\t"); if (opt_list && opt_file) @@ -1341,5 +1383,8 @@ int main(int argc, char *argv[]) if (opt_list_mapcnt) close(kpagecount_fd); + if (page_idle_fd >= 0) + close(page_idle_fd); + return 0; } From d6a24df00638a09abd8fb99bdb5a1c7fdd536e59 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 17 Aug 2018 15:45:05 -0700 Subject: [PATCH 029/111] mm, page_alloc: actually ignore mempolicies for high priority allocations __alloc_pages_slowpath() has for a long time contained code to ignore node restrictions from memory policies for high priority allocations. 
The current code that resets the zonelist iterator however does effectively nothing after commit 7810e6781e0f ("mm, page_alloc: do not break __GFP_THISNODE by zonelist reset") removed a buggy zonelist reset. Even before that commit, mempolicy restrictions were still not ignored, as they are passed in ac->nodemask which is untouched by the code. We can either remove the code, or make it work as intended. Since ac->nodemask can be set from task's mempolicy via alloc_pages_current() and thus also alloc_pages(), it may indeed affect kernel allocations, and it makes sense to ignore it to allow progress for high priority allocations. Thus, this patch resets ac->nodemask to NULL in such cases. This assumes all callers can handle it (i.e. there are no guarantees as in the case of __GFP_THISNODE) which seems to be the case. The same assumption is already present in check_retry_cpuset() for some time. The expected effect is that high priority kernel allocations in the context of userspace tasks (e.g. OOM victims) restricted by mempolicies will have higher chance to succeed if they are restricted to nodes with depleted memory, while there are other nodes with free memory left. It's not a new intention, but for the first time the code will match the intention, AFAICS. It was intended by commit 183f6371aac2 ("mm: ignore mempolicies when using ALLOC_NO_WATERMARK") in v3.6 but I think it never really worked, as mempolicy restriction was already encoded in nodemask, not zonelist, at that time. So originally that was for ALLOC_NO_WATERMARK only. Then it was adjusted by e46e7b77c909 ("mm, page_alloc: recalculate the preferred zoneref if the context can ignore memory policies") and cd04ae1e2dc8 ("mm, oom: do not rely on TIF_MEMDIE for memory reserves access") to the current state. So even GFP_ATOMIC would now ignore mempolicies after the initial attempts fail - if the code worked as people thought it does. 
Link: http://lkml.kernel.org/r/20180612122624.8045-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Mel Gorman Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 33f6745bb649..0303a3b24610 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4165,11 +4165,12 @@ retry: alloc_flags = reserve_flags; /* - * Reset the zonelist iterators if memory policies can be ignored. - * These allocations are high priority and system rather than user - * orientated. + * Reset the nodemask and zonelist iterators if memory policies can be + * ignored. These allocations are high priority and system rather than + * user oriented. */ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { + ac->nodemask = NULL; ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); } From 46c9a946d766fa830a85d6599de5891fe9e717d2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Aug 2018 15:45:09 -0700 Subject: [PATCH 030/111] shmem: use monotonic time for i_generation get_seconds() is deprecated because it will lead to a 32-bit overflow in 2038 or 2106. We don't need the i_generation to be strictly monotonic anyway, and other file systems like ext4 and xfs just use prandom_u32(), so let's use the same one here. If this is considered too slow, we could also use ktime_get_seconds() or ktime_get_real_seconds() to keep the previous behavior. Both of these return a time64_t and are not deprecated, but only return a unique value once per second, and are predictable. Link: http://lkml.kernel.org/r/20180620082556.581543-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Reviewed-by: Andrew Morton Cc: Hugh Dickins Cc: Mike Kravetz Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 06ebe17bb924..c48c79018a7c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -2188,7 +2189,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - inode->i_generation = get_seconds(); + inode->i_generation = prandom_u32(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); From b3a2369692fedcc439dfaa4f215cf5323a886c88 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 17 Aug 2018 15:45:12 -0700 Subject: [PATCH 031/111] include/linux/page_ext.h: drop definition of unused PAGE_EXT_DEBUG_POISON After commit bd33ef368135 ("mm: enable page poisoning early at boot") PAGE_EXT_DEBUG_POISON is no longer used. Remove it. Link: http://lkml.kernel.org/r/20180531135457.20167-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reviewed-by: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ext.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index ca5461efae2f..bbec618a614b 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -16,18 +16,7 @@ struct page_ext_operations { #ifdef CONFIG_PAGE_EXTENSION -/* - * page_ext->flags bits: - * - * PAGE_EXT_DEBUG_POISON is set for poisoned pages. This is used to - * implement generic debug pagealloc feature. The pages are filled with - * poison patterns and set this flag after free_pages(). The poisoned - * pages are verified whether the patterns are not corrupted and clear - * the flag before alloc_pages().
- */ - enum page_ext_flags { - PAGE_EXT_DEBUG_POISON, /* Page is poisoned */ PAGE_EXT_DEBUG_GUARD, PAGE_EXT_OWNER, #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) From 10ed63415223b16d7e80ba528556500231814232 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 17 Aug 2018 15:45:15 -0700 Subject: [PATCH 032/111] mm/page_ext.c: constify lookup_page_ext() argument lookup_page_ext() finds 'struct page_ext' for a given page. It requires only read access to the given struct page. The current implementation takes 'struct page *' as an argument. It makes the compiler complain when 'const struct page *' is passed. Change the argument to 'const struct page *'. Link: http://lkml.kernel.org/r/20180531135457.20167-3-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reviewed-by: Andrew Morton Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ext.h | 4 ++-- mm/page_ext.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index bbec618a614b..f84f167ec04c 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -50,7 +50,7 @@ static inline void page_ext_init(void) } #endif -struct page_ext *lookup_page_ext(struct page *page); +struct page_ext *lookup_page_ext(const struct page *page); #else /* !CONFIG_PAGE_EXTENSION */ struct page_ext; @@ -59,7 +59,7 @@ static inline void pgdat_page_ext_init(struct pglist_data *pgdat) { } -static inline struct page_ext *lookup_page_ext(struct page *page) +static inline struct page_ext *lookup_page_ext(const struct page *page) { return NULL; } diff --git a/mm/page_ext.c b/mm/page_ext.c index 5295ef331165..a9826da84ccb 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -120,7 +120,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) pgdat->node_page_ext = NULL; } -struct page_ext *lookup_page_ext(struct page *page) +struct page_ext *lookup_page_ext(const struct page
*page) { unsigned long pfn = page_to_pfn(page); unsigned long index; @@ -195,7 +195,7 @@ fail: #else /* CONFIG_FLAT_NODE_MEM_MAP */ -struct page_ext *lookup_page_ext(struct page *page) +struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); From bb451fdf3d058dfecee1c0b965342d344e7d8317 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 17 Aug 2018 15:45:19 -0700 Subject: [PATCH 033/111] mm/vmscan.c: condense scan_control Use smaller scan_control fields for order, priority, and reclaim_idx. Convert fields from int => s8. All easily fit within a byte: - allocation order range: 0..MAX_ORDER(64?) - priority range: 0..12(DEF_PRIORITY) - reclaim_idx range: 0..6(__MAX_NR_ZONES) Since 6538b8ea886e ("x86_64: expand kernel stack to 16K") x86_64 stack overflows are not an issue. But it's inefficient to use ints. Use s8 (signed byte) rather than u8 to allow for loops like: do { ... } while (--sc.priority >= 0); Add BUILD_BUG_ON to verify that s8 is capable of storing max values. This reduces sizeof(struct scan_control): - 96 => 80 bytes (x86_64) - 68 => 56 bytes (i386) scan_control structure field order is changed to utilize padding. After this patch there is 1 bit of scan_control padding. akpm: makes my vmscan.o's .text 572 bytes smaller as well. 
Link: http://lkml.kernel.org/r/20180530061212.84915-1-gthelen@google.com Signed-off-by: Greg Thelen Suggested-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 03822f86f288..a00d94530e57 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -65,12 +65,6 @@ struct scan_control { /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; - /* This context's GFP mask */ - gfp_t gfp_mask; - - /* Allocation order */ - int order; - /* * Nodemask of nodes allowed by the caller. If NULL, all nodes * are scanned. @@ -83,12 +77,6 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; - /* Scan (total_size >> priority) pages at once */ - int priority; - - /* The highest zone to isolate pages for reclaim from */ - enum zone_type reclaim_idx; - /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; @@ -111,6 +99,18 @@ struct scan_control { /* One of the zones is ready for compaction */ unsigned int compaction_ready:1; + /* Allocation order */ + s8 order; + + /* Scan (total_size >> priority) pages at once */ + s8 priority; + + /* The highest zone to isolate pages for reclaim from */ + s8 reclaim_idx; + + /* This context's GFP mask */ + gfp_t gfp_mask; + /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -3063,6 +3063,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_swap = 1, }; + /* + * scan_control uses s8 fields for order, priority, and reclaim_idx. + * Confirm they are large enough for max values. + */ + BUILD_BUG_ON(MAX_ORDER > S8_MAX); + BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); + BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); + /* * Do not enter reclaim if fatal signal was delivered while throttled. 
* 1 is returned so that the page allocator does not OOM kill at this From 8cded8668e1f49ab9b90682bca76e861782416e9 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Fri, 17 Aug 2018 15:45:22 -0700 Subject: [PATCH 034/111] mm/mempool.c: remove unused argument in kasan_unpoison_element() and remove_element() The argument "gfp_t flags" is not used in kasan_unpoison_element() and remove_element(), so remove it. Link: http://lkml.kernel.org/r/20180621070332.16633-1-baijiaju1990@gmail.com Signed-off-by: Jia-Ju Bai Reviewed-by: Matthew Wilcox Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index b54f2c20e5e0..44f5fa98c1e7 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -111,7 +111,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) kasan_free_pages(element, (unsigned long)pool->pool_data); } -static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) +static void kasan_unpoison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_unpoison_slab(element); @@ -127,12 +127,12 @@ static __always_inline void add_element(mempool_t *pool, void *element) pool->elements[pool->curr_nr++] = element; } -static void *remove_element(mempool_t *pool, gfp_t flags) +static void *remove_element(mempool_t *pool) { void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); - kasan_unpoison_element(pool, element, flags); + kasan_unpoison_element(pool, element); check_element(pool, element); return element; } @@ -151,7 +151,7 @@ static void *remove_element(mempool_t *pool, gfp_t flags) void mempool_exit(mempool_t *pool) { while (pool->curr_nr) { - void *element = remove_element(pool, GFP_KERNEL); + void *element = remove_element(pool); pool->free(element, pool->pool_data); } kfree(pool->elements); @@ 
-301,7 +301,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr) spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { while (new_min_nr < pool->curr_nr) { - element = remove_element(pool, GFP_KERNEL); + element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); spin_lock_irqsave(&pool->lock, flags); @@ -387,7 +387,7 @@ repeat_alloc: spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr)) { - element = remove_element(pool, gfp_temp); + element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); From c2231020ea7b53d486395dbd8d3216e0dd1fc7ee Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 17 Aug 2018 15:45:26 -0700 Subject: [PATCH 035/111] mm: thp: register mm for khugepaged when merging vma for shmem When merging anonymous page vma, if the size of the vma can fit in at least one hugepage, the mm will be registered for khugepaged for collapsing THP in the future. But it skips shmem vmas. Do so for shmem also, but not for file-private mappings when merging a vma in order to increase the odds of collapsing a hugepage via khugepaged. hugepage_vma_check() sounds like a good fit to do the check. And move the definition of it before khugepaged_enter_vma_merge() to avoid a build error. Link: http://lkml.kernel.org/r/1529697791-6950-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Kirill A. 
Shutemov Cc: Hugh Dickins Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 53 ++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d7b2a4bf8671..22da712022de 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -397,6 +397,25 @@ static inline int khugepaged_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +static bool hugepage_vma_check(struct vm_area_struct *vma) +{ + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + return false; + if (shmem_file(vma->vm_file)) { + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + return false; + return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, + HPAGE_PMD_NR); + } + if (!vma->anon_vma || vma->vm_ops) + return false; + if (is_vma_temporary_stack(vma)) + return false; + return !(vma->vm_flags & VM_NO_KHUGEPAGED); +} + int __khugepaged_enter(struct mm_struct *mm) { struct mm_slot *mm_slot; @@ -434,15 +453,14 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags) { unsigned long hstart, hend; - if (!vma->anon_vma) - /* - * Not yet faulted in so we will register later in the - * page fault if needed. - */ - return 0; - if (vma->vm_ops || (vm_flags & VM_NO_KHUGEPAGED)) - /* khugepaged not yet working on file or special mappings */ + + /* + * khugepaged does not yet work on non-shmem files or special + * mappings. And file-private shmem THP is not supported. 
+ */ + if (!hugepage_vma_check(vma)) return 0; + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -819,25 +837,6 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) } #endif -static bool hugepage_vma_check(struct vm_area_struct *vma) -{ - if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; - if (shmem_file(vma->vm_file)) { - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) - return false; - return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, - HPAGE_PMD_NR); - } - if (!vma->anon_vma || vma->vm_ops) - return false; - if (is_vma_temporary_stack(vma)) - return false; - return !(vma->vm_flags & VM_NO_KHUGEPAGED); -} - /* * If mmap_sem temporarily dropped, revalidate vma * before taking mmap_sem. From 87aa752906ecf69842ef5ac1d0677aa75a4e0aac Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 17 Aug 2018 15:45:29 -0700 Subject: [PATCH 036/111] mm: thp: inc counter for collapsed shmem THP /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed is used to record the counter of collapsed THP, but it just gets inc'ed in anonymous THP collapse path, do this for shmem THP collapse too. Link: http://lkml.kernel.org/r/1529622949-75504-2-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Kirill A. 
Shutemov Cc: Hugh Dickins Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 22da712022de..79d55e10bca9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1516,6 +1516,8 @@ tree_unlocked: unlock_page(new_page); *hpage = NULL; + + khugepaged_pages_collapsed++; } else { /* Something went wrong: rollback changes to the radix-tree */ shmem_uncharge(mapping->host, nr_none); From 357c1206520da7a40e383fe329ce379bda722cd9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Aug 2018 15:45:32 -0700 Subject: [PATCH 037/111] mpage: add argument structure for do_mpage_readpage() Patch series "Submit ->readpages() IO as read-ahead", v4. The only caller of ->readpages() is from read-ahead, yet we don't submit IO flagged with REQ_RAHEAD. This means we don't see it in blktrace, for instance, which is a shame. Additionally, it's preventing further functional changes in the block layer for dealing with read-ahead more intelligently. We already make assumptions about ->readpages() just being for read-ahead in the mpage implementation, using readahead_gfp_mask(mapping) as our GFP mask of choice. This small series fixes up mpage_readpages() to submit with REQ_RAHEAD, which takes care of file systems using mpage_readpages(). The first patch is a prep patch, that makes do_mpage_readpage() take an argument structure. This patch (of 4): We're currently passing 8 arguments to this function, clean it up a bit by packing the arguments in an args structure we pass to it. No intentional functional changes in this patch.
[akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20180621010725.17813-2-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Andrew Morton Cc: Al Viro Cc: Christoph Hellwig Cc: Theodore Ts'o Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/mpage.c | 109 ++++++++++++++++++++++++++++------------------------- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/fs/mpage.c b/fs/mpage.c index b73638db9866..6dc90e456abf 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -133,6 +133,17 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) } while (page_bh != head); } +struct mpage_readpage_args { + struct bio *bio; + struct page *page; + unsigned int nr_pages; + sector_t last_block_in_bio; + struct buffer_head map_bh; + unsigned long first_logical_block; + get_block_t *get_block; + gfp_t gfp; +}; + /* * This is the worker routine which does all the work of mapping the disk * blocks and constructs largest possible bios, submits them for IO if the @@ -142,16 +153,14 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) * represent the validity of its disk mapping and to decide when to do the next * get_block() call. 
*/ -static struct bio * -do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, - sector_t *last_block_in_bio, struct buffer_head *map_bh, - unsigned long *first_logical_block, get_block_t get_block, - gfp_t gfp) +static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) { + struct page *page = args->page; struct inode *inode = page->mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; + struct buffer_head *map_bh = &args->map_bh; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; @@ -168,7 +177,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, goto confused; block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); - last_block = block_in_file + nr_pages * blocks_per_page; + last_block = block_in_file + args->nr_pages * blocks_per_page; last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; @@ -178,9 +187,10 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, * Map blocks using the result from the previous get_blocks call first. 
*/ nblocks = map_bh->b_size >> blkbits; - if (buffer_mapped(map_bh) && block_in_file > *first_logical_block && - block_in_file < (*first_logical_block + nblocks)) { - unsigned map_offset = block_in_file - *first_logical_block; + if (buffer_mapped(map_bh) && + block_in_file > args->first_logical_block && + block_in_file < (args->first_logical_block + nblocks)) { + unsigned map_offset = block_in_file - args->first_logical_block; unsigned last = nblocks - map_offset; for (relative_block = 0; ; relative_block++) { @@ -208,9 +218,9 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, if (block_in_file < last_block) { map_bh->b_size = (last_block-block_in_file) << blkbits; - if (get_block(inode, block_in_file, map_bh, 0)) + if (args->get_block(inode, block_in_file, map_bh, 0)) goto confused; - *first_logical_block = block_in_file; + args->first_logical_block = block_in_file; } if (!buffer_mapped(map_bh)) { @@ -273,43 +283,45 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, /* * This page will go to BIO. Do we need to send this BIO off first? 
*/ - if (bio && (*last_block_in_bio != blocks[0] - 1)) - bio = mpage_bio_submit(REQ_OP_READ, 0, bio); + if (args->bio && (args->last_block_in_bio != blocks[0] - 1)) + args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); alloc_new: - if (bio == NULL) { + if (args->bio == NULL) { if (first_hole == blocks_per_page) { if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9), page)) goto out; } - bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - min_t(int, nr_pages, BIO_MAX_PAGES), gfp); - if (bio == NULL) + args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), + min_t(int, args->nr_pages, + BIO_MAX_PAGES), + args->gfp); + if (args->bio == NULL) goto confused; } length = first_hole << blkbits; - if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(REQ_OP_READ, 0, bio); + if (bio_add_page(args->bio, page, length, 0) < length) { + args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); goto alloc_new; } - relative_block = block_in_file - *first_logical_block; + relative_block = block_in_file - args->first_logical_block; nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || (first_hole != blocks_per_page)) - bio = mpage_bio_submit(REQ_OP_READ, 0, bio); + args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); else - *last_block_in_bio = blocks[blocks_per_page - 1]; + args->last_block_in_bio = blocks[blocks_per_page - 1]; out: - return bio; + return args->bio; confused: - if (bio) - bio = mpage_bio_submit(REQ_OP_READ, 0, bio); + if (args->bio) + args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); if (!PageUptodate(page)) - block_read_full_page(page, get_block); + block_read_full_page(page, args->get_block); else unlock_page(page); goto out; @@ -363,15 +375,12 @@ int mpage_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages, get_block_t get_block) { - struct bio *bio = NULL; + struct mpage_readpage_args args = { + .get_block = get_block, + .gfp = 
readahead_gfp_mask(mapping), + }; unsigned page_idx; - sector_t last_block_in_bio = 0; - struct buffer_head map_bh; - unsigned long first_logical_block = 0; - gfp_t gfp = readahead_gfp_mask(mapping); - map_bh.b_state = 0; - map_bh.b_size = 0; for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = lru_to_page(pages); @@ -379,18 +388,16 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, list_del(&page->lru); if (!add_to_page_cache_lru(page, mapping, page->index, - gfp)) { - bio = do_mpage_readpage(bio, page, - nr_pages - page_idx, - &last_block_in_bio, &map_bh, - &first_logical_block, - get_block, gfp); + args.gfp)) { + args.page = page; + args.nr_pages = nr_pages - page_idx; + args.bio = do_mpage_readpage(&args); } put_page(page); } BUG_ON(!list_empty(pages)); - if (bio) - mpage_bio_submit(REQ_OP_READ, 0, bio); + if (args.bio) + mpage_bio_submit(REQ_OP_READ, 0, args.bio); return 0; } EXPORT_SYMBOL(mpage_readpages); @@ -400,18 +407,16 @@ EXPORT_SYMBOL(mpage_readpages); */ int mpage_readpage(struct page *page, get_block_t get_block) { - struct bio *bio = NULL; - sector_t last_block_in_bio = 0; - struct buffer_head map_bh; - unsigned long first_logical_block = 0; - gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); + struct mpage_readpage_args args = { + .page = page, + .nr_pages = 1, + .get_block = get_block, + .gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL), + }; - map_bh.b_state = 0; - map_bh.b_size = 0; - bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, - &map_bh, &first_logical_block, get_block, gfp); - if (bio) - mpage_bio_submit(REQ_OP_READ, 0, bio); + args.bio = do_mpage_readpage(&args); + if (args.bio) + mpage_bio_submit(REQ_OP_READ, 0, args.bio); return 0; } EXPORT_SYMBOL(mpage_readpage); From 74c8164e1cdb1eb22f1d49d54e515e81821a8ad0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Aug 2018 15:45:36 -0700 Subject: [PATCH 038/111] mpage: mpage_readpages() should submit IO as 
read-ahead a_ops->readpages() is only ever used for read-ahead, yet we don't flag the IO being submitted as such. Fix that up. Any file system that uses mpage_readpages() as its ->readpages() implementation will now get this right. Since we're passing in whether the IO is read-ahead or not, we don't need to pass in the 'gfp' separately, as it is dependent on the IO being read-ahead. Kill off that member. Add some documentation notes on ->readpages() being purely for read-ahead. Link: http://lkml.kernel.org/r/20180621010725.17813-3-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Andrew Morton Cc: Al Viro Cc: Chris Mason Cc: Christoph Hellwig Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/data.c | 5 +++++ fs/mpage.c | 29 +++++++++++++++++++---------- include/linux/fs.h | 4 ++++ 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8f931d699287..b7c9b58acf3e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1421,6 +1421,11 @@ out: /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. + * + * Note that the aops->readpages() function is ONLY used for read-ahead. If + * this function ever deviates from doing just read-ahead, it should either + * use ->readpage() or do the necessary surgery to decouple ->readpages() + * readom read-ahead. 
*/ static int f2fs_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, diff --git a/fs/mpage.c b/fs/mpage.c index 6dc90e456abf..c820dc9bebab 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -137,11 +137,11 @@ struct mpage_readpage_args { struct bio *bio; struct page *page; unsigned int nr_pages; + bool is_readahead; sector_t last_block_in_bio; struct buffer_head map_bh; unsigned long first_logical_block; get_block_t *get_block; - gfp_t gfp; }; /* @@ -170,8 +170,18 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) struct block_device *bdev = NULL; int length; int fully_mapped = 1; + int op_flags; unsigned nblocks; unsigned relative_block; + gfp_t gfp; + + if (args->is_readahead) { + op_flags = REQ_RAHEAD; + gfp = readahead_gfp_mask(page->mapping); + } else { + op_flags = 0; + gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); + } if (page_has_buffers(page)) goto confused; @@ -284,7 +294,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) * This page will go to BIO. Do we need to send this BIO off first? 
*/ if (args->bio && (args->last_block_in_bio != blocks[0] - 1)) - args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); + args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); alloc_new: if (args->bio == NULL) { @@ -296,14 +306,14 @@ alloc_new: args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), min_t(int, args->nr_pages, BIO_MAX_PAGES), - args->gfp); + gfp); if (args->bio == NULL) goto confused; } length = first_hole << blkbits; if (bio_add_page(args->bio, page, length, 0) < length) { - args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); + args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); goto alloc_new; } @@ -311,7 +321,7 @@ alloc_new: nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || (first_hole != blocks_per_page)) - args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); + args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); else args->last_block_in_bio = blocks[blocks_per_page - 1]; out: @@ -319,7 +329,7 @@ out: confused: if (args->bio) - args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); + args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); if (!PageUptodate(page)) block_read_full_page(page, args->get_block); else @@ -377,7 +387,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, { struct mpage_readpage_args args = { .get_block = get_block, - .gfp = readahead_gfp_mask(mapping), + .is_readahead = true, }; unsigned page_idx; @@ -388,7 +398,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, list_del(&page->lru); if (!add_to_page_cache_lru(page, mapping, page->index, - args.gfp)) { + readahead_gfp_mask(mapping))) { args.page = page; args.nr_pages = nr_pages - page_idx; args.bio = do_mpage_readpage(&args); @@ -397,7 +407,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, } BUG_ON(!list_empty(pages)); if (args.bio) - mpage_bio_submit(REQ_OP_READ, 0, args.bio); + 
mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio); return 0; } EXPORT_SYMBOL(mpage_readpages); @@ -411,7 +421,6 @@ int mpage_readpage(struct page *page, get_block_t get_block) .page = page, .nr_pages = 1, .get_block = get_block, - .gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL), }; args.bio = do_mpage_readpage(&args); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9d319f1f66f6..a9242f336f02 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -344,6 +344,10 @@ struct address_space_operations { /* Set a page dirty. Return true if this dirtied it */ int (*set_page_dirty)(struct page *page); + /* + * Reads in the requested pages. Unlike ->readpage(), this is + * PURELY used for read-ahead!. + */ int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); From 5e9d398240b2292b1091f921d29bbab374b755fd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Aug 2018 15:45:39 -0700 Subject: [PATCH 039/111] btrfs: readpages() should submit IO as read-ahead a_ops->readpages() is only ever used for read-ahead. Ensure that we pass this information down to the block layer. 
Link: http://lkml.kernel.org/r/20180621010725.17813-4-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Andrew Morton Cc: Al Viro Cc: Chris Mason Cc: Christoph Hellwig Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 628f1aef34b0..4dd6faab02bb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3102,7 +3102,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, for (index = 0; index < nr_pages; index++) { __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, - bio, 0, bio_flags, 0, prev_em_start); + bio, 0, bio_flags, REQ_RAHEAD, prev_em_start); put_page(pages[index]); } } From ac22b46a0b65dbeccbf4d458db95687e825bde90 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Aug 2018 15:45:42 -0700 Subject: [PATCH 040/111] ext4: readpages() should submit IO as read-ahead a_ops->readpages() is only ever used for read-ahead. Ensure that we pass this information down to the block layer. 
Link: http://lkml.kernel.org/r/20180621010725.17813-5-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Andrew Morton Cc: Al Viro Cc: Chris Mason Cc: Christoph Hellwig Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 5 +++-- fs/ext4/readpage.c | 5 +++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1fc013f3d944..0f0edd1cd0cd 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3062,7 +3062,7 @@ static inline void ext4_set_de_type(struct super_block *sb, /* readpages.c */ extern int ext4_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, - unsigned nr_pages); + unsigned nr_pages, bool is_readahead); /* symlink.c */ extern const struct inode_operations ext4_encrypted_symlink_inode_operations; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8f6ad7667974..d0dd585add6a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3325,7 +3325,8 @@ static int ext4_readpage(struct file *file, struct page *page) ret = ext4_readpage_inline(inode, page); if (ret == -EAGAIN) - return ext4_mpage_readpages(page->mapping, NULL, page, 1); + return ext4_mpage_readpages(page->mapping, NULL, page, 1, + false); return ret; } @@ -3340,7 +3341,7 @@ ext4_readpages(struct file *file, struct address_space *mapping, if (ext4_has_inline_data(inode)) return 0; - return ext4_mpage_readpages(mapping, pages, NULL, nr_pages); + return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true); } static void ext4_invalidatepage(struct page *page, unsigned int offset, diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 19b87a8de6ff..f461d75ac049 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -98,7 +98,7 @@ static void mpage_end_io(struct bio *bio) int ext4_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, - unsigned nr_pages) + unsigned nr_pages, bool is_readahead) { 
struct bio *bio = NULL; sector_t last_block_in_bio = 0; @@ -259,7 +259,8 @@ int ext4_mpage_readpages(struct address_space *mapping, bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; bio->bi_private = ctx; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio_set_op_attrs(bio, REQ_OP_READ, + is_readahead ? REQ_RAHEAD : 0); } length = first_hole << blkbits; From c6ddfb6c58903262d2d77042c41dba58cf775d88 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 17 Aug 2018 15:45:46 -0700 Subject: [PATCH 041/111] mm, clear_huge_page: move order algorithm into a separate function Patch series "mm, huge page: Copy target sub-page last when copy huge page", v2. Huge page helps to reduce TLB miss rate, but it has higher cache footprint, sometimes this may cause some issue. For example, when copying huge page on x86_64 platform, the cache footprint is 4M. But on a Xeon E5 v3 2699 CPU, there are 18 cores, 36 threads, and only 45M LLC (last level cache). That is, on average, there are 2.5M LLC for each core and 1.25M LLC for each thread. If the cache contention is heavy when copying the huge page, and we copy the huge page from the beginning to the end, it is possible that the beginning of the huge page is evicted from the cache after we finish copying the end of the huge page. And it is possible for the application to access the beginning of the huge page after copying the huge page. In c79b57e462b5d ("mm: hugetlb: clear target sub-page last when clearing huge page"), to keep the cache lines of the target subpage hot, the order to clear the subpages in the huge page in clear_huge_page() is changed to clearing the subpage which is furthest from the target subpage first, and the target subpage last. A similar change of order helps huge page copying too. That is implemented in this patchset. The patchset is a generic optimization which should benefit quite a few workloads, not for a specific use case.
To demonstrate the performance benefit of the patchset, we have tested it with vm-scalability run on transparent huge page. With this patchset, the throughput increases ~16.6% in vm-scalability anon-cow-seq test case with 36 processes on a 2 socket Xeon E5 v3 2699 system (36 cores, 72 threads). The test case set /sys/kernel/mm/transparent_hugepage/enabled to be always, mmap() a big anonymous memory area and populate it, then forked 36 child processes, each writes to the anonymous memory area from the begin to the end, so cause copy on write. For each child process, other child processes could be seen as other workloads which generate heavy cache pressure. At the same time, the IPC (instruction per cycle) increased from 0.63 to 0.78, and the time spent in user space is reduced ~7.2%. This patch (of 4): In c79b57e462b5d ("mm: hugetlb: clear target sub-page last when clearing huge page"), to keep the cache lines of the target subpage hot, the order to clear the subpages in the huge page in clear_huge_page() is changed to clearing the subpage which is furthest from the target subpage firstly, and the target subpage last. This optimization could be applied to copying huge page too with the same order algorithm. To avoid code duplication and reduce maintenance overhead, in this patch, the order algorithm is moved out of clear_huge_page() into a separate function: process_huge_page(). So that we can use it for copying huge page too. This will change the direct calls to clear_user_highpage() into the indirect calls. But with the proper inline support of the compilers, the indirect call will be optimized to be the direct call. Our tests show no performance change with the patch. This patch is a code cleanup without functionality change. Link: http://lkml.kernel.org/r/20180524005851.4079-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-by: Mike Kravetz Reviewed-by: Mike Kravetz Cc: Andi Kleen Cc: Jan Kara Cc: Michal Hocko Cc: Andrea Arcangeli Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Minchan Kim Cc: Shaohua Li Cc: Christopher Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 98 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 60 insertions(+), 38 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index d7b5b22a1a0a..65bb59e031c9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4599,6 +4599,57 @@ EXPORT_SYMBOL(__might_fault); #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +/* + * Process all subpages of the specified huge page with the specified + * operation. The target subpage will be processed last to keep its + * cache lines hot. + */ +static inline void process_huge_page( + unsigned long addr_hint, unsigned int pages_per_huge_page, + void (*process_subpage)(unsigned long addr, int idx, void *arg), + void *arg) +{ + int i, n, base, l; + unsigned long addr = addr_hint & + ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); + + /* Process target subpage last to keep its cache lines hot */ + might_sleep(); + n = (addr_hint - addr) / PAGE_SIZE; + if (2 * n <= pages_per_huge_page) { + /* If target subpage in first half of huge page */ + base = 0; + l = n; + /* Process subpages at the end of huge page */ + for (i = pages_per_huge_page - 1; i >= 2 * n; i--) { + cond_resched(); + process_subpage(addr + i * PAGE_SIZE, i, arg); + } + } else { + /* If target subpage in second half of huge page */ + base = pages_per_huge_page - 2 * (pages_per_huge_page - n); + l = pages_per_huge_page - n; + /* Process subpages at the begin of huge page */ + for (i = 0; i < base; i++) { + cond_resched(); + process_subpage(addr + i * PAGE_SIZE, i, arg); + } + } + /* + * Process remaining subpages in left-right-left-right pattern + * towards the target subpage + */ + for (i = 0; i < l; i++) { + int left_idx = base + i; + int right_idx = base + 2 * l - 1 - i; + + cond_resched(); + process_subpage(addr + left_idx * PAGE_SIZE, left_idx, 
arg); + cond_resched(); + process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg); + } +} + static void clear_gigantic_page(struct page *page, unsigned long addr, unsigned int pages_per_huge_page) @@ -4613,10 +4664,17 @@ static void clear_gigantic_page(struct page *page, clear_user_highpage(p, addr + i * PAGE_SIZE); } } + +static void clear_subpage(unsigned long addr, int idx, void *arg) +{ + struct page *page = arg; + + clear_user_highpage(page + idx, addr); +} + void clear_huge_page(struct page *page, unsigned long addr_hint, unsigned int pages_per_huge_page) { - int i, n, base, l; unsigned long addr = addr_hint & ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); @@ -4625,43 +4683,7 @@ void clear_huge_page(struct page *page, return; } - /* Clear sub-page to access last to keep its cache lines hot */ - might_sleep(); - n = (addr_hint - addr) / PAGE_SIZE; - if (2 * n <= pages_per_huge_page) { - /* If sub-page to access in first half of huge page */ - base = 0; - l = n; - /* Clear sub-pages at the end of huge page */ - for (i = pages_per_huge_page - 1; i >= 2 * n; i--) { - cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); - } - } else { - /* If sub-page to access in second half of huge page */ - base = pages_per_huge_page - 2 * (pages_per_huge_page - n); - l = pages_per_huge_page - n; - /* Clear sub-pages at the begin of huge page */ - for (i = 0; i < base; i++) { - cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); - } - } - /* - * Clear remaining sub-pages in left-right-left-right pattern - * towards the sub-page to access - */ - for (i = 0; i < l; i++) { - int left_idx = base + i; - int right_idx = base + 2 * l - 1 - i; - - cond_resched(); - clear_user_highpage(page + left_idx, - addr + left_idx * PAGE_SIZE); - cond_resched(); - clear_user_highpage(page + right_idx, - addr + right_idx * PAGE_SIZE); - } + process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page); } static void 
copy_user_gigantic_page(struct page *dst, struct page *src, From c9f4cd71383576a916e7fca99c490fc92a289f5a Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 17 Aug 2018 15:45:49 -0700 Subject: [PATCH 042/111] mm, huge page: copy target sub-page last when copy huge page Huge page helps to reduce TLB miss rate, but it has higher cache footprint, sometimes this may cause some issue. For example, when copying huge page on x86_64 platform, the cache footprint is 4M. But on a Xeon E5 v3 2699 CPU, there are 18 cores, 36 threads, and only 45M LLC (last level cache). That is, on average, there are 2.5M LLC for each core and 1.25M LLC for each thread. If the cache contention is heavy when copying the huge page, and we copy the huge page from the beginning to the end, it is possible that the beginning of the huge page is evicted from the cache after we finish copying the end of the huge page. And it is possible for the application to access the beginning of the huge page after copying the huge page. In c79b57e462b5d ("mm: hugetlb: clear target sub-page last when clearing huge page"), to keep the cache lines of the target subpage hot, the order to clear the subpages in the huge page in clear_huge_page() is changed to clearing the subpage which is furthest from the target subpage first, and the target subpage last. A similar change of order helps huge page copying too. That is implemented in this patch. Because we have put the order algorithm into a separate function, the implementation is quite simple. The patch is a generic optimization which should benefit quite a few workloads, not for a specific use case. To demonstrate the performance benefit of the patch, we tested it with vm-scalability run on transparent huge page. With this patch, the throughput increases ~16.6% in vm-scalability anon-cow-seq test case with 36 processes on a 2 socket Xeon E5 v3 2699 system (36 cores, 72 threads).
The test case set /sys/kernel/mm/transparent_hugepage/enabled to be always, mmap() a big anonymous memory area and populate it, then forked 36 child processes, each writes to the anonymous memory area from the begin to the end, so cause copy on write. For each child process, other child processes could be seen as other workloads which generate heavy cache pressure. At the same time, the IPC (instruction per cycle) increased from 0.63 to 0.78, and the time spent in user space is reduced ~7.2%. Link: http://lkml.kernel.org/r/20180524005851.4079-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Mike Kravetz Cc: Andi Kleen Cc: Jan Kara Cc: Michal Hocko Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Minchan Kim Cc: Shaohua Li Cc: Christopher Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- mm/huge_memory.c | 3 ++- mm/memory.c | 30 +++++++++++++++++++++++------- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 68a5121694ef..2fb32d1561eb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2752,7 +2752,8 @@ extern void clear_huge_page(struct page *page, unsigned long addr_hint, unsigned int pages_per_huge_page); extern void copy_user_huge_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma, + unsigned long addr_hint, + struct vm_area_struct *vma, unsigned int pages_per_huge_page); extern long copy_huge_page_from_user(struct page *dst_page, const void __user *usr_src, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 064a9d78879d..78427af91de9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1328,7 +1328,8 @@ alloc: if (!page) clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); else - copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + copy_user_huge_page(new_page, page, vmf->address, + vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); 
mmun_start = haddr; diff --git a/mm/memory.c b/mm/memory.c index 65bb59e031c9..175f344e1523 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4705,11 +4705,31 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src, } } +struct copy_subpage_arg { + struct page *dst; + struct page *src; + struct vm_area_struct *vma; +}; + +static void copy_subpage(unsigned long addr, int idx, void *arg) +{ + struct copy_subpage_arg *copy_arg = arg; + + copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx, + addr, copy_arg->vma); +} + void copy_user_huge_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma, + unsigned long addr_hint, struct vm_area_struct *vma, unsigned int pages_per_huge_page) { - int i; + unsigned long addr = addr_hint & + ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); + struct copy_subpage_arg arg = { + .dst = dst, + .src = src, + .vma = vma, + }; if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { copy_user_gigantic_page(dst, src, addr, vma, @@ -4717,11 +4737,7 @@ void copy_user_huge_page(struct page *dst, struct page *src, return; } - might_sleep(); - for (i = 0; i < pages_per_huge_page; i++) { - cond_resched(); - copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); - } + process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg); } long copy_huge_page_from_user(struct page *dst_page, From 5b7a1d406062449a4d51aea1df37a73285ced1dc Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 17 Aug 2018 15:45:53 -0700 Subject: [PATCH 043/111] mm, hugetlbfs: rename address to haddr in hugetlb_cow() To take better advantage of general huge page copying optimization, the target subpage address will be passed to hugetlb_cow(), then copy_user_huge_page(). So we will use both target subpage address and huge page size aligned address in hugetlb_cow(). 
To distinguish between them, "haddr" is used for huge page size aligned address to be consistent with Transparent Huge Page naming convention. Now, only huge page size aligned address is used in hugetlb_cow(), so the "address" is renamed to "haddr" in hugetlb_cow() in this patch. Next patch will use target subpage address in hugetlb_cow() too. The patch is just code cleanup without any functionality changes. Link: http://lkml.kernel.org/r/20180524005851.4079-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-by: Mike Kravetz Suggested-by: Michal Hocko Reviewed-by: Mike Kravetz Cc: David Rientjes Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Andi Kleen Cc: Jan Kara Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Minchan Kim Cc: Shaohua Li Cc: Christopher Lameter Cc: "Aneesh Kumar K.V" Cc: Punit Agrawal Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3103099f64fd..e6767a35f7de 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3509,7 +3509,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * Keep the pte_same checks anyway to make transition from the mutex easier. */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, + unsigned long haddr, pte_t *ptep, struct page *pagecache_page, spinlock_t *ptl) { pte_t pte; @@ -3527,7 +3527,7 @@ retry_avoidcopy: * and just make the page writable */ if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { page_move_anon_rmap(old_page, vma); - set_huge_ptep_writable(vma, address, ptep); + set_huge_ptep_writable(vma, haddr, ptep); return 0; } @@ -3551,7 +3551,7 @@ retry_avoidcopy: * be acquired again before returning to the caller, as expected. 
*/ spin_unlock(ptl); - new_page = alloc_huge_page(vma, address, outside_reserve); + new_page = alloc_huge_page(vma, haddr, outside_reserve); if (IS_ERR(new_page)) { /* @@ -3564,11 +3564,10 @@ retry_avoidcopy: if (outside_reserve) { put_page(old_page); BUG_ON(huge_pte_none(pte)); - unmap_ref_private(mm, vma, old_page, address); + unmap_ref_private(mm, vma, old_page, haddr); BUG_ON(huge_pte_none(pte)); spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h), - huge_page_size(h)); + ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; @@ -3593,12 +3592,12 @@ retry_avoidcopy: goto out_release_all; } - copy_user_huge_page(new_page, old_page, address, vma, + copy_user_huge_page(new_page, old_page, haddr, vma, pages_per_huge_page(h)); __SetPageUptodate(new_page); set_page_huge_active(new_page); - mmun_start = address & huge_page_mask(h); + mmun_start = haddr; mmun_end = mmun_start + huge_page_size(h); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); @@ -3607,25 +3606,24 @@ retry_avoidcopy: * before the page tables are altered */ spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h), - huge_page_size(h)); + ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); /* Break COW */ - huge_ptep_clear_flush(vma, address, ptep); + huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); - set_huge_pte_at(mm, address, ptep, + set_huge_pte_at(mm, haddr, ptep, make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); - hugepage_add_new_anon_rmap(new_page, vma, address); + hugepage_add_new_anon_rmap(new_page, vma, haddr); /* Make the old page be freed below */ new_page = old_page; } spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out_release_all: - restore_reserve_on_error(h, vma, address, 
new_page); + restore_reserve_on_error(h, vma, haddr, new_page); put_page(new_page); out_release_old: put_page(old_page); From 974e6d66b6b5c6e2d6a3ccc18b2f9a0b472be5b4 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 17 Aug 2018 15:45:57 -0700 Subject: [PATCH 044/111] mm, hugetlbfs: pass fault address to cow handler This is to take better advantage of the general huge page copying optimization. Where, the target subpage will be copied last to avoid the cache lines of target subpage to be evicted when copying other subpages. This works better if the address of the target subpage is available when copying huge page. So hugetlbfs page fault handlers are changed to pass that information to hugetlb_cow(). This will benefit workloads which don't access the begin of the hugetlbfs huge page after the page fault under heavy cache contention. Link: http://lkml.kernel.org/r/20180524005851.4079-5-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Mike Kravetz Cc: Michal Hocko Cc: David Rientjes Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Andi Kleen Cc: Jan Kara Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Minchan Kim Cc: Shaohua Li Cc: Christopher Lameter Cc: "Aneesh Kumar K.V" Cc: Punit Agrawal Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e6767a35f7de..f1bcaae0d73a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3509,7 +3509,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * Keep the pte_same checks anyway to make transition from the mutex easier. 
*/ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long haddr, pte_t *ptep, + unsigned long address, pte_t *ptep, struct page *pagecache_page, spinlock_t *ptl) { pte_t pte; @@ -3518,6 +3518,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, int ret = 0, outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + unsigned long haddr = address & huge_page_mask(h); pte = huge_ptep_get(ptep); old_page = pte_page(pte); @@ -3592,7 +3593,7 @@ retry_avoidcopy: goto out_release_all; } - copy_user_huge_page(new_page, old_page, haddr, vma, + copy_user_huge_page(new_page, old_page, address, vma, pages_per_huge_page(h)); __SetPageUptodate(new_page); set_page_huge_active(new_page); @@ -3826,7 +3827,7 @@ retry: hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, haddr, ptep, page, ptl); + ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); } spin_unlock(ptl); @@ -3980,7 +3981,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { - ret = hugetlb_cow(mm, vma, haddr, ptep, + ret = hugetlb_cow(mm, vma, address, ptep, pagecache_page, ptl); goto out_put_page; } From 9ea9a68064035dee70f465f605a3e63990e33bd9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 17 Aug 2018 15:46:01 -0700 Subject: [PATCH 045/111] mm: drop VM_BUG_ON from __get_free_pages There is no real reason to blow up just because the caller doesn't know that __get_free_pages cannot return highmem pages. Simply fix that up silently. Even if we have some confused users such a fixup will not be harmful. 
[akpm@linux-foundation.org: mask off __GFP_HIGHMEM] Link: http://lkml.kernel.org/r/20180622162841.25114-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Jiankang Chen Cc: Mel Gorman Cc: Johannes Weiner Cc: Yisheng Xie Cc: Hanjun Guo Cc: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0303a3b24610..e1517bb143dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4404,19 +4404,15 @@ out: EXPORT_SYMBOL(__alloc_pages_nodemask); /* - * Common helper functions. + * Common helper functions. Never use with __GFP_HIGHMEM because the returned + * address cannot represent highmem pages. Use alloc_pages and then kmap if + * you need to access high mem. */ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) { struct page *page; - /* - * __get_free_pages() returns a virtual address, which cannot represent - * a highmem page - */ - VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); - - page = alloc_pages(gfp_mask, order); + page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order); if (!page) return 0; return (unsigned long) page_address(page); From ae1e16da14b2bca94272c9f23c930be48994b2bb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 17 Aug 2018 15:46:05 -0700 Subject: [PATCH 046/111] mm: workingset: remove local_irq_disable() from count_shadow_nodes() Patch series "mm: use irq locking suffix instead local_irq_disable()". A small series which avoids using local_irq_disable()/local_irq_enable() but instead does spin_lock_irq()/spin_unlock_irq() so it is within the context of the lock which it belongs to. Patch #1 is a cleanup where local_irq_.*() remained after the lock was removed. 
This patch (of 2): In 0c7c1bed7e13 ("mm: make counting of list_lru_one::nr_items lockless") the spin_lock(&nlru->lock); statement was replaced with rcu_read_lock(); in __list_lru_count_one(). The comment in count_shadow_nodes() says that the local_irq_disable() is required because the lock must be acquired with disabled interrupts and (spin_lock()) does not do so. Since the lock is replaced with rcu_read_lock() the local_irq_disable() is no longer needed. The code path is list_lru_shrink_count() -> list_lru_count_one() -> __list_lru_count_one() -> rcu_read_lock() -> list_lru_from_memcg_idx() -> rcu_read_unlock() Remove the local_irq_disable() statement. Link: http://lkml.kernel.org/r/20180622151221.28167-2-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Andrew Morton Reviewed-by: Kirill Tkhai Acked-by: Vladimir Davydov Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/workingset.c b/mm/workingset.c index 40ee02c83978..ed8151180899 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -366,10 +366,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, unsigned long nodes; unsigned long cache; - /* list_lru lock nests inside the IRQ-safe i_pages lock */ - local_irq_disable(); nodes = list_lru_shrink_count(&shadow_nodes, sc); - local_irq_enable(); /* * Approximate a reasonable limit for the radix tree nodes From 6ca342d020e8b8315d430d3a7d7c65daadfb4d8b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 17 Aug 2018 15:46:08 -0700 Subject: [PATCH 047/111] mm: workingset: make shadow_lru_isolate() use locking suffix shadow_lru_isolate() disables interrupts and acquires a lock. It could use spin_lock_irq() instead. It also uses local_irq_enable() while it could use spin_unlock_irq()/xa_unlock_irq(). Use proper suffix for lock/unlock in order to enable/disable interrupts during release/acquire of a lock. 
Link: http://lkml.kernel.org/r/20180622151221.28167-3-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Andrew Morton Cc: Vladimir Davydov Cc: Kirill Tkhai Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/workingset.c b/mm/workingset.c index ed8151180899..529480c21f93 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -431,7 +431,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { - spin_unlock(lru_lock); + spin_unlock_irq(lru_lock); ret = LRU_RETRY; goto out; } @@ -469,13 +469,11 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, workingset_lookup_update(mapping)); out_invalid: - xa_unlock(&mapping->i_pages); + xa_unlock_irq(&mapping->i_pages); ret = LRU_REMOVED_RETRY; out: - local_irq_enable(); cond_resched(); - local_irq_disable(); - spin_lock(lru_lock); + spin_lock_irq(lru_lock); return ret; } From 930eaac5eed23bb52061a50f674753a319216041 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 17 Aug 2018 15:46:11 -0700 Subject: [PATCH 048/111] mm/list_lru.c: fold __list_lru_count_one() into its caller __list_lru_count_one() has a single callsite. 
Acked-by: Vladimir Davydov Cc: Sebastian Andrzej Siewior Cc: Kirill Tkhai Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index fcfb6c89ed47..db679a057f46 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -162,26 +162,20 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, } EXPORT_SYMBOL_GPL(list_lru_isolate_move); -static unsigned long __list_lru_count_one(struct list_lru *lru, - int nid, int memcg_idx) +unsigned long list_lru_count_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; unsigned long count; rcu_read_lock(); - l = list_lru_from_memcg_idx(nlru, memcg_idx); + l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); count = l->nr_items; rcu_read_unlock(); return count; } - -unsigned long list_lru_count_one(struct list_lru *lru, - int nid, struct mem_cgroup *memcg) -{ - return __list_lru_count_one(lru, nid, memcg_cache_id(memcg)); -} EXPORT_SYMBOL_GPL(list_lru_count_one); unsigned long list_lru_count_node(struct list_lru *lru, int nid) From b9ff036082cd1793a59b35c4432644fe44620664 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Fri, 17 Aug 2018 15:46:15 -0700 Subject: [PATCH 049/111] mm/memory_hotplug.c: make add_memory_resource use __try_online_node This is a small cleanup for the memhotplug code. A lot more could be done, but it is better to start somewhere. I tried to unify/remove duplicated code. The following is what this patchset does: 1) add_memory_resource() has code to allocate a node in case it was offline. Since try_online_node has some code for that as well, I just made add_memory_resource() to use that so we can remove duplicated code.. This is better explained in patch 1/4. 
2) register_mem_sect_under_node() will be called only from link_mem_sections() 3) Make register_mem_sect_under_node() a callback of walk_memory_range() 4) Drop unnecessary checks from register_mem_sect_under_node() I have done some tests and I could not see anything broken because of this patchset. add_memory_resource() contains code to allocate a new node in case it is necessary. Since try_online_node() also has some code for this purpose, let us make use of that and remove duplicate code. This introduces __try_online_node(), which is called by add_memory_resource() and try_online_node(). __try_online_node() has two new parameters, start_addr of the node, and if the node should be onlined and registered right away. This is always wanted if we are calling from do_cpu_up(), but not when we are calling from memhotplug code. Nothing changes from the point of view of the users of try_online_node(), since try_online_node passes start_addr=0 and online_node=true to __try_online_node(). Link: http://lkml.kernel.org/r/20180622111839.10071-2-osalvador@techadventures.net Signed-off-by: Oscar Salvador Reviewed-by: Pavel Tatashin Tested-by: Reza Arbab Tested-by: Jonathan Cameron Cc: Pasha Tatashin Cc: Michal Hocko Cc: Vlastimil Babka Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 67 ++++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7deb49f69e27..504ba120bdfc 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1034,8 +1034,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) return pgdat; } -static void rollback_node_hotadd(int nid, pg_data_t *pgdat) +static void rollback_node_hotadd(int nid) { + pg_data_t *pgdat = NODE_DATA(nid); + arch_refresh_nodedata(nid, NULL); free_percpu(pgdat->per_cpu_nodestats); arch_free_nodedata(pgdat); @@ -1046,28 +1048,48 @@ static void rollback_node_hotadd(int 
nid, pg_data_t *pgdat) /** * try_online_node - online a node if offlined * @nid: the node ID - * + * @start: start addr of the node + * @set_node_online: Whether we want to online the node * called by cpu_up() to online a node without onlined memory. + * + * Returns: + * 1 -> a new node has been allocated + * 0 -> the node is already online + * -ENOMEM -> the node could not be allocated */ -int try_online_node(int nid) +static int __try_online_node(int nid, u64 start, bool set_node_online) { - pg_data_t *pgdat; - int ret; + pg_data_t *pgdat; + int ret = 1; if (node_online(nid)) return 0; - mem_hotplug_begin(); - pgdat = hotadd_new_pgdat(nid, 0); + pgdat = hotadd_new_pgdat(nid, start); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; goto out; } - node_set_online(nid); - ret = register_one_node(nid); - BUG_ON(ret); + + if (set_node_online) { + node_set_online(nid); + ret = register_one_node(nid); + BUG_ON(ret); + } out: + return ret; +} + +/* + * Users of this function always want to online/register the node + */ +int try_online_node(int nid) +{ + int ret; + + mem_hotplug_begin(); + ret = __try_online_node(nid, 0, true); mem_hotplug_done(); return ret; } @@ -1099,9 +1121,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) int __ref add_memory_resource(int nid, struct resource *res, bool online) { u64 start, size; - pg_data_t *pgdat = NULL; - bool new_pgdat; - bool new_node; + bool new_node = false; int ret; start = res->start; @@ -1111,11 +1131,6 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) if (ret) return ret; - { /* Stupid hack to suppress address-never-null warning */ - void *p = NODE_DATA(nid); - new_pgdat = !p; - } - mem_hotplug_begin(); /* @@ -1126,17 +1141,13 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) */ memblock_add_node(start, size, nid); - new_node = !node_online(nid); - if (new_node) { - pgdat = hotadd_new_pgdat(nid, start); - ret = 
-ENOMEM; - if (!pgdat) - goto error; - } + ret = __try_online_node(nid, start, false); + if (ret < 0) + goto error; + new_node = ret; /* call arch's memory hotadd */ ret = arch_add_memory(nid, start, size, NULL, true); - if (ret < 0) goto error; @@ -1180,8 +1191,8 @@ register_fail: error: /* rollback pgdat allocation and others */ - if (new_pgdat && pgdat) - rollback_node_hotadd(nid, pgdat); + if (new_node) + rollback_node_hotadd(nid); memblock_remove(start, size); out: From d5b6f6a3610b05e6712cb9c61a85a6dff16e91cf Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Fri, 17 Aug 2018 15:46:18 -0700 Subject: [PATCH 050/111] mm/memory_hotplug.c: call register_mem_sect_under_node() When hotplugging memory, it is possible that two calls are being made to register_mem_sect_under_node(). One comes from __add_section()->hotplug_memory_register() and the other from add_memory_resource()->link_mem_sections() if we had to register a new node. In case we had to register a new node, hotplug_memory_register() will only handle/allocate the memory_block's since register_mem_sect_under_node() will return right away because the node is not online yet. I think it is better if we leave hotplug_memory_register() to handle/allocate only memory_block's and make link_mem_sections() call register_mem_sect_under_node(). So this patch removes the call to register_mem_sect_under_node() from hotplug_memory_register(), and moves the call to link_mem_sections() out of the condition, so it will always be called. In this way we only have one place where the memory sections are registered.
Link: http://lkml.kernel.org/r/20180622111839.10071-3-osalvador@techadventures.net Signed-off-by: Oscar Salvador Reviewed-by: Pavel Tatashin Tested-by: Reza Arbab Tested-by: Jonathan Cameron Cc: Pasha Tatashin Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 2 -- mm/memory_hotplug.c | 32 +++++++++++--------------------- 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f5e560188a18..c8a1cb0b6136 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -736,8 +736,6 @@ int hotplug_memory_register(int nid, struct mem_section *section) mem->section_count++; } - if (mem->section_count == sections_per_block) - ret = register_mem_sect_under_node(mem, nid, false); out: mutex_unlock(&mem_sysfs_mutex); return ret; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 504ba120bdfc..e2ed64b994e5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1123,6 +1123,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) u64 start, size; bool new_node = false; int ret; + unsigned long start_pfn, nr_pages; start = res->start; size = resource_size(res); @@ -1151,34 +1152,23 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) if (ret < 0) goto error; - /* we online node here. we can't roll back from here. */ - node_set_online(nid); - if (new_node) { - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - - ret = __register_one_node(nid); - if (ret) - goto register_fail; - - /* - * link memory sections under this node. This is already - * done when creatig memory section in register_new_memory - * but that depends to have the node registered so offline - * nodes have to go through register_node. - * TODO clean up this mess. 
- */ - ret = link_mem_sections(nid, start_pfn, nr_pages, false); -register_fail: - /* - * If sysfs file of new node can't create, cpu on the node + /* If sysfs file of new node can't be created, cpu on the node * can't be hot-added. There is no rollback way now. * So, check by BUG_ON() to catch it reluctantly.. + * We online node here. We can't roll back from here. */ + node_set_online(nid); + ret = __register_one_node(nid); BUG_ON(ret); } + /* link memory sections under this node.*/ + start_pfn = start >> PAGE_SHIFT; + nr_pages = size >> PAGE_SHIFT; + ret = link_mem_sections(nid, start_pfn, nr_pages, false); + BUG_ON(ret); + /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); From 4fbce633910ed80b135b84160a22b219080c8082 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Fri, 17 Aug 2018 15:46:22 -0700 Subject: [PATCH 051/111] mm/memory_hotplug.c: make register_mem_sect_under_node() a callback of walk_memory_range() link_mem_sections() and walk_memory_range() share most of the code, so we can convert link_mem_sections() into a dummy function that calls walk_memory_range() with a callback to register_mem_sect_under_node(). This patch converts register_mem_sect_under_node() in order to match a walk_memory_range() callback, getting rid of the check_nid argument and checking instead if the system is still booting, since we only have to check for the nid if the system is in that state.
Link: http://lkml.kernel.org/r/20180622111839.10071-4-osalvador@techadventures.net Signed-off-by: Oscar Salvador Suggested-by: Pavel Tatashin Tested-by: Reza Arbab Tested-by: Jonathan Cameron Reviewed-by: Pavel Tatashin Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 44 ++++++-------------------------------------- include/linux/node.h | 12 +++++++----- mm/memory_hotplug.c | 5 +---- 3 files changed, 14 insertions(+), 47 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index a5e821d09656..845d5523812b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -399,10 +399,9 @@ static int __ref get_nid_for_pfn(unsigned long pfn) } /* register memory section under specified node if it spans that node */ -int register_mem_sect_under_node(struct memory_block *mem_blk, int nid, - bool check_nid) +int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg) { - int ret; + int ret, nid = *(int *)arg; unsigned long pfn, sect_start_pfn, sect_end_pfn; if (!mem_blk) @@ -433,7 +432,7 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid, * case, during hotplug we know that all pages in the memory * block belong to the same node. 
*/ - if (check_nid) { + if (system_state == SYSTEM_BOOTING) { page_nid = get_nid_for_pfn(pfn); if (page_nid < 0) continue; @@ -490,41 +489,10 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, return 0; } -int link_mem_sections(int nid, unsigned long start_pfn, unsigned long nr_pages, - bool check_nid) +int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long end_pfn = start_pfn + nr_pages; - unsigned long pfn; - struct memory_block *mem_blk = NULL; - int err = 0; - - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - unsigned long section_nr = pfn_to_section_nr(pfn); - struct mem_section *mem_sect; - int ret; - - if (!present_section_nr(section_nr)) - continue; - mem_sect = __nr_to_section(section_nr); - - /* same memblock ? */ - if (mem_blk) - if ((section_nr >= mem_blk->start_section_nr) && - (section_nr <= mem_blk->end_section_nr)) - continue; - - mem_blk = find_memory_block_hinted(mem_sect, mem_blk); - - ret = register_mem_sect_under_node(mem_blk, nid, check_nid); - if (!err) - err = ret; - - /* discard ref obtained in find_memory_block() */ - } - - if (mem_blk) - kobject_put(&mem_blk->dev.kobj); - return err; + return walk_memory_range(start_pfn, end_pfn, (void *)&nid, + register_mem_sect_under_node); } #ifdef CONFIG_HUGETLBFS diff --git a/include/linux/node.h b/include/linux/node.h index 6d336e38d155..257bb3d6d014 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -33,10 +33,10 @@ typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) extern int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long nr_pages, bool check_nid); + unsigned long end_pfn); #else static inline int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long nr_pages, bool check_nid) + unsigned long end_pfn) { return 0; } @@ -54,12 +54,14 @@ static inline int register_one_node(int nid) if (node_online(nid)) { 
struct pglist_data *pgdat = NODE_DATA(nid); + unsigned long start_pfn = pgdat->node_start_pfn; + unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; error = __register_one_node(nid); if (error) return error; /* link memory sections under this node */ - error = link_mem_sections(nid, pgdat->node_start_pfn, pgdat->node_spanned_pages, true); + error = link_mem_sections(nid, start_pfn, end_pfn); } return error; @@ -69,7 +71,7 @@ extern void unregister_one_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); extern int register_mem_sect_under_node(struct memory_block *mem_blk, - int nid, bool check_nid); + void *arg); extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, unsigned long phys_index); @@ -99,7 +101,7 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) return 0; } static inline int register_mem_sect_under_node(struct memory_block *mem_blk, - int nid, bool check_nid) + void *arg) { return 0; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e2ed64b994e5..4eb6e824a80c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1123,7 +1123,6 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) u64 start, size; bool new_node = false; int ret; - unsigned long start_pfn, nr_pages; start = res->start; size = resource_size(res); @@ -1164,9 +1163,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) } /* link memory sections under this node.*/ - start_pfn = start >> PAGE_SHIFT; - nr_pages = size >> PAGE_SHIFT; - ret = link_mem_sections(nid, start_pfn, nr_pages, false); + ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1)); BUG_ON(ret); /* create new memmap entry */ From 3172e5e61c8a78f690c50f221fdeedce35d0b1e4 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Fri, 17 Aug 2018 15:46:25 -0700 Subject: [PATCH 052/111] 
mm/memory_hotplug.c: drop unnecessary checks from register_mem_sect_under_node() Callers of register_mem_sect_under_node() are always passing a valid memory_block (not NULL), so we can safely drop the check for NULL. In the same way, register_mem_sect_under_node() is only called in case the node is online, so we can safely remove that check as well. Link: http://lkml.kernel.org/r/20180622111839.10071-5-osalvador@techadventures.net Signed-off-by: Oscar Salvador Reviewed-by: Pavel Tatashin Tested-by: Reza Arbab Tested-by: Jonathan Cameron Cc: Pasha Tatashin Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 845d5523812b..1ac4c36e13bb 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -404,12 +404,7 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg) int ret, nid = *(int *)arg; unsigned long pfn, sect_start_pfn, sect_end_pfn; - if (!mem_blk) - return -EFAULT; - mem_blk->nid = nid; - if (!node_online(nid)) - return 0; sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); From a3266bd49c721e2e0a71f352d83713fbd60caadb Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 17 Aug 2018 15:46:29 -0700 Subject: [PATCH 053/111] mm: provide a fallback for PAGE_KERNEL_RO for architectures Some architectures do not define certain PAGE_KERNEL_* flags, this is either because: a) The way to implement some of these flags is *not yet ported*, or b) The architecture *has no way* to describe them Over time we have accumulated a few PAGE_KERNEL_* fallback workarounds for architectures in the kernel which do not define them using *relatively safe* equivalents. Move these scattered fallback hacks into asm-generic. We start off with PAGE_KERNEL_RO using PAGE_KERNEL as a fallback. 
This has been in place on the firmware loader for years. Move the fallback into the respective asm-generic header. Link: http://lkml.kernel.org/r/20180510185507.2439-2-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Reviewed-by: Andrew Morton Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Matthew Wilcox Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/firmware_loader/fallback.c | 5 ----- include/asm-generic/pgtable.h | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/base/firmware_loader/fallback.c b/drivers/base/firmware_loader/fallback.c index 202324291542..b5c865fe263b 100644 --- a/drivers/base/firmware_loader/fallback.c +++ b/drivers/base/firmware_loader/fallback.c @@ -219,11 +219,6 @@ static ssize_t firmware_loading_show(struct device *dev, return sprintf(buf, "%d\n", loading); } -/* Some architectures don't have PAGE_KERNEL_RO */ -#ifndef PAGE_KERNEL_RO -#define PAGE_KERNEL_RO PAGE_KERNEL -#endif - /* one pages buffer should be mapped/unmapped only once */ static int map_fw_priv_pages(struct fw_priv *fw_priv) { diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index a75cb371cd19..62bbd6f23c35 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1095,6 +1095,20 @@ static inline bool arch_has_pfn_modify_check(void) } #endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */ +/* + * Architecture PAGE_KERNEL_* fallbacks + * + * Some architectures don't define certain PAGE_KERNEL_* flags. This is either + * because they really don't support them, or the port needs to be updated to + * reflect the required functionality. Below are a set of relatively safe + * fallbacks, as best effort, which we can count on in lieu of the architectures + * not defining them on their own yet. 
+ */ + +#ifndef PAGE_KERNEL_RO +# define PAGE_KERNEL_RO PAGE_KERNEL +#endif + #endif /* !__ASSEMBLY__ */ #ifndef io_remap_pfn_range From 1a9b4b3d75679fbe8c3bb8fb7e957ea693b6a89c Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 17 Aug 2018 15:46:32 -0700 Subject: [PATCH 054/111] mm: provide a fallback for PAGE_KERNEL_EXEC for architectures Some architectures just don't have PAGE_KERNEL_EXEC. The mm/nommu.c and mm/vmalloc.c code have been using PAGE_KERNEL as a fallback for years. Move this fallback to asm-generic. Link: http://lkml.kernel.org/r/20180510185507.2439-3-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Suggested-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 4 ++++ mm/nommu.c | 4 ---- mm/vmalloc.c | 4 ---- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 62bbd6f23c35..88ebc6102c7c 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1109,6 +1109,10 @@ static inline bool arch_has_pfn_modify_check(void) # define PAGE_KERNEL_RO PAGE_KERNEL #endif +#ifndef PAGE_KERNEL_EXEC +# define PAGE_KERNEL_EXEC PAGE_KERNEL +#endif + #endif /* !__ASSEMBLY__ */ #ifndef io_remap_pfn_range diff --git a/mm/nommu.c b/mm/nommu.c index 9fc9e43335b6..e4aac33216ae 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -364,10 +364,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -#ifndef PAGE_KERNEL_EXEC -# define PAGE_KERNEL_EXEC PAGE_KERNEL -#endif - /** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cfea25be7754..a728fc492557 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1907,10 +1907,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -#ifndef 
PAGE_KERNEL_EXEC -# define PAGE_KERNEL_EXEC PAGE_KERNEL -#endif - /** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size From dc0b58643aff8b378086f25cce6789ccba68cbcb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 17 Aug 2018 15:46:36 -0700 Subject: [PATCH 055/111] mm: introduce mem_cgroup_put() helper Introduce the mem_cgroup_put() helper, which helps to eliminate guarding memcg css release with "#ifdef CONFIG_MEMCG" in multiple places. Link: http://lkml.kernel.org/r/20180623000600.5818-2-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Reviewed-by: Andrew Morton Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 680d3395fc83..42f4719def32 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -378,6 +378,11 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; } +static inline void mem_cgroup_put(struct mem_cgroup *memcg) +{ + css_put(&memcg->css); +} + #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -850,6 +855,10 @@ static inline bool task_in_mem_cgroup(struct task_struct *task, return true; } +static inline void mem_cgroup_put(struct mem_cgroup *memcg) +{ +} + static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, From d46eb14b735b11927d4bdc2d1854c311af19de6d Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 17 Aug 2018 15:46:39 -0700 Subject: [PATCH 056/111] fs: fsnotify: account fsnotify metadata to kmemcg Patch series "Directed kmem charging", v8. 
The Linux kernel's memory cgroup allows limiting the memory usage of the jobs running on the system to provide isolation between the jobs. All the kernel memory allocated in the context of the job and marked with __GFP_ACCOUNT will also be included in the memory usage and be limited by the job's limit. The kernel memory can only be charged to the memcg of the process in whose context kernel memory was allocated. However there are cases where the allocated kernel memory should be charged to a memcg different from the current process's memcg. This patch series contains two such concrete use-cases i.e. fsnotify and buffer_head. The fsnotify event objects can consume a lot of system memory for large or unlimited queues if there is either no listener or a slow listener. The events are allocated in the context of the event producer. However they should be charged to the event consumer. Similarly the buffer_head objects can be allocated in a memcg different from the memcg of the page for which buffer_head objects are being allocated. To solve this issue, this patch series introduces a mechanism to charge kernel memory to a given memcg. In case of fsnotify events, the memcg of the consumer can be used for charging and for buffer_head, the memcg of the page can be charged. For directed charging, the caller can use the scope API memalloc_[un]use_memcg() to specify the memcg to charge for all the __GFP_ACCOUNT allocations within the scope. This patch (of 2): A lot of memory can be consumed by the events generated for the huge or unlimited queues if there is either no listener or a slow listener. This can cause system level memory pressure or OOMs. So, it's better to account the fsnotify kmem caches to the memcg of the listener. However the listener can be in a different memcg than the memcg of the producer and these allocations happen in the context of the event producer.
This patch introduces a remote memcg charging API which the producer can use to charge the allocations to the memcg of the listener. There are seven fsnotify kmem caches and among them allocations from dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and inotify_inode_mark_cachep happen in the context of a syscall from the listener. So, SLAB_ACCOUNT is enough for these caches. The objects from fsnotify_mark_connector_cachep are not accounted as they are small compared to the notification mark or events and it is unclear whom to account the connector to since it is shared by all events attached to the inode. The allocations from the event caches happen in the context of the event producer. For such caches we will need to remote charge the allocations to the listener's memcg. Thus we save the memcg reference in the fsnotify_group structure of the listener. This patch has also moved the members of fsnotify_group to keep the size the same, at least for 64-bit builds, even with the additional member, by filling the holes.
[shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it] Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Jan Kara Cc: Amir Goldstein Cc: Greg Thelen Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 5 ++-- fs/notify/fanotify/fanotify.c | 14 ++++++++--- fs/notify/fanotify/fanotify_user.c | 5 +++- fs/notify/group.c | 3 +++ fs/notify/inotify/inotify_fsnotify.c | 7 +++++- fs/notify/inotify/inotify_user.c | 5 +++- include/linux/fsnotify_backend.h | 12 ++++++--- include/linux/memcontrol.h | 10 +++++++- include/linux/sched.h | 3 +++ include/linux/sched/mm.h | 37 ++++++++++++++++++++++++++++ kernel/fork.c | 3 +++ mm/memcontrol.c | 37 +++++++++++++++++++++++++--- 12 files changed, 123 insertions(+), 18 deletions(-) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index e2bea2ac5dfb..a6365e6bc047 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -384,8 +384,9 @@ out_err: static int __init dnotify_init(void) { - dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); - dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC); + dnotify_struct_cache = KMEM_CACHE(dnotify_struct, + SLAB_PANIC|SLAB_ACCOUNT); + dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops); if (IS_ERR(dnotify_group)) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index f90842efea13..eb4e75175cfb 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "fanotify.h" @@ -140,8 +141,8 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, struct 
inode *inode, u32 mask, const struct path *path) { - struct fanotify_event_info *event; - gfp_t gfp = GFP_KERNEL; + struct fanotify_event_info *event = NULL; + gfp_t gfp = GFP_KERNEL_ACCOUNT; /* * For queues with unlimited length lost events are not expected and @@ -151,19 +152,22 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, if (group->max_events == UINT_MAX) gfp |= __GFP_NOFAIL; + /* Whoever is interested in the event, pays for the allocation. */ + memalloc_use_memcg(group->memcg); + if (fanotify_is_perm_event(mask)) { struct fanotify_perm_event_info *pevent; pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); if (!pevent) - return NULL; + goto out; event = &pevent->fae; pevent->response = 0; goto init; } event = kmem_cache_alloc(fanotify_event_cachep, gfp); if (!event) - return NULL; + goto out; init: __maybe_unused fsnotify_init_event(&event->fse, inode, mask); event->tgid = get_pid(task_tgid(current)); @@ -174,6 +178,8 @@ init: __maybe_unused event->path.mnt = NULL; event->path.dentry = NULL; } +out: + memalloc_unuse_memcg(); return event; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index ec4d8c59d0e3..0cf45041dc32 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -756,6 +757,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.user = user; atomic_inc(&user->fanotify_listeners); + group->memcg = get_mem_cgroup_from_mm(current->mm); oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL); if (unlikely(!oevent)) { @@ -957,7 +959,8 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { - fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); + fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, + SLAB_PANIC|SLAB_ACCOUNT); fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, 
SLAB_PANIC); if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { fanotify_perm_event_cachep = diff --git a/fs/notify/group.c b/fs/notify/group.c index aa5468f23e45..c03b83662876 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "fsnotify.h" @@ -36,6 +37,8 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) if (group->ops->free_group_priv) group->ops->free_group_priv(group); + mem_cgroup_put(group->memcg); + kfree(group); } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 9ab6dde38a14..f4184b4f3815 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "inotify.h" @@ -98,7 +99,11 @@ int inotify_handle_event(struct fsnotify_group *group, i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); - event = kmalloc(alloc_len, GFP_KERNEL); + /* Whoever is interested in the event, pays for the allocation. 
*/ + memalloc_use_memcg(group->memcg); + event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT); + memalloc_unuse_memcg(); + if (unlikely(!event)) { /* * Treat lost event due to ENOMEM the same way as queue diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1cf5b779d862..749c46ababa0 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "inotify.h" #include "../fdinfo.h" @@ -636,6 +637,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) oevent->name_len = 0; group->max_events = max_events; + group->memcg = get_mem_cgroup_from_mm(current->mm); spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); @@ -808,7 +810,8 @@ static int __init inotify_user_setup(void) BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); - inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); + inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, + SLAB_PANIC|SLAB_ACCOUNT); inotify_max_queued_events = 16384; init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index b38964a7a521..a0c4790c5302 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -84,6 +84,8 @@ struct fsnotify_event_private_data; struct fsnotify_fname; struct fsnotify_iter_info; +struct mem_cgroup; + /* * Each group much define these ops. The fsnotify infrastructure will call * these operations for each relevant group. @@ -127,6 +129,8 @@ struct fsnotify_event { * everything will be cleaned up. */ struct fsnotify_group { + const struct fsnotify_ops *ops; /* how this group handles things */ + /* * How the refcnt is used is up to each group. When the refcnt hits 0 * fsnotify will clean up all of the resources associated with this group. 
@@ -137,8 +141,6 @@ struct fsnotify_group { */ refcount_t refcnt; /* things with interest in this group */ - const struct fsnotify_ops *ops; /* how this group handles things */ - /* needed to send notification to userspace */ spinlock_t notification_lock; /* protect the notification_list */ struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ @@ -160,6 +162,8 @@ struct fsnotify_group { atomic_t num_marks; /* 1 for each mark and 1 for not being * past the point of no return when freeing * a group */ + atomic_t user_waits; /* Number of tasks waiting for user + * response */ struct list_head marks_list; /* all inode marks for this group */ struct fasync_struct *fsn_fa; /* async notification */ @@ -167,8 +171,8 @@ struct fsnotify_group { struct fsnotify_event *overflow_event; /* Event we queue when the * notification list is too * full */ - atomic_t user_waits; /* Number of tasks waiting for user - * response */ + + struct mem_cgroup *memcg; /* memcg to charge allocations */ /* groups can define private fields here or use the void *private */ union { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 42f4719def32..121e218d2a21 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -373,6 +373,8 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? 
container_of(css, struct mem_cgroup, css) : NULL; @@ -380,7 +382,8 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ static inline void mem_cgroup_put(struct mem_cgroup *memcg) { - css_put(&memcg->css); + if (memcg) + css_put(&memcg->css); } #define mem_cgroup_from_counter(counter, member) \ @@ -855,6 +858,11 @@ static inline bool task_in_mem_cgroup(struct task_struct *task, return true; } +static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) +{ + return NULL; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index 95a5018c338e..1827f4a7a6de 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1152,6 +1152,9 @@ struct task_struct { /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; + + /* Used by memcontrol for targeted memcg charge: */ + struct mem_cgroup *active_memcg; #endif #ifdef CONFIG_BLK_CGROUP diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 44d356f5e47c..aebb370a0006 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -248,6 +248,43 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC) | flags; } +#ifdef CONFIG_MEMCG +/** + * memalloc_use_memcg - Starts the remote memcg charging scope. + * @memcg: memcg to charge. + * + * This function marks the beginning of the remote memcg charging scope. All the + * __GFP_ACCOUNT allocations till the end of the scope will be charged to the + * given memcg. + * + * NOTE: This function is not nesting safe. + */ +static inline void memalloc_use_memcg(struct mem_cgroup *memcg) +{ + WARN_ON_ONCE(current->active_memcg); + current->active_memcg = memcg; +} + +/** + * memalloc_unuse_memcg - Ends the remote memcg charging scope. + * + * This function marks the end of the remote memcg charging scope started by + * memalloc_use_memcg(). 
+ */ +static inline void memalloc_unuse_memcg(void) +{ + current->active_memcg = NULL; +} +#else +static inline void memalloc_use_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void memalloc_unuse_memcg(void) +{ +} +#endif + #ifdef CONFIG_MEMBARRIER enum { MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0), diff --git a/kernel/fork.c b/kernel/fork.c index 33112315b5c0..5ee74c113381 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -871,6 +871,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->use_memdelay = 0; #endif +#ifdef CONFIG_MEMCG + tsk->active_memcg = NULL; +#endif return tsk; free_stack: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b836e7f00309..bf9cf738c836 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -678,9 +678,20 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) } EXPORT_SYMBOL(mem_cgroup_from_task); -static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) +/** + * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg. + * @mm: mm from which memcg should be extracted. It can be NULL. + * + * Obtain a reference on mm->memcg and returns it if successful. Otherwise + * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is + * returned. + */ +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { - struct mem_cgroup *memcg = NULL; + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return NULL; rcu_read_lock(); do { @@ -700,6 +711,24 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) rcu_read_unlock(); return memcg; } +EXPORT_SYMBOL(get_mem_cgroup_from_mm); + +/** + * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg. 
+ */ +static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) +{ + if (unlikely(current->active_memcg)) { + struct mem_cgroup *memcg = root_mem_cgroup; + + rcu_read_lock(); + if (css_tryget_online(&current->active_memcg->css)) + memcg = current->active_memcg; + rcu_read_unlock(); + return memcg; + } + return get_mem_cgroup_from_mm(current->mm); +} /** * mem_cgroup_iter - iterate over memory cgroup hierarchy @@ -2261,7 +2290,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) if (current->memcg_kmem_skip_account) return cachep; - memcg = get_mem_cgroup_from_mm(current->mm); + memcg = get_mem_cgroup_from_current(); kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) goto out; @@ -2345,7 +2374,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) if (memcg_kmem_bypass()) return 0; - memcg = get_mem_cgroup_from_mm(current->mm); + memcg = get_mem_cgroup_from_current(); if (!mem_cgroup_is_root(memcg)) { ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); if (!ret) From f745c6f5fe75734f3b35d9d4e6ebe2a7d010ddda Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 17 Aug 2018 15:46:44 -0700 Subject: [PATCH 057/111] fs, mm: account buffer_head to kmemcg The buffer_head can consume a significant amount of system memory and is directly related to the amount of page cache. In our production environment we have observed that a lot of machines are spending a significant amount of memory as buffer_head and can not be left as system memory overhead. Charging buffer_head is not as simple as adding __GFP_ACCOUNT to the allocation. The buffer_heads can be allocated in a memcg different from the memcg of the page for which buffer_heads are being allocated. One concrete example is memory reclaim. The reclaim can trigger I/O of pages of any memcg on the system. So, the right way to charge buffer_head is to extract the memcg from the page for which buffer_heads are being allocated and then use targeted memcg charging API.
[shakeelb@google.com: use __GFP_ACCOUNT for directed memcg charging] Link: http://lkml.kernel.org/r/20180702220208.213380-1-shakeelb@google.com Link: http://lkml.kernel.org/r/20180627191250.209150-3-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Jan Kara Cc: Amir Goldstein Cc: Greg Thelen Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 12 ++++++++++-- include/linux/memcontrol.h | 7 +++++++ mm/memcontrol.c | 22 ++++++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index c8c2b7d8b8d6..4cc679d5bf58 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -45,6 +45,7 @@ #include #include #include +#include #include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -813,12 +814,16 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry) { struct buffer_head *bh, *head; - gfp_t gfp = GFP_NOFS; + gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; long offset; + struct mem_cgroup *memcg; if (retry) gfp |= __GFP_NOFAIL; + memcg = get_mem_cgroup_from_page(page); + memalloc_use_memcg(memcg); + head = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { @@ -835,6 +840,9 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, /* Link the buffer to its page */ set_bh_page(bh, page, offset); } +out: + memalloc_unuse_memcg(); + mem_cgroup_put(memcg); return head; /* * In case anything failed, we just free everything we got. 
@@ -848,7 +856,7 @@ no_grow: } while (head); } - return NULL; + goto out; } EXPORT_SYMBOL_GPL(alloc_page_buffers); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 121e218d2a21..50e3e807b427 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -375,6 +375,8 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); +struct mem_cgroup *get_mem_cgroup_from_page(struct page *page); + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; @@ -863,6 +865,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) return NULL; } +static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) +{ + return NULL; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bf9cf738c836..c071af193986 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -713,6 +713,28 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) } EXPORT_SYMBOL(get_mem_cgroup_from_mm); +/** + * get_mem_cgroup_from_page: Obtain a reference on given page's memcg. + * @page: page from which memcg should be extracted. + * + * Obtain a reference on page->memcg and returns it if successful. Otherwise + * root_mem_cgroup is returned. + */ +struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) +{ + struct mem_cgroup *memcg = page->mem_cgroup; + + if (mem_cgroup_disabled()) + return NULL; + + rcu_read_lock(); + if (!memcg || !css_tryget_online(&memcg->css)) + memcg = root_mem_cgroup; + rcu_read_unlock(); + return memcg; +} +EXPORT_SYMBOL(get_mem_cgroup_from_page); + /** * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg. 
*/ From dcfe4df3d57f08f7bf4acdd36c89763fe188cf3c Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 17 Aug 2018 15:46:47 -0700 Subject: [PATCH 058/111] mm/page-writeback.c: update stale account_page_redirty() comment Commit 93f78d882865 ("writeback: move backing_dev_info->bdi_stat[] into bdi_writeback") replaced BDI_DIRTIED with WB_DIRTIED in account_page_redirty(). Update comment to track that change. BDI_DIRTIED => WB_DIRTIED BDI_WRITTEN => WB_WRITTEN Link: http://lkml.kernel.org/r/20180625171526.173483-1-gthelen@google.com Signed-off-by: Greg Thelen Reviewed-by: Jan Kara Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 337c6afb3345..6551d3b0dc30 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2490,8 +2490,8 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers); /* * Call this whenever redirtying a page, to de-account the dirty counters - * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written - * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to + * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written + * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to * systematic errors in balanced_dirty_ratelimit and the dirty pages position * control. */ From 4d0a5402f505eafe5b0a77f2dc77bb6c2e25a714 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 17 Aug 2018 15:46:50 -0700 Subject: [PATCH 059/111] mm/zsmalloc.c: make several functions and a struct static The functions zs_page_isolate, zs_page_migrate, zs_page_putback, lock_zspage, trylock_zspage and structure zsmalloc_aops are local to source and do not need to be in global scope, so make them static. Cleans up sparse warnings: symbol 'zs_page_isolate' was not declared. Should it be static? symbol 'zs_page_migrate' was not declared. 
Should it be static? symbol 'zs_page_putback' was not declared. Should it be static? symbol 'zsmalloc_aops' was not declared. Should it be static? symbol 'lock_zspage' was not declared. Should it be static? symbol 'trylock_zspage' was not declared. Should it be static? [arnd@arndb.de: hide unused lock_zspage] Link: http://lkml.kernel.org/r/20180706130924.3891230-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20180624213322.13776-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Sergey Senozhatsky Reviewed-by: Andrew Morton Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 8d87e973a4f5..9da65552e7ca 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -924,20 +924,7 @@ static void reset_page(struct page *page) page->freelist = NULL; } -/* - * To prevent zspage destroy during migration, zspage freeing should - * hold locks of all pages in the zspage. - */ -void lock_zspage(struct zspage *zspage) -{ - struct page *page = get_first_page(zspage); - - do { - lock_page(page); - } while ((page = get_next_page(page)) != NULL); -} - -int trylock_zspage(struct zspage *zspage) +static int trylock_zspage(struct zspage *zspage) { struct page *cursor, *fail; @@ -1814,6 +1801,19 @@ static enum fullness_group putback_zspage(struct size_class *class, } #ifdef CONFIG_COMPACTION +/* + * To prevent zspage destroy during migration, zspage freeing should + * hold locks of all pages in the zspage. 
+ */ +static void lock_zspage(struct zspage *zspage) +{ + struct page *page = get_first_page(zspage); + + do { + lock_page(page); + } while ((page = get_next_page(page)) != NULL); +} + static struct dentry *zs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -1905,7 +1905,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, __SetPageMovable(newpage, page_mapping(oldpage)); } -bool zs_page_isolate(struct page *page, isolate_mode_t mode) +static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { struct zs_pool *pool; struct size_class *class; @@ -1960,7 +1960,7 @@ bool zs_page_isolate(struct page *page, isolate_mode_t mode) return true; } -int zs_page_migrate(struct address_space *mapping, struct page *newpage, +static int zs_page_migrate(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { struct zs_pool *pool; @@ -2076,7 +2076,7 @@ unpin_objects: return ret; } -void zs_page_putback(struct page *page) +static void zs_page_putback(struct page *page) { struct zs_pool *pool; struct size_class *class; @@ -2108,7 +2108,7 @@ void zs_page_putback(struct page *page) spin_unlock(&class->lock); } -const struct address_space_operations zsmalloc_aops = { +static const struct address_space_operations zsmalloc_aops = { .isolate_page = zs_page_isolate, .migratepage = zs_page_migrate, .putback_page = zs_page_putback, From 31f21da18132fc971297175077fafa3bd4184dc1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 17 Aug 2018 15:46:54 -0700 Subject: [PATCH 060/111] mm/swap_slots.c: make swap_slots_cache_mutex and swap_slots_cache_enable_mutex static The mutexes swap_slots_cache_mutex and swap_slots_cache_enable_mutex are local to the source and do not need to be in global scope, so make them static. Cleans up sparse warnings: symbol 'swap_slots_cache_mutex' was not declared. Should it be static? symbol 'swap_slots_cache_enable_mutex' was not declared. 
Should it be static? Link: http://lkml.kernel.org/r/20180624182536.4937-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_slots.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/swap_slots.c b/mm/swap_slots.c index a791411fed71..008ccb22fee6 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -38,9 +38,9 @@ static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); static bool swap_slot_cache_active; bool swap_slot_cache_enabled; static bool swap_slot_cache_initialized; -DEFINE_MUTEX(swap_slots_cache_mutex); +static DEFINE_MUTEX(swap_slots_cache_mutex); /* Serialize swap slots cache enable/disable operations */ -DEFINE_MUTEX(swap_slots_cache_enable_mutex); +static DEFINE_MUTEX(swap_slots_cache_enable_mutex); static void __drain_swap_slots_cache(unsigned int type); static void deactivate_swap_slots_cache(void); From a718e28f538441a3b6612da9ff226973376cdf0f Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 17 Aug 2018 15:46:57 -0700 Subject: [PATCH 061/111] mm/fadvise.c: fix signed overflow UBSAN complaint Signed integer overflow is undefined according to the C standard. The overflow in ksys_fadvise64_64() is deliberate, but since it is signed overflow, UBSAN complains: UBSAN: Undefined behaviour in mm/fadvise.c:76:10 signed integer overflow: 4 + 9223372036854775805 cannot be represented in type 'long long int' Use unsigned types to do math. Unsigned overflow is defined so UBSAN will not complain about it. This patch doesn't change generated code. 
[akpm@linux-foundation.org: add comment explaining the casts] Link: http://lkml.kernel.org/r/20180629184453.7614-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Reported-by: Reviewed-by: Andrew Morton Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/fadvise.c b/mm/fadvise.c index afa41491d324..2d8376e3c640 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -72,8 +72,12 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) goto out; } - /* Careful about overflows. Len == 0 means "as much as possible" */ - endbyte = offset + len; + /* + * Careful about overflows. Len == 0 means "as much as possible". Use + * unsigned math because signed overflows are undefined and UBSan + * complains. + */ + endbyte = (u64)offset + (u64)len; if (!len || endbyte < len) endbyte = -1; else From 50f8b92f21d23789bd4ada593e8ddc56cc4f79fe Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 17 Aug 2018 15:47:00 -0700 Subject: [PATCH 062/111] mm: thp: pass correct vm_flags to hugepage_vma_check() khugepaged_enter_vma_merge() passes a stale vma->vm_flags to hugepage_vma_check(). The argument vm_flags contains the latest value. Therefore, it is necessary to pass this vm_flags into hugepage_vma_check(). With this bug, madvise(MADV_HUGEPAGE) for mmap files in shmem fails to put memory in huge pages. Here is an example of failed madvise(): /* mount /dev/shm with huge=advise: * mount -o remount,huge=advise /dev/shm */ /* create file /dev/shm/huge */ #define HUGE_FILE "/dev/shm/huge" fd = open(HUGE_FILE, O_RDONLY); ptr = mmap(NULL, FILE_SIZE, PROT_READ, MAP_PRIVATE, fd, 0); ret = madvise(ptr, FILE_SIZE, MADV_HUGEPAGE); madvise() will return 0, but this memory region is never put in huge page (check from /proc/meminfo: ShmemHugePages). 
Link: http://lkml.kernel.org/r/20180629181752.792831-1-songliubraving@fb.com Fixes: 02b75dc8160d ("mm: thp: register mm for khugepaged when merging vma for shmem") Signed-off-by: Song Liu Reviewed-by: Rik van Riel Reviewed-by: Yang Shi Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 79d55e10bca9..961cbe9062a5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -397,10 +397,11 @@ static inline int khugepaged_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } -static bool hugepage_vma_check(struct vm_area_struct *vma) +static bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags) { - if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE) || + if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; if (shmem_file(vma->vm_file)) { @@ -413,7 +414,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) return false; if (is_vma_temporary_stack(vma)) return false; - return !(vma->vm_flags & VM_NO_KHUGEPAGED); + return !(vm_flags & VM_NO_KHUGEPAGED); } int __khugepaged_enter(struct mm_struct *mm) @@ -458,7 +459,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, * khugepaged does not yet work on non-shmem files or special * mappings. And file-private shmem THP is not supported. 
*/ - if (!hugepage_vma_check(vma)) + if (!hugepage_vma_check(vma, vm_flags)) return 0; hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; @@ -861,7 +862,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, hend = vma->vm_end & HPAGE_PMD_MASK; if (address < hstart || address + HPAGE_PMD_SIZE > hend) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma)) + if (!hugepage_vma_check(vma, vma->vm_flags)) return SCAN_VMA_CHECK; return 0; } @@ -1695,7 +1696,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; break; } - if (!hugepage_vma_check(vma)) { + if (!hugepage_vma_check(vma, vma->vm_flags)) { skip: progress++; continue; From 0207df4fa1a869281ddbf72db6203dbf036b3e1a Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 17 Aug 2018 15:47:04 -0700 Subject: [PATCH 063/111] kernel/memremap, kasan: make ZONE_DEVICE with work with KASAN KASAN learns about hotadded memory via the memory hotplug notifier. devm_memremap_pages() intentionally skips calling memory hotplug notifiers. So KASAN doesn't know anything about new memory added by devm_memremap_pages(). 
This causes a crash when KASAN tries to access non-existent shadow memory: BUG: unable to handle kernel paging request at ffffed0078000000 RIP: 0010:check_memory_region+0x82/0x1e0 Call Trace: memcpy+0x1f/0x50 pmem_do_bvec+0x163/0x720 pmem_make_request+0x305/0xac0 generic_make_request+0x54f/0xcf0 submit_bio+0x9c/0x370 submit_bh_wbc+0x4c7/0x700 block_read_full_page+0x5ef/0x870 do_read_cache_page+0x2b8/0xb30 read_dev_sector+0xbd/0x3f0 read_lba.isra.0+0x277/0x670 efi_partition+0x41a/0x18f0 check_partition+0x30d/0x5e9 rescan_partitions+0x18c/0x840 __blkdev_get+0x859/0x1060 blkdev_get+0x23f/0x810 __device_add_disk+0x9c8/0xde0 pmem_attach_disk+0x9a8/0xf50 nvdimm_bus_probe+0xf3/0x3c0 driver_probe_device+0x493/0xbd0 bus_for_each_drv+0x118/0x1b0 __device_attach+0x1cd/0x2b0 bus_probe_device+0x1ac/0x260 device_add+0x90d/0x1380 nd_async_device_register+0xe/0x50 async_run_entry_fn+0xc3/0x5d0 process_one_work+0xa0a/0x1810 worker_thread+0x87/0xe80 kthread+0x2d7/0x390 ret_from_fork+0x3a/0x50 Add kasan_add_zero_shadow()/kasan_remove_zero_shadow() - post mm_init() interface to map/unmap kasan_zero_page at requested virtual addresses. And use it to add/remove the shadow memory for hotplugged/unplugged device memory. 
Link: http://lkml.kernel.org/r/20180629164932.740-1-aryabinin@virtuozzo.com Fixes: 41e94a851304 ("add devm_memremap_pages") Signed-off-by: Andrey Ryabinin Reported-by: Dave Chinner Reviewed-by: Dan Williams Tested-by: Dan Williams Cc: Dmitry Vyukov Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 13 +- kernel/memremap.c | 10 ++ mm/kasan/kasan_init.c | 316 ++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 325 insertions(+), 14 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index de784fd11d12..46aae129917c 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -20,7 +20,7 @@ extern pmd_t kasan_zero_pmd[PTRS_PER_PMD]; extern pud_t kasan_zero_pud[PTRS_PER_PUD]; extern p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D]; -void kasan_populate_zero_shadow(const void *shadow_start, +int kasan_populate_zero_shadow(const void *shadow_start, const void *shadow_end); static inline void *kasan_mem_to_shadow(const void *addr) @@ -71,6 +71,9 @@ struct kasan_cache { int kasan_module_alloc(void *addr, size_t size); void kasan_free_shadow(const struct vm_struct *vm); +int kasan_add_zero_shadow(void *start, unsigned long size); +void kasan_remove_zero_shadow(void *start, unsigned long size); + size_t ksize(const void *); static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); } size_t kasan_metadata_size(struct kmem_cache *cache); @@ -124,6 +127,14 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} +static inline int kasan_add_zero_shadow(void *start, unsigned long size) +{ + return 0; +} +static inline void kasan_remove_zero_shadow(void *start, + unsigned long size) +{} + static inline void kasan_unpoison_slab(const void *ptr) { } static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } 
diff --git a/kernel/memremap.c b/kernel/memremap.c index 38283363da06..1f87ea6b6545 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -137,6 +138,7 @@ static void devm_memremap_pages_release(void *data) mem_hotplug_begin(); arch_remove_memory(align_start, align_size, pgmap->altmap_valid ? &pgmap->altmap : NULL); + kasan_remove_zero_shadow(__va(align_start), align_size); mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); @@ -239,6 +241,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_pfn_remap; mem_hotplug_begin(); + error = kasan_add_zero_shadow(__va(align_start), align_size); + if (error) { + mem_hotplug_done(); + goto err_kasan; + } + error = arch_add_memory(nid, align_start, align_size, altmap, false); if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], @@ -267,6 +275,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) return __va(res->start); err_add_memory: + kasan_remove_zero_shadow(__va(align_start), align_size); + err_kasan: untrack_pfn(NULL, PHYS_PFN(align_start), align_size); err_pfn_remap: err_radix: diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index f436246ccc79..7a2a2f13f86f 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -17,10 +17,13 @@ #include #include #include +#include #include #include +#include "kasan.h" + /* * This page serves two purposes: * - It used as early shadow memory. 
The entire shadow region populated @@ -32,22 +35,59 @@ unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; #if CONFIG_PGTABLE_LEVELS > 4 p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss; +static inline bool kasan_p4d_table(pgd_t pgd) +{ + return pgd_page(pgd) == virt_to_page(lm_alias(kasan_zero_p4d)); +} +#else +static inline bool kasan_p4d_table(pgd_t pgd) +{ + return 0; +} #endif #if CONFIG_PGTABLE_LEVELS > 3 pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; +static inline bool kasan_pud_table(p4d_t p4d) +{ + return p4d_page(p4d) == virt_to_page(lm_alias(kasan_zero_pud)); +} +#else +static inline bool kasan_pud_table(p4d_t p4d) +{ + return 0; +} #endif #if CONFIG_PGTABLE_LEVELS > 2 pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; +static inline bool kasan_pmd_table(pud_t pud) +{ + return pud_page(pud) == virt_to_page(lm_alias(kasan_zero_pmd)); +} +#else +static inline bool kasan_pmd_table(pud_t pud) +{ + return 0; +} #endif pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; +static inline bool kasan_pte_table(pmd_t pmd) +{ + return pmd_page(pmd) == virt_to_page(lm_alias(kasan_zero_pte)); +} + +static inline bool kasan_zero_page_entry(pte_t pte) +{ + return pte_page(pte) == virt_to_page(lm_alias(kasan_zero_page)); +} + static __init void *early_alloc(size_t size, int node) { return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, node); } -static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, +static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end) { pte_t *pte = pte_offset_kernel(pmd, addr); @@ -63,7 +103,7 @@ static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, } } -static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, +static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end) { pmd_t *pmd = pmd_offset(pud, addr); @@ -78,14 +118,24 @@ static void __init zero_pmd_populate(pud_t *pud, 
unsigned long addr, } if (pmd_none(*pmd)) { - pmd_populate_kernel(&init_mm, pmd, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + pte_t *p; + + if (slab_is_available()) + p = pte_alloc_one_kernel(&init_mm, addr); + else + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + + pmd_populate_kernel(&init_mm, pmd, p); } zero_pte_populate(pmd, addr, next); } while (pmd++, addr = next, addr != end); + + return 0; } -static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr, +static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end) { pud_t *pud = pud_offset(p4d, addr); @@ -103,14 +153,24 @@ static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr, } if (pud_none(*pud)) { - pud_populate(&init_mm, pud, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + pmd_t *p; + + if (slab_is_available()) { + p = pmd_alloc(&init_mm, pud, addr); + if (!p) + return -ENOMEM; + } else { + pud_populate(&init_mm, pud, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } } zero_pmd_populate(pud, addr, next); } while (pud++, addr = next, addr != end); + + return 0; } -static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, +static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end) { p4d_t *p4d = p4d_offset(pgd, addr); @@ -132,11 +192,21 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, } if (p4d_none(*p4d)) { - p4d_populate(&init_mm, p4d, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + pud_t *p; + + if (slab_is_available()) { + p = pud_alloc(&init_mm, p4d, addr); + if (!p) + return -ENOMEM; + } else { + p4d_populate(&init_mm, p4d, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } } zero_pud_populate(p4d, addr, next); } while (p4d++, addr = next, addr != end); + + return 0; } /** @@ -145,7 +215,7 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, * @shadow_start - start of the memory range to populate * @shadow_end - end of the memory range to populate */ -void __init 
kasan_populate_zero_shadow(const void *shadow_start, +int __ref kasan_populate_zero_shadow(const void *shadow_start, const void *shadow_end) { unsigned long addr = (unsigned long)shadow_start; @@ -191,9 +261,229 @@ void __init kasan_populate_zero_shadow(const void *shadow_start, } if (pgd_none(*pgd)) { - pgd_populate(&init_mm, pgd, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + p4d_t *p; + + if (slab_is_available()) { + p = p4d_alloc(&init_mm, pgd, addr); + if (!p) + return -ENOMEM; + } else { + pgd_populate(&init_mm, pgd, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } } zero_p4d_populate(pgd, addr, next); } while (pgd++, addr = next, addr != end); + + return 0; +} + +static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pte_free_kernel(&init_mm, (pte_t *)page_to_virt(pmd_page(*pmd))); + pmd_clear(pmd); +} + +static void kasan_free_pmd(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + pmd_free(&init_mm, (pmd_t *)page_to_virt(pud_page(*pud))); + pud_clear(pud); +} + +static void kasan_free_pud(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + pud_free(&init_mm, (pud_t *)page_to_virt(p4d_page(*p4d))); + p4d_clear(p4d); +} + +static void kasan_free_p4d(p4d_t *p4d_start, pgd_t *pgd) +{ + p4d_t *p4d; + int i; + + for (i = 0; i < PTRS_PER_P4D; i++) { + p4d = p4d_start + i; + if (!p4d_none(*p4d)) + return; + } + + p4d_free(&init_mm, (p4d_t *)page_to_virt(pgd_page(*pgd))); + pgd_clear(pgd); +} + +static void kasan_remove_pte_table(pte_t *pte, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if 
(!pte_present(*pte)) + continue; + + if (WARN_ON(!kasan_zero_page_entry(*pte))) + continue; + pte_clear(&init_mm, addr, pte); + } +} + +static void kasan_remove_pmd_table(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pmd++) { + pte_t *pte; + + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (kasan_pte_table(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) + pmd_clear(pmd); + continue; + } + pte = pte_offset_kernel(pmd, addr); + kasan_remove_pte_table(pte, addr, next); + kasan_free_pte(pte_offset_kernel(pmd, 0), pmd); + } +} + +static void kasan_remove_pud_table(pud_t *pud, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pud++) { + pmd_t *pmd, *pmd_base; + + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (kasan_pmd_table(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) + pud_clear(pud); + continue; + } + pmd = pmd_offset(pud, addr); + pmd_base = pmd_offset(pud, 0); + kasan_remove_pmd_table(pmd, addr, next); + kasan_free_pmd(pmd_base, pud); + } +} + +static void kasan_remove_p4d_table(p4d_t *p4d, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, p4d++) { + pud_t *pud; + + next = p4d_addr_end(addr, end); + + if (!p4d_present(*p4d)) + continue; + + if (kasan_pud_table(*p4d)) { + if (IS_ALIGNED(addr, P4D_SIZE) && + IS_ALIGNED(next, P4D_SIZE)) + p4d_clear(p4d); + continue; + } + pud = pud_offset(p4d, addr); + kasan_remove_pud_table(pud, addr, next); + kasan_free_pud(pud_offset(p4d, 0), p4d); + } +} + +void kasan_remove_zero_shadow(void *start, unsigned long size) +{ + unsigned long addr, end, next; + pgd_t *pgd; + + addr = (unsigned long)kasan_mem_to_shadow(start); + end = addr + (size >> KASAN_SHADOW_SCALE_SHIFT); + + if (WARN_ON((unsigned long)start % + (KASAN_SHADOW_SCALE_SIZE * 
PAGE_SIZE)) || + WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) + return; + + for (; addr < end; addr = next) { + p4d_t *p4d; + + next = pgd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + if (!pgd_present(*pgd)) + continue; + + if (kasan_p4d_table(*pgd)) { + if (IS_ALIGNED(addr, PGDIR_SIZE) && + IS_ALIGNED(next, PGDIR_SIZE)) + pgd_clear(pgd); + continue; + } + + p4d = p4d_offset(pgd, addr); + kasan_remove_p4d_table(p4d, addr, next); + kasan_free_p4d(p4d_offset(pgd, 0), pgd); + } +} + +int kasan_add_zero_shadow(void *start, unsigned long size) +{ + int ret; + void *shadow_start, *shadow_end; + + shadow_start = kasan_mem_to_shadow(start); + shadow_end = shadow_start + (size >> KASAN_SHADOW_SCALE_SHIFT); + + if (WARN_ON((unsigned long)start % + (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) || + WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) + return -EINVAL; + + ret = kasan_populate_zero_shadow(shadow_start, shadow_end); + if (ret) + kasan_remove_zero_shadow(shadow_start, + size >> KASAN_SHADOW_SCALE_SHIFT); + return ret; } From d39f8fb4b7776dcb09ec3bf7a321547083078ee3 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 17 Aug 2018 15:47:07 -0700 Subject: [PATCH 064/111] mm: make DEFERRED_STRUCT_PAGE_INIT explicitly depend on SPARSEMEM The deferred memory initialization relies on section definitions, e.g. PAGES_PER_SECTION, that are only available when CONFIG_SPARSEMEM=y on most architectures. Initially DEFERRED_STRUCT_PAGE_INIT depended on explicit ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT configuration option, but since the commit 2e3ca40f03bb13709df4 ("mm: relax deferred struct page requirements") this requirement was relaxed and now it is possible to enable DEFERRED_STRUCT_PAGE_INIT on architectures that support DISCONTIGMEM and NO_BOOTMEM which causes build failures.
For instance, setting SMP=y and DEFERRED_STRUCT_PAGE_INIT=y on arc causes the following build failure: CC mm/page_alloc.o mm/page_alloc.c: In function 'update_defer_init': mm/page_alloc.c:321:14: error: 'PAGES_PER_SECTION' undeclared (first use in this function); did you mean 'USEC_PER_SEC'? (pfn & (PAGES_PER_SECTION - 1)) == 0) { ^~~~~~~~~~~~~~~~~ USEC_PER_SEC mm/page_alloc.c:321:14: note: each undeclared identifier is reported only once for each function it appears in In file included from include/linux/cache.h:5:0, from include/linux/printk.h:9, from include/linux/kernel.h:14, from include/asm-generic/bug.h:18, from arch/arc/include/asm/bug.h:32, from include/linux/bug.h:5, from include/linux/mmdebug.h:5, from include/linux/mm.h:9, from mm/page_alloc.c:18: mm/page_alloc.c: In function 'deferred_grow_zone': mm/page_alloc.c:1624:52: error: 'PAGES_PER_SECTION' undeclared (first use in this function); did you mean 'USEC_PER_SEC'? unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); ^ include/uapi/linux/kernel.h:11:47: note: in definition of macro '__ALIGN_KERNEL_MASK' #define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) ^~~~ include/linux/kernel.h:58:22: note: in expansion of macro '__ALIGN_KERNEL' #define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) ^~~~~~~~~~~~~~ mm/page_alloc.c:1624:34: note: in expansion of macro 'ALIGN' unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); ^~~~~ In file included from include/asm-generic/bug.h:18:0, from arch/arc/include/asm/bug.h:32, from include/linux/bug.h:5, from include/linux/mmdebug.h:5, from include/linux/mm.h:9, from mm/page_alloc.c:18: mm/page_alloc.c: In function 'free_area_init_node': mm/page_alloc.c:6379:50: error: 'PAGES_PER_SECTION' undeclared (first use in this function); did you mean 'USEC_PER_SEC'? 
pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, ^ include/linux/kernel.h:812:22: note: in definition of macro '__typecheck' (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) ^ include/linux/kernel.h:836:24: note: in expansion of macro '__safe_cmp' __builtin_choose_expr(__safe_cmp(x, y), \ ^~~~~~~~~~ include/linux/kernel.h:904:27: note: in expansion of macro '__careful_cmp' #define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <) ^~~~~~~~~~~~~ mm/page_alloc.c:6379:29: note: in expansion of macro 'min_t' pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, ^~~~~ include/linux/kernel.h:836:2: error: first argument to '__builtin_choose_expr' not a constant __builtin_choose_expr(__safe_cmp(x, y), \ ^ include/linux/kernel.h:904:27: note: in expansion of macro '__careful_cmp' #define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <) ^~~~~~~~~~~~~ mm/page_alloc.c:6379:29: note: in expansion of macro 'min_t' pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, ^~~~~ scripts/Makefile.build:317: recipe for target 'mm/page_alloc.o' failed Let's make the DEFERRED_STRUCT_PAGE_INIT explicitly depend on SPARSEMEM as the systems that support DISCONTIGMEM do not seem to have that huge amounts of memory that would make DEFERRED_STRUCT_PAGE_INIT relevant. 
Link: http://lkml.kernel.org/r/1530279308-24988-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Michal Hocko Reviewed-by: Pavel Tatashin Tested-by: Randy Dunlap Cc: Pasha Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 9ae1b6a8e30f..08d8399bb93b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -638,7 +638,7 @@ config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" default n depends on NO_BOOTMEM - depends on !FLATMEM + depends on SPARSEMEM depends on !NEED_PER_CPU_KM help Ordinarily all struct pages are initialised during early boot in a From 29ef680ae7c21110af8e6416d84d8a72fc147b14 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 17 Aug 2018 15:47:11 -0700 Subject: [PATCH 065/111] memcg, oom: move out_of_memory back to the charge path Commit 3812c8c8f395 ("mm: memcg: do not trap chargers with full callstack on OOM") has changed the ENOMEM semantic of memcg charges. Rather than invoking the oom killer from the charging context it delays the oom killer to the page fault path (pagefault_out_of_memory). This in turn means that many users (e.g. slab or g-u-p) will get ENOMEM when the corresponding memcg hits the hard limit and the memcg is OOM. This behavior is inconsistent with the !memcg case where the oom killer is invoked from the allocation context and the allocator keeps retrying until it succeeds. The difference in the behavior is user visible. mmap(MAP_POPULATE) might result in not fully populated ranges while the mmap return code doesn't tell that to the userspace. Random syscalls might fail with ENOMEM etc. The primary motivation of the different memcg oom semantic was the deadlock avoidance. Things have changed since then, though. We have an async oom teardown by the oom reaper now and so we do not have to rely on the victim to tear down its memory anymore.
Therefore we can return to the original semantic as long as the memcg oom killer is not handed over to the user space. There is still one thing to be careful about here though. If the oom killer is not able to make any forward progress - e.g. because there is no eligible task to kill - then we have to bail out of the charge path to prevent the same class of deadlocks. We have basically two options here. Either we fail the charge with ENOMEM or force the charge and allow overcharge. The first option has been considered more harmful than useful because rare inconsistencies in the ENOMEM behavior are hard to test for and error prone. Basically the same reason why the page allocator doesn't fail allocations under such conditions. The latter might allow runaways but those should be really unlikely unless somebody misconfigures the system. E.g. allowing to migrate tasks away from the memcg to a different unlimited memcg with move_charge_at_immigrate disabled. Link: http://lkml.kernel.org/r/20180628151101.25307-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Greg Thelen Cc: Johannes Weiner Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 16 ++++---- include/linux/sched.h | 2 +- mm/memcontrol.c | 75 ++++++++++++++++++++++++++++++-------- mm/memory.c | 4 +- 4 files changed, 71 insertions(+), 26 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 50e3e807b427..57a202f31683 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -507,16 +507,16 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); -static inline void mem_cgroup_oom_enable(void) +static inline void mem_cgroup_enter_user_fault(void) { - WARN_ON(current->memcg_may_oom); - current->memcg_may_oom = 1; + WARN_ON(current->in_user_fault); + current->in_user_fault = 1; } -static inline void 
mem_cgroup_oom_disable(void) +static inline void mem_cgroup_exit_user_fault(void) { - WARN_ON(!current->memcg_may_oom); - current->memcg_may_oom = 0; + WARN_ON(!current->in_user_fault); + current->in_user_fault = 0; } static inline bool task_in_memcg_oom(struct task_struct *p) @@ -961,11 +961,11 @@ static inline void mem_cgroup_handle_over_high(void) { } -static inline void mem_cgroup_oom_enable(void) +static inline void mem_cgroup_enter_user_fault(void) { } -static inline void mem_cgroup_oom_disable(void) +static inline void mem_cgroup_exit_user_fault(void) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index 1827f4a7a6de..066a2c328653 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -722,7 +722,7 @@ struct task_struct { unsigned restore_sigmask:1; #endif #ifdef CONFIG_MEMCG - unsigned memcg_may_oom:1; + unsigned in_user_fault:1; #ifndef CONFIG_SLOB unsigned memcg_kmem_skip_account:1; #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c071af193986..d6724bed57d8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1534,28 +1534,53 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } -static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) +enum oom_status { + OOM_SUCCESS, + OOM_FAILED, + OOM_ASYNC, + OOM_SKIPPED +}; + +static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER) - return; + if (order > PAGE_ALLOC_COSTLY_ORDER) + return OOM_SKIPPED; + /* * We are in the middle of the charge context here, so we * don't want to block when potentially sitting on a callstack * that holds all kinds of filesystem and mm locks. * - * Also, the caller may handle a failed allocation gracefully - * (like optional page cache readahead) and so an OOM killer - * invocation might not even be necessary. 
+ * cgroup1 allows disabling the OOM killer and waiting for outside + * handling until the charge can succeed; remember the context and put + * the task to sleep at the end of the page fault when all locks are + * released. * - * That's why we don't do anything here except remember the - * OOM context and then deal with it at the end of the page - * fault when the stack is unwound, the locks are released, - * and when we know whether the fault was overall successful. + * On the other hand, in-kernel OOM killer allows for an async victim + * memory reclaim (oom_reaper) and that means that we are not solely + * relying on the oom victim to make a forward progress and we can + * invoke the oom killer here. + * + * Please note that mem_cgroup_out_of_memory might fail to find a + * victim and then we have to bail out from the charge path. */ - css_get(&memcg->css); - current->memcg_in_oom = memcg; - current->memcg_oom_gfp_mask = mask; - current->memcg_oom_order = order; + if (memcg->oom_kill_disable) { + if (!current->in_user_fault) + return OOM_SKIPPED; + css_get(&memcg->css); + current->memcg_in_oom = memcg; + current->memcg_oom_gfp_mask = mask; + current->memcg_oom_order = order; + + return OOM_ASYNC; + } + + if (mem_cgroup_out_of_memory(memcg, mask, order)) + return OOM_SUCCESS; + + WARN(1,"Memory cgroup charge failed because of no reclaimable memory! 
" + "This looks like a misconfiguration or a kernel bug."); + return OOM_FAILED; } /** @@ -1950,6 +1975,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long nr_reclaimed; bool may_swap = true; bool drained = false; + bool oomed = false; + enum oom_status oom_status; if (mem_cgroup_is_root(memcg)) return 0; @@ -2037,6 +2064,9 @@ retry: if (nr_retries--) goto retry; + if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed) + goto nomem; + if (gfp_mask & __GFP_NOFAIL) goto force; @@ -2045,8 +2075,23 @@ retry: memcg_memory_event(mem_over_limit, MEMCG_OOM); - mem_cgroup_oom(mem_over_limit, gfp_mask, + /* + * keep retrying as long as the memcg oom killer is able to make + * a forward progress or bypass the charge if the oom killer + * couldn't make any progress. + */ + oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); + switch (oom_status) { + case OOM_SUCCESS: + nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + oomed = true; + goto retry; + case OOM_FAILED: + goto force; + default: + goto nomem; + } nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; diff --git a/mm/memory.c b/mm/memory.c index 175f344e1523..ae2ec887508b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4153,7 +4153,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, * space. Kernel faults are handled more gracefully. 
*/ if (flags & FAULT_FLAG_USER) - mem_cgroup_oom_enable(); + mem_cgroup_enter_user_fault(); if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); @@ -4161,7 +4161,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, ret = __handle_mm_fault(vma, address, flags); if (flags & FAULT_FLAG_USER) { - mem_cgroup_oom_disable(); + mem_cgroup_exit_user_fault(); /* * The task may have entered a memcg OOM situation but * if the allocation error was handled gracefully (no From 4e40987f12de2f244d0d2ef64730aca92922c95a Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Fri, 17 Aug 2018 15:47:14 -0700 Subject: [PATCH 066/111] mm/sparse.c: make sparse_init_one_section void and remove check sparse_init_one_section() is being called from two sites: sparse_init() and sparse_add_one_section(). The former calls it from a for_each_present_section_nr() loop, and the latter marks the section as present before calling it. This means that when sparse_init_one_section() gets called, we already know that the section is present. So there is no point to double check that in the function. This removes the check and makes the function void. 
[ross.zwisler@linux.intel.com: fix error path in sparse_add_one_section] Link: http://lkml.kernel.org/r/20180706190658.6873-1-ross.zwisler@linux.intel.com [ross.zwisler@linux.intel.com: simplification suggested by Oscar] Link: http://lkml.kernel.org/r/20180706223358.742-1-ross.zwisler@linux.intel.com Link: http://lkml.kernel.org/r/20180702154325.12196-1-osalvador@techadventures.net Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: Pavel Tatashin Reviewed-by: Andrew Morton Cc: Pasha Tatashin Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index f13f2723950a..b1b14a9c4041 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -257,19 +257,14 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); } -static int __meminit sparse_init_one_section(struct mem_section *ms, +static void __meminit sparse_init_one_section(struct mem_section *ms, unsigned long pnum, struct page *mem_map, unsigned long *pageblock_bitmap) { - if (!present_section(ms)) - return -EINVAL; - ms->section_mem_map &= ~SECTION_MAP_MASK; ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) | SECTION_HAS_MEM_MAP; ms->pageblock_flags = pageblock_bitmap; - - return 1; } unsigned long usemap_size(void) @@ -760,6 +755,7 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, ret = sparse_index_init(section_nr, pgdat->node_id); if (ret < 0 && ret != -EEXIST) return ret; + ret = 0; memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, altmap); if (!memmap) return -ENOMEM; @@ -786,12 +782,11 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, #endif section_mark_present(ms); - - ret = sparse_init_one_section(ms, section_nr, memmap, usemap); + sparse_init_one_section(ms, section_nr, memmap, usemap); out: 
pgdat_resize_unlock(pgdat, &flags); - if (ret <= 0) { + if (ret < 0) { kfree(usemap); __kfree_section_memmap(memmap, altmap); } From a36aab890c2166744e6299dc55ef2c38cb6616c0 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 17 Aug 2018 15:47:17 -0700 Subject: [PATCH 067/111] mm/memblock.c: replace u64 with phys_addr_t where appropriate Most functions in memblock already use phys_addr_t to represent a physical address with __memblock_free_late() being an exception. This patch replaces u64 with phys_addr_t in __memblock_free_late() and switches several format strings from %llx to %pa to avoid casting from phys_addr_t to u64. Link: http://lkml.kernel.org/r/1530637506-1256-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Pasha Tatashin Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index b4ad05764745..237944479d25 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -392,7 +392,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, { struct memblock_region *new_array, *old_array; phys_addr_t old_alloc_size, new_alloc_size; - phys_addr_t old_size, new_size, addr; + phys_addr_t old_size, new_size, addr, new_end; int use_slab = slab_is_available(); int *in_slab; @@ -453,9 +453,9 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, return -1; } - memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", - type->name, type->max * 2, (u64)addr, - (u64)addr + new_size - 1); + new_end = addr + new_size - 1; + memblock_dbg("memblock: %s is doubled to %ld at [%pa-%pa]", + type->name, type->max * 2, &addr, &new_end); /* * Found space, we now need to move the array over before we add the @@ -1438,9 +1438,9 @@ void * __init 
memblock_virt_alloc_try_nid_raw( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", - __func__, (u64)size, (u64)align, nid, (u64)min_addr, - (u64)max_addr, (void *)_RET_IP_); + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); @@ -1475,9 +1475,9 @@ void * __init memblock_virt_alloc_try_nid_nopanic( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", - __func__, (u64)size, (u64)align, nid, (u64)min_addr, - (u64)max_addr, (void *)_RET_IP_); + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); @@ -1511,9 +1511,9 @@ void * __init memblock_virt_alloc_try_nid( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", - __func__, (u64)size, (u64)align, nid, (u64)min_addr, - (u64)max_addr, (void *)_RET_IP_); + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); if (ptr) { @@ -1521,9 +1521,8 @@ void * __init memblock_virt_alloc_try_nid( return ptr; } - panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", - __func__, (u64)size, (u64)align, nid, (u64)min_addr, - (u64)max_addr); + panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa\n", + __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr); return NULL; } #endif @@ -1538,9 +1537,10 @@ void * __init memblock_virt_alloc_try_nid( */ void __init 
__memblock_free_early(phys_addr_t base, phys_addr_t size) { - memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", - __func__, (u64)base, (u64)base + size - 1, - (void *)_RET_IP_); + phys_addr_t end = base + size - 1; + + memblock_dbg("%s: [%pa-%pa] %pF\n", + __func__, &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); memblock_remove_range(&memblock.reserved, base, size); } @@ -1556,11 +1556,11 @@ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) */ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) { - u64 cursor, end; + phys_addr_t cursor, end; - memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", - __func__, (u64)base, (u64)base + size - 1, - (void *)_RET_IP_); + end = base + size - 1; + memblock_dbg("%s: [%pa-%pa] %pF\n", + __func__, &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); cursor = PFN_UP(base); end = PFN_DOWN(base + size); From e0295238e50f1aa16d4c902c837fd8d17861b698 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:21 -0700 Subject: [PATCH 068/111] mm/list_lru.c: combine code under the same define Patch series "Improve shrink_slab() scalability (old complexity was O(n^2), new is O(n))", v8. This patcheset solves the problem with slow shrink_slab() occuring on the machines having many shrinkers and memory cgroups (i.e., with many containers). The problem is complexity of shrink_slab() is O(n^2) and it grows too fast with the growth of containers numbers. Let us have 200 containers, and every container has 10 mounts and 10 cgroups. All container tasks are isolated, and they don't touch foreign containers mounts. In case of global reclaim, a task has to iterate all over the memcgs and to call all the memcg-aware shrinkers for all of them. This means, the task has to visit 200 * 10 = 2000 shrinkers for every memcg, and since there are 2000 memcgs, the total calls of do_shrink_slab() are 2000 * 2000 = 4000000. 
4 million calls are not operations which each take 1 cpu cycle. E.g., super_cache_count() accesses at least two lists, and makes arithmetical calculations. Even if there are no charged objects, we do these calculations, and replace cpu cache contents with the memory read. I observed nodes spending almost 100% time in kernel, in case of intensive writing and global reclaim. The writer consumes pages fast, but it needs to shrink_slab() before the reclaimer reaches the shrink pages function (and frees SWAP_CLUSTER_MAX pages). Even if there is no writing, the iterations just waste time, and slow reclaim down. Let's see the small test below: $echo 1 > /sys/fs/cgroup/memory/memory.use_hierarchy $mkdir /sys/fs/cgroup/memory/ct $echo 4000M > /sys/fs/cgroup/memory/ct/memory.kmem.limit_in_bytes $for i in `seq 0 4000`; do mkdir /sys/fs/cgroup/memory/ct/$i; echo $$ > /sys/fs/cgroup/memory/ct/$i/cgroup.procs; mkdir -p s/$i; mount -t tmpfs $i s/$i; touch s/$i/file; done Then, let's see drop caches time (5 sequential calls): $time echo 3 > /proc/sys/vm/drop_caches 0.00user 13.78system 0:13.78elapsed 99%CPU 0.00user 5.59system 0:05.60elapsed 99%CPU 0.00user 5.48system 0:05.48elapsed 99%CPU 0.00user 8.35system 0:08.35elapsed 99%CPU 0.00user 8.34system 0:08.35elapsed 99%CPU The last four calls don't actually shrink anything. So, the iterations over slab shrinkers take 5.48 seconds. Not so good for scalability. The patchset solves the problem by making shrink_slab() of O(n) complexity. There are the following functional actions: 1) Assign id to every registered memcg-aware shrinker. 2) Maintain per-memcgroup bitmap of memcg-aware shrinkers, and set a shrinker-related bit after the first element is added to lru list (also, when removed child memcg elements are reparented). 3) Split memcg-aware shrinkers and !memcg-aware shrinkers, and call a shrinker if its bit is set in memcg's shrinker bitmap. (Also, there is a functionality to clear the bit, after the last element is shrunk). 
This gives significant performance increase. The result after patchset is applied: $time echo 3 > /proc/sys/vm/drop_caches 0.00user 1.10system 0:01.10elapsed 99%CPU 0.00user 0.00system 0:00.01elapsed 64%CPU 0.00user 0.01system 0:00.01elapsed 82%CPU 0.00user 0.00system 0:00.01elapsed 64%CPU 0.00user 0.01system 0:00.01elapsed 82%CPU The results show the performance increases at least in 548 times. So, the patchset makes shrink_slab() of less complexity and improves the performance in such types of load I pointed. This will give a profit in case of !global reclaim case, since there also will be less do_shrink_slab() calls. This patch (of 17): These two pairs of blocks of code are under the same #ifdef #else #endif. Link: http://lkml.kernel.org/r/153063052519.1818.9393587113056959488.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Johannes Weiner Cc: Michal Hocko Cc: Thomas Gleixner Cc: Philippe Ombredanne Cc: Sahitya Tummala Cc: Greg Kroah-Hartman Cc: Stephen Rothwell Cc: Roman Gushchin Cc: Matthias Kaehlcke Cc: Tetsuo Handa Cc: Chris Wilson Cc: Waiman Long Cc: Minchan Kim Cc: "Huang, Ying" Cc: Mel Gorman Cc: Josef Bacik Cc: Guenter Roeck Cc: Matthew Wilcox Cc: Li RongQing Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index db679a057f46..b65e0b9b0646 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -29,17 +29,7 @@ static void list_lru_unregister(struct list_lru *lru) list_del(&lru->list); mutex_unlock(&list_lrus_mutex); } -#else -static void list_lru_register(struct list_lru *lru) -{ -} -static void list_lru_unregister(struct list_lru *lru) -{ -} -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ - -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static inline bool list_lru_memcg_aware(struct list_lru *lru) { /* @@ -89,6 +79,14 @@ 
list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); } #else +static void list_lru_register(struct list_lru *lru) +{ +} + +static void list_lru_unregister(struct list_lru *lru) +{ +} + static inline bool list_lru_memcg_aware(struct list_lru *lru) { return false; From 84c07d11aa619c6d24c682f469b10f344f0c02aa Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:25 -0700 Subject: [PATCH 069/111] mm: introduce CONFIG_MEMCG_KMEM as combination of CONFIG_MEMCG && !CONFIG_SLOB Introduce new config option, which is used to replace repeating CONFIG_MEMCG && !CONFIG_SLOB pattern. Next patches add a little more memcg+kmem related code, so let's keep the defines more clearly. Link: http://lkml.kernel.org/r/153063053670.1818.15013136946600481138.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_lru.h | 4 ++-- include/linux/memcontrol.h | 6 +++--- include/linux/sched.h | 2 +- include/linux/slab.h | 2 +- init/Kconfig | 5 +++++ mm/list_lru.c | 8 ++++---- mm/memcontrol.c | 16 ++++++++-------- mm/slab.h | 6 +++--- mm/slab_common.c | 8 ++++---- 9 files changed, 31 insertions(+), 26 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 96def9d15b1b..2d23b5b745be 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -42,7 +42,7 @@ struct list_lru_node { spinlock_t lock; /* global list, used for the root cgroup in cgroup aware lrus */ 
struct list_lru_one lru; -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ struct list_lru_memcg __rcu *memcg_lrus; #endif @@ -51,7 +51,7 @@ struct list_lru_node { struct list_lru { struct list_lru_node *node; -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM struct list_head list; #endif }; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 57a202f31683..f3c026df7443 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -271,7 +271,7 @@ struct mem_cgroup { bool tcpmem_active; int tcpmem_pressure; -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; enum memcg_kmem_state kmem_state; @@ -1231,7 +1231,7 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); void memcg_kmem_uncharge(struct page *page, int order); -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM extern struct static_key_false memcg_kmem_enabled_key; extern struct workqueue_struct *memcg_kmem_cache_wq; @@ -1284,6 +1284,6 @@ static inline void memcg_put_cache_ids(void) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 066a2c328653..789923fbee3a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -723,7 +723,7 @@ struct task_struct { #endif #ifdef CONFIG_MEMCG unsigned in_user_fault:1; -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM unsigned memcg_kmem_skip_account:1; #endif #endif diff --git a/include/linux/slab.h b/include/linux/slab.h index 14e3fe4bd6a1..ed9cbddeb4a6 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -97,7 +97,7 @@ # define SLAB_FAILSLAB 0 #endif /* Account to memcg */ -#if 
defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM # define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000U) #else # define SLAB_ACCOUNT 0 diff --git a/init/Kconfig b/init/Kconfig index 4dc783023e43..9bd50ba8253f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -708,6 +708,11 @@ config MEMCG_SWAP_ENABLED select this option (if, for some reason, they need to disable it then swapaccount=0 does the trick). +config MEMCG_KMEM + bool + depends on MEMCG && !SLOB + default y + config BLK_CGROUP bool "IO controller" depends on BLOCK diff --git a/mm/list_lru.c b/mm/list_lru.c index b65e0b9b0646..c5217d84c6e1 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -12,7 +12,7 @@ #include #include -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM static LIST_HEAD(list_lrus); static DEFINE_MUTEX(list_lrus_mutex); @@ -103,7 +103,7 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) { return &nlru->lru; } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ bool list_lru_add(struct list_lru *lru, struct list_head *item) { @@ -284,7 +284,7 @@ static void init_one_lru(struct list_lru_one *l) l->nr_items = 0; } -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, int begin, int end) { @@ -543,7 +543,7 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) static void memcg_destroy_list_lru(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d6724bed57d8..2f00b455080f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -251,7 +251,7 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM /* * This will be the 
memcg's index in each cache's ->memcg_params.memcg_caches. * The main reason for not using cgroup id for this: @@ -305,7 +305,7 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); struct workqueue_struct *memcg_kmem_cache_wq; -#endif /* !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ /** * mem_cgroup_css_from_page - css of the memcg associated with a page @@ -2215,7 +2215,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, unlock_page_lru(page, isolated); } -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM static int memcg_alloc_cache_id(void) { int id, size; @@ -2480,7 +2480,7 @@ void memcg_kmem_uncharge(struct page *page, int order) css_put_many(&memcg->css, nr_pages); } -#endif /* !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2875,7 +2875,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { int memcg_id; @@ -2975,7 +2975,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) static void memcg_free_kmem(struct mem_cgroup *memcg) { } -#endif /* !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ static int memcg_update_kmem_max(struct mem_cgroup *memcg, unsigned long max) @@ -4279,7 +4279,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); memcg->socket_pressure = jiffies; -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; #endif #ifdef CONFIG_CGROUP_WRITEBACK @@ -6119,7 +6119,7 @@ static int __init mem_cgroup_init(void) { int cpu, node; -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM /* * Kmem cache creation is mostly done with the slab_mutex held, * so use a workqueue with limited concurrency to avoid stalling diff --git a/mm/slab.h b/mm/slab.h index 68bdf498da3b..58c6c1c2a78e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -203,7 +203,7 @@ ssize_t slabinfo_write(struct file *file, const char 
__user *buffer, void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM /* List of all root caches. */ extern struct list_head slab_root_caches; @@ -296,7 +296,7 @@ extern void memcg_link_cache(struct kmem_cache *s); extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, void (*deact_fn)(struct kmem_cache *)); -#else /* CONFIG_MEMCG && !CONFIG_SLOB */ +#else /* CONFIG_MEMCG_KMEM */ /* If !memcg, all caches are root. */ #define slab_root_caches slab_caches @@ -351,7 +351,7 @@ static inline void memcg_link_cache(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 2296caf87bfb..fea3376f9816 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -127,7 +127,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, return i; } -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM LIST_HEAD(slab_root_caches); @@ -256,7 +256,7 @@ static inline void destroy_memcg_params(struct kmem_cache *s) static inline void memcg_unlink_cache(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ /* * Figure out what the alignment of the objects will be given a set of @@ -584,7 +584,7 @@ static int shutdown_cache(struct kmem_cache *s) return 0; } -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM /* * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. 
@@ -861,7 +861,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s) static inline void flush_memcg_workqueue(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ void slab_kmem_cache_release(struct kmem_cache *s) { From b4c2b231c3ba155623591fb6301ed97b95e1c039 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:29 -0700 Subject: [PATCH 070/111] mm: assign id to every memcg-aware shrinker Introduce shrinker::id number, which is used to enumerate memcg-aware shrinkers. The numbers start from 0, and the code tries to keep them as small as possible. This will be used to represent memcg-aware shrinkers in the memcg shrinkers map. Since all memcg-aware shrinkers are based on list_lru, which is per-memcg only when CONFIG_MEMCG_KMEM is enabled, the new functionality will be under this config option. [ktkhai@virtuozzo.com: v9] Link: http://lkml.kernel.org/r/153112546435.4097.10607140323811756557.stgit@localhost.localdomain Link: http://lkml.kernel.org/r/153063054586.1818.6041047871606697364.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shrinker.h | 4 +++ mm/vmscan.c | 63 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 6794490f25b2..7ca9c18cf130 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -66,6 +66,10 @@ struct shrinker { /* These are for internal use */ struct
list_head list; +#ifdef CONFIG_MEMCG_KMEM + /* ID in shrinker_idr */ + int id; +#endif /* objs pending delete, per node */ atomic_long_t *nr_deferred; }; diff --git a/mm/vmscan.c b/mm/vmscan.c index a00d94530e57..5cb4f779ea4a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -169,6 +169,50 @@ unsigned long vm_total_pages; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); +#ifdef CONFIG_MEMCG_KMEM +static DEFINE_IDR(shrinker_idr); +static int shrinker_nr_max; + +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + int id, ret = -ENOMEM; + + down_write(&shrinker_rwsem); + /* This may call shrinker, so it must use down_read_trylock() */ + id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); + if (id < 0) + goto unlock; + + if (id >= shrinker_nr_max) + shrinker_nr_max = id + 1; + shrinker->id = id; + ret = 0; +unlock: + up_write(&shrinker_rwsem); + return ret; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ + int id = shrinker->id; + + BUG_ON(id < 0); + + down_write(&shrinker_rwsem); + idr_remove(&shrinker_idr, id); + up_write(&shrinker_rwsem); +} +#else /* CONFIG_MEMCG_KMEM */ +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + return 0; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + #ifdef CONFIG_MEMCG static bool global_reclaim(struct scan_control *sc) { @@ -313,11 +357,28 @@ int prealloc_shrinker(struct shrinker *shrinker) shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); if (!shrinker->nr_deferred) return -ENOMEM; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + if (prealloc_memcg_shrinker(shrinker)) + goto free_deferred; + } + return 0; + +free_deferred: + kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; + return -ENOMEM; } void free_prealloced_shrinker(struct shrinker *shrinker) { + if (!shrinker->nr_deferred) + return; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); 
+ kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; } @@ -347,6 +408,8 @@ void unregister_shrinker(struct shrinker *shrinker) { if (!shrinker->nr_deferred) return; + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); down_write(&shrinker_rwsem); list_del(&shrinker->list); up_write(&shrinker_rwsem); From b05706f1001fe662bafe198814c5999fd996dce0 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:33 -0700 Subject: [PATCH 071/111] mm/memcontrol.c: move up for_each_mem_cgroup{, _tree} defines Next patch requires these defines are above their current position, so here they are moved to declarations. Link: http://lkml.kernel.org/r/153063055665.1818.5200425793649695598.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2f00b455080f..313355dddf66 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -233,6 +233,21 @@ enum res_type { /* Used for OOM nofiier */ #define OOM_CONTROL (0) +/* + * Iteration constructs for visiting all cgroups (under a tree). If + * loops are exited prematurely (break), mem_cgroup_iter_break() must + * be used for reference counting. 
+ */ +#define for_each_mem_cgroup_tree(iter, root) \ + for (iter = mem_cgroup_iter(root, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(root, iter, NULL)) + +#define for_each_mem_cgroup(iter) \ + for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(NULL, iter, NULL)) + /* Some nice accessors for the vmpressure. */ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) { @@ -913,21 +928,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) } } -/* - * Iteration constructs for visiting all cgroups (under a tree). If - * loops are exited prematurely (break), mem_cgroup_iter_break() must - * be used for reference counting. - */ -#define for_each_mem_cgroup_tree(iter, root) \ - for (iter = mem_cgroup_iter(root, NULL, NULL); \ - iter != NULL; \ - iter = mem_cgroup_iter(root, iter, NULL)) - -#define for_each_mem_cgroup(iter) \ - for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ - iter != NULL; \ - iter = mem_cgroup_iter(NULL, iter, NULL)) - /** * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy * @memcg: hierarchy root From 0a4465d340282f92719f4e3a56545a848e638d15 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:37 -0700 Subject: [PATCH 072/111] mm, memcg: assign memcg-aware shrinkers bitmap to memcg Imagine a big node with many cpus, memory cgroups and containers. Suppose we have 200 containers, and every container has 10 mounts and 10 cgroups. Container tasks never touch other containers' mounts. If pages are being written intensively and global reclaim happens, a writing task has to iterate over all memcgs to shrink slab, before it's able to go to shrink_page_list(). Iteration over all the memcg slabs is very expensive: the task has to visit 200 * 10 = 2000 shrinkers for every memcg, and since there are 2000 memcgs, the total calls are 2000 * 2000 = 4000000.
So, the shrinker makes 4 million do_shrink_slab() calls just to try to isolate SWAP_CLUSTER_MAX pages in one of the actively writing memcgs via shrink_page_list(). I've observed a node spending almost 100% of its time in the kernel, uselessly iterating over already-shrunk slabs. This patch adds a bitmap of memcg-aware shrinkers to memcg. The size of the bitmap depends on shrinker_nr_max, and during memcg life it's maintained to be enough to fit shrinker_nr_max shrinkers. Every bit in the map corresponds to a shrinker id. The next patches will keep a bit set only for memcgs that actually have charged objects. This will allow shrink_slab() to significantly improve its performance. See the last patch for the numbers. [ktkhai@virtuozzo.com: v9] Link: http://lkml.kernel.org/r/153112549031.4097.3576147070498769979.stgit@localhost.localdomain [ktkhai@virtuozzo.com: add comment to mem_cgroup_css_online()] Link: http://lkml.kernel.org/r/521f9e5f-c436-b388-fe83-4dc870bfb489@virtuozzo.com Link: http://lkml.kernel.org/r/153063056619.1818.12550500883688681076.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 14 +++++ mm/memcontrol.c | 124 +++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 8 ++- 3 files changed, 145 insertions(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f3c026df7443..2cccbb9e1b3e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -111,6 +111,15 @@ struct lruvec_stat { long
count[NR_VM_NODE_STAT_ITEMS]; }; +/* + * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, + * which have elements charged to this memcg. + */ +struct memcg_shrinker_map { + struct rcu_head rcu; + unsigned long map[0]; +}; + /* * per-zone information in memory controller. */ @@ -124,6 +133,9 @@ struct mem_cgroup_per_node { struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; +#ifdef CONFIG_MEMCG_KMEM + struct memcg_shrinker_map __rcu *shrinker_map; +#endif struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ @@ -1262,6 +1274,8 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } +extern int memcg_expand_shrinker_maps(int new_id); + #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 313355dddf66..827c9e87ca08 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -320,6 +320,119 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); struct workqueue_struct *memcg_kmem_cache_wq; +static int memcg_shrinker_map_size; +static DEFINE_MUTEX(memcg_shrinker_map_mutex); + +static void memcg_free_shrinker_map_rcu(struct rcu_head *head) +{ + kvfree(container_of(head, struct memcg_shrinker_map, rcu)); +} + +static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, + int size, int old_size) +{ + struct memcg_shrinker_map *new, *old; + int nid; + + lockdep_assert_held(&memcg_shrinker_map_mutex); + + for_each_node(nid) { + old = rcu_dereference_protected( + mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true); + /* Not yet online memcg */ + if (!old) + return 0; + + new = kvmalloc(sizeof(*new) + size, GFP_KERNEL); + if (!new) + return -ENOMEM; + + /* Set all old bits, clear all new bits */ + memset(new->map, (int)0xff, old_size); + memset((void *)new->map + old_size, 0, size - old_size); + + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new); + call_rcu(&old->rcu, 
memcg_free_shrinker_map_rcu); + } + + return 0; +} + +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *pn; + struct memcg_shrinker_map *map; + int nid; + + if (mem_cgroup_is_root(memcg)) + return; + + for_each_node(nid) { + pn = mem_cgroup_nodeinfo(memcg, nid); + map = rcu_dereference_protected(pn->shrinker_map, true); + if (map) + kvfree(map); + rcu_assign_pointer(pn->shrinker_map, NULL); + } +} + +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) +{ + struct memcg_shrinker_map *map; + int nid, size, ret = 0; + + if (mem_cgroup_is_root(memcg)) + return 0; + + mutex_lock(&memcg_shrinker_map_mutex); + size = memcg_shrinker_map_size; + for_each_node(nid) { + map = kvzalloc(sizeof(*map) + size, GFP_KERNEL); + if (!map) { + memcg_free_shrinker_maps(memcg); + ret = -ENOMEM; + break; + } + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); + } + mutex_unlock(&memcg_shrinker_map_mutex); + + return ret; +} + +int memcg_expand_shrinker_maps(int new_id) +{ + int size, old_size, ret = 0; + struct mem_cgroup *memcg; + + size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); + old_size = memcg_shrinker_map_size; + if (size <= old_size) + return 0; + + mutex_lock(&memcg_shrinker_map_mutex); + if (!root_mem_cgroup) + goto unlock; + + for_each_mem_cgroup(memcg) { + if (mem_cgroup_is_root(memcg)) + continue; + ret = memcg_expand_one_shrinker_map(memcg, size, old_size); + if (ret) + goto unlock; + } +unlock: + if (!ret) + memcg_shrinker_map_size = size; + mutex_unlock(&memcg_shrinker_map_mutex); + return ret; +} +#else /* CONFIG_MEMCG_KMEM */ +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) +{ + return 0; +} +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { } #endif /* CONFIG_MEMCG_KMEM */ /** @@ -4356,6 +4469,16 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + /* + * A memcg must be visible 
for memcg_expand_shrinker_maps() + * by the time the maps are allocated. So, we allocate maps + * here, when for_each_mem_cgroup() can't skip it. + */ + if (memcg_alloc_shrinker_maps(memcg)) { + mem_cgroup_id_remove(memcg); + return -ENOMEM; + } + /* Online state pins memcg ID, memcg ID pins CSS */ atomic_set(&memcg->id.ref, 1); css_get(css); @@ -4408,6 +4531,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); + memcg_free_shrinker_maps(memcg); memcg_free_kmem(memcg); mem_cgroup_free(memcg); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 5cb4f779ea4a..db0970ba340d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -183,8 +183,14 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) if (id < 0) goto unlock; - if (id >= shrinker_nr_max) + if (id >= shrinker_nr_max) { + if (memcg_expand_shrinker_maps(id)) { + idr_remove(&shrinker_idr, id); + goto unlock; + } + shrinker_nr_max = id + 1; + } shrinker->id = id; ret = 0; unlock: From 39887653aab4cffb0074d0d3c4f392e61b67d22b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:41 -0700 Subject: [PATCH 073/111] mm/workingset.c: refactor workingset_init() Use prealloc_shrinker()/register_shrinker_prepared() instead of register_shrinker(). This will be used in next patch. 
[ktkhai@virtuozzo.com: v9] Link: http://lkml.kernel.org/r/153112550112.4097.16606173020912323761.stgit@localhost.localdomain Link: http://lkml.kernel.org/r/153063057666.1818.17625951186610808734.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/workingset.c b/mm/workingset.c index 529480c21f93..4e0b2523aae2 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -523,15 +523,16 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key); + ret = prealloc_shrinker(&workingset_shadow_shrinker); if (ret) goto err; - ret = register_shrinker(&workingset_shadow_shrinker); + ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key); if (ret) goto err_list_lru; + register_shrinker_prepared(&workingset_shadow_shrinker); return 0; err_list_lru: - list_lru_destroy(&shadow_nodes); + free_prealloced_shrinker(&workingset_shadow_shrinker); err: return ret; } From 2b3648a6ff83bd2a59b427d3537cc570933659b5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:45 -0700 Subject: [PATCH 074/111] fs/super.c: refactor alloc_super() Do the two list_lru_init_memcg() calls after prealloc_shrinker(). destroy_unused_super() in the fail path is OK with this. The next patch needs this order.
Link: http://lkml.kernel.org/r/153063058712.1818.3382490999719078571.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/super.c b/fs/super.c index 50728d9c1a05..78227c4ddb21 100644 --- a/fs/super.c +++ b/fs/super.c @@ -244,10 +244,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); - if (list_lru_init_memcg(&s->s_dentry_lru)) - goto fail; - if (list_lru_init_memcg(&s->s_inode_lru)) - goto fail; s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); @@ -265,6 +261,10 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; if (prealloc_shrinker(&s->s_shrink)) goto fail; + if (list_lru_init_memcg(&s->s_dentry_lru)) + goto fail; + if (list_lru_init_memcg(&s->s_inode_lru)) + goto fail; return s; fail: From c92e8e10cafeaaedc84f23fed1bfcf9cf07399c2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:50 -0700 Subject: [PATCH 075/111] fs: propagate shrinker::id to list_lru Add list_lru::shrinker_id field and populate it by registered shrinker id. This will be used to set correct bit in memcg shrinkers map by lru code in next patches, after there appeared the first related to memcg element in list_lru. 
Link: http://lkml.kernel.org/r/153063059758.1818.14866596416857717800.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 4 ++-- include/linux/list_lru.h | 12 ++++++++---- mm/list_lru.c | 11 ++++++++++- mm/workingset.c | 3 ++- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/fs/super.c b/fs/super.c index 78227c4ddb21..f5f96e52e0cd 100644 --- a/fs/super.c +++ b/fs/super.c @@ -261,9 +261,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; if (prealloc_shrinker(&s->s_shrink)) goto fail; - if (list_lru_init_memcg(&s->s_dentry_lru)) + if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink)) goto fail; - if (list_lru_init_memcg(&s->s_inode_lru)) + if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink)) goto fail; return s; diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 2d23b5b745be..9e75bb33766b 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -53,16 +53,20 @@ struct list_lru { struct list_lru_node *node; #ifdef CONFIG_MEMCG_KMEM struct list_head list; + int shrinker_id; #endif }; void list_lru_destroy(struct list_lru *lru); int __list_lru_init(struct list_lru *lru, bool memcg_aware, - struct lock_class_key *key); + struct lock_class_key *key, struct shrinker *shrinker); -#define list_lru_init(lru) __list_lru_init((lru), false, NULL) -#define list_lru_init_key(lru, key) __list_lru_init((lru), false, (key)) 
-#define list_lru_init_memcg(lru) __list_lru_init((lru), true, NULL) +#define list_lru_init(lru) \ + __list_lru_init((lru), false, NULL, NULL) +#define list_lru_init_key(lru, key) \ + __list_lru_init((lru), false, (key), NULL) +#define list_lru_init_memcg(lru, shrinker) \ + __list_lru_init((lru), true, NULL, shrinker) int memcg_update_all_list_lrus(int num_memcgs); void memcg_drain_all_list_lrus(int src_idx, int dst_idx); diff --git a/mm/list_lru.c b/mm/list_lru.c index c5217d84c6e1..5aebbb9b2f5b 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -546,12 +546,18 @@ static void memcg_destroy_list_lru(struct list_lru *lru) #endif /* CONFIG_MEMCG_KMEM */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, - struct lock_class_key *key) + struct lock_class_key *key, struct shrinker *shrinker) { int i; size_t size = sizeof(*lru->node) * nr_node_ids; int err = -ENOMEM; +#ifdef CONFIG_MEMCG_KMEM + if (shrinker) + lru->shrinker_id = shrinker->id; + else + lru->shrinker_id = -1; +#endif memcg_get_cache_ids(); lru->node = kzalloc(size, GFP_KERNEL); @@ -594,6 +600,9 @@ void list_lru_destroy(struct list_lru *lru) kfree(lru->node); lru->node = NULL; +#ifdef CONFIG_MEMCG_KMEM + lru->shrinker_id = -1; +#endif memcg_put_cache_ids(); } EXPORT_SYMBOL_GPL(list_lru_destroy); diff --git a/mm/workingset.c b/mm/workingset.c index 4e0b2523aae2..cd0b2ae615e4 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -526,7 +526,8 @@ static int __init workingset_init(void) ret = prealloc_shrinker(&workingset_shadow_shrinker); if (ret) goto err; - ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key); + ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, + &workingset_shadow_shrinker); if (ret) goto err_list_lru; register_shrinker_prepared(&workingset_shadow_shrinker); From 44bd4a4759d5a714767aa6be7e806ab54b7fa3a8 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:54 -0700 Subject: [PATCH 076/111] mm/list_lru.c: add memcg argument to 
list_lru_from_kmem() This is just refactoring to allow the next patches to have memcg pointer in list_lru_from_kmem(). Link: http://lkml.kernel.org/r/153063060664.1818.9541345386733498582.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 5aebbb9b2f5b..1fc5be746e69 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -65,18 +65,24 @@ static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) } static inline struct list_lru_one * -list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr, + struct mem_cgroup **memcg_ptr) { - struct mem_cgroup *memcg; + struct list_lru_one *l = &nlru->lru; + struct mem_cgroup *memcg = NULL; if (!nlru->memcg_lrus) - return &nlru->lru; + goto out; memcg = mem_cgroup_from_kmem(ptr); if (!memcg) - return &nlru->lru; + goto out; - return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); + l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); +out: + if (memcg_ptr) + *memcg_ptr = memcg; + return l; } #else static void list_lru_register(struct list_lru *lru) @@ -99,8 +105,11 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) } static inline struct list_lru_one * -list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr, + struct mem_cgroup **memcg_ptr) { + 
if (memcg_ptr) + *memcg_ptr = NULL; return &nlru->lru; } #endif /* CONFIG_MEMCG_KMEM */ @@ -113,7 +122,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) spin_lock(&nlru->lock); if (list_empty(item)) { - l = list_lru_from_kmem(nlru, item); + l = list_lru_from_kmem(nlru, item, NULL); list_add_tail(item, &l->list); l->nr_items++; nlru->nr_items++; @@ -133,7 +142,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) spin_lock(&nlru->lock); if (!list_empty(item)) { - l = list_lru_from_kmem(nlru, item); + l = list_lru_from_kmem(nlru, item, NULL); list_del_init(item); l->nr_items--; nlru->nr_items--; From 9bec5c35bfa3d41b046594b5890f772ed737f1fd Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:58 -0700 Subject: [PATCH 077/111] mm/list_lru: pass dst_memcg argument to memcg_drain_list_lru_node() This is just refactoring to allow the next patches to have dst_memcg pointer in memcg_drain_list_lru_node(). Link: http://lkml.kernel.org/r/153063062118.1818.2761273817739499749.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_lru.h | 2 +- mm/list_lru.c | 11 ++++++----- mm/memcontrol.c | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 9e75bb33766b..d9c16f2f2f00 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -69,7 +69,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, __list_lru_init((lru), true, 
NULL, shrinker) int memcg_update_all_list_lrus(int num_memcgs); -void memcg_drain_all_list_lrus(int src_idx, int dst_idx); +void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg); /** * list_lru_add: add an element to the lru list's tail diff --git a/mm/list_lru.c b/mm/list_lru.c index 1fc5be746e69..5384cda08984 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -502,8 +502,9 @@ fail: } static void memcg_drain_list_lru_node(struct list_lru_node *nlru, - int src_idx, int dst_idx) + int src_idx, struct mem_cgroup *dst_memcg) { + int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *src, *dst; /* @@ -523,7 +524,7 @@ static void memcg_drain_list_lru_node(struct list_lru_node *nlru, } static void memcg_drain_list_lru(struct list_lru *lru, - int src_idx, int dst_idx) + int src_idx, struct mem_cgroup *dst_memcg) { int i; @@ -531,16 +532,16 @@ static void memcg_drain_list_lru(struct list_lru *lru, return; for_each_node(i) - memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); + memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_memcg); } -void memcg_drain_all_list_lrus(int src_idx, int dst_idx) +void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg) { struct list_lru *lru; mutex_lock(&list_lrus_mutex); list_for_each_entry(lru, &list_lrus, list) - memcg_drain_list_lru(lru, src_idx, dst_idx); + memcg_drain_list_lru(lru, src_idx, dst_memcg); mutex_unlock(&list_lrus_mutex); } #else diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 827c9e87ca08..a35dc901424d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3060,7 +3060,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) } rcu_read_unlock(); - memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); + memcg_drain_all_list_lrus(kmemcg_id, parent); memcg_free_cache_id(kmemcg_id); } From 3b82c4dcc2f0f98f2aca3b9dc9b88721e962eec9 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:48:01 -0700 Subject: [PATCH 078/111] mm/list_lru.c: pass lru 
argument to memcg_drain_list_lru_node() This is just refactoring to allow next patches to have lru pointer in memcg_drain_list_lru_node(). Link: http://lkml.kernel.org/r/153063063164.1818.55009531386089350.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 5384cda08984..c6131925ec76 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -501,9 +501,10 @@ fail: goto out; } -static void memcg_drain_list_lru_node(struct list_lru_node *nlru, +static void memcg_drain_list_lru_node(struct list_lru *lru, int nid, int src_idx, struct mem_cgroup *dst_memcg) { + struct list_lru_node *nlru = &lru->node[nid]; int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *src, *dst; @@ -532,7 +533,7 @@ static void memcg_drain_list_lru(struct list_lru *lru, return; for_each_node(i) - memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_memcg); + memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg); } void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg) From dfd2f10ccfd7e6bd2a096eaf42e76a7229776322 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:48:06 -0700 Subject: [PATCH 079/111] mm/memcontrol.c: export mem_cgroup_is_root() This will be used in next patch. 
Link: http://lkml.kernel.org/r/153063064347.1818.1987011484100392706.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++++++++++ mm/memcontrol.c | 5 ----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2cccbb9e1b3e..258c8a46959a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -318,6 +318,11 @@ struct mem_cgroup { extern struct mem_cgroup *root_mem_cgroup; +static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) +{ + return (memcg == root_mem_cgroup); +} + static inline bool mem_cgroup_disabled(void) { return !cgroup_subsys_enabled(memory_cgrp_subsys); @@ -784,6 +789,11 @@ void mem_cgroup_split_huge_fixup(struct page *head); struct mem_cgroup; +static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) +{ + return true; +} + static inline bool mem_cgroup_disabled(void) { return true; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a35dc901424d..d8cd6c39eca5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -261,11 +261,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; } -static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) -{ - return (memcg == root_mem_cgroup); -} - #ifdef CONFIG_MEMCG_KMEM /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. 
From fae91d6d8be5e20c47e459dbeb3d43bd5f9486f4 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:48:10 -0700 Subject: [PATCH 080/111] mm/list_lru.c: set bit in memcg shrinker bitmap on first list_lru item appearance Introduce set_shrinker_bit() function to set shrinker-related bit in memcg shrinker bitmap, and set the bit after the first item is added and in case of reparenting destroyed memcg's items. This will allow next patch to make shrinkers be called only, in case of they have charged objects at the moment, and to improve shrink_slab() performance. [ktkhai@virtuozzo.com: v9] Link: http://lkml.kernel.org/r/153112557572.4097.17315791419810749985.stgit@localhost.localdomain Link: http://lkml.kernel.org/r/153063065671.1818.15914674956134687268.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 4 ++++ mm/list_lru.c | 22 ++++++++++++++++++++-- mm/memcontrol.c | 13 +++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 258c8a46959a..0e6c515fb698 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1286,6 +1286,8 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) extern int memcg_expand_shrinker_maps(int new_id); +extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id); #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) @@ -1308,6 +1310,8 @@ static inline void 
memcg_put_cache_ids(void) { } +static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id) { } #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/list_lru.c b/mm/list_lru.c index c6131925ec76..c9bdde9c03d1 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -30,6 +30,11 @@ static void list_lru_unregister(struct list_lru *lru) mutex_unlock(&list_lrus_mutex); } +static int lru_shrinker_id(struct list_lru *lru) +{ + return lru->shrinker_id; +} + static inline bool list_lru_memcg_aware(struct list_lru *lru) { /* @@ -93,6 +98,11 @@ static void list_lru_unregister(struct list_lru *lru) { } +static int lru_shrinker_id(struct list_lru *lru) +{ + return -1; +} + static inline bool list_lru_memcg_aware(struct list_lru *lru) { return false; @@ -118,13 +128,17 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; + struct mem_cgroup *memcg; struct list_lru_one *l; spin_lock(&nlru->lock); if (list_empty(item)) { - l = list_lru_from_kmem(nlru, item, NULL); + l = list_lru_from_kmem(nlru, item, &memcg); list_add_tail(item, &l->list); - l->nr_items++; + /* Set shrinker bit if the first element was added */ + if (!l->nr_items++) + memcg_set_shrinker_bit(memcg, nid, + lru_shrinker_id(lru)); nlru->nr_items++; spin_unlock(&nlru->lock); return true; @@ -507,6 +521,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid, struct list_lru_node *nlru = &lru->node[nid]; int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *src, *dst; + bool set; /* * Since list_lru_{add,del} may be called under an IRQ-safe lock, @@ -518,7 +533,10 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid, dst = list_lru_from_memcg_idx(nlru, dst_idx); list_splice_init(&src->list, &dst->list); + set = (!dst->nr_items && src->nr_items); dst->nr_items += src->nr_items; + if (set) + memcg_set_shrinker_bit(dst_memcg, 
nid, lru_shrinker_id(lru)); src->nr_items = 0; spin_unlock_irq(&nlru->lock); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d8cd6c39eca5..55c010a58535 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -422,6 +422,19 @@ unlock: mutex_unlock(&memcg_shrinker_map_mutex); return ret; } + +void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) +{ + if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { + struct memcg_shrinker_map *map; + + rcu_read_lock(); + map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); + set_bit(shrinker_id, map->map); + rcu_read_unlock(); + } +} + #else /* CONFIG_MEMCG_KMEM */ static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) { From b0dedc49a2daa0f44ddc51fbf686b2ef012fccbf Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:48:14 -0700 Subject: [PATCH 081/111] mm/vmscan.c: iterate only over charged shrinkers during memcg shrink_slab() Using the preparations made in previous patches, in case of memcg shrink, we may avoid shrinkers, which are not set in memcg's shrinkers bitmap. To do that, we separate iterations over memcg-aware and !memcg-aware shrinkers, and memcg-aware shrinkers are chosen via for_each_set_bit() from the bitmap. In case of big nodes, having many isolated environments, this gives significant performance growth. See next patches for the details. Note that the patch does not respect to empty memcg shrinkers, since we never clear the bitmap bits after we set it once. Their shrinkers will be called again, with no shrinked objects as result. This functionality is provided by next patches. 
[ktkhai@virtuozzo.com: v9] Link: http://lkml.kernel.org/r/153112558507.4097.12713813335683345488.stgit@localhost.localdomain Link: http://lkml.kernel.org/r/153063066653.1818.976035462801487910.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 75 insertions(+), 9 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index db0970ba340d..d7a5b8566869 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -364,6 +364,21 @@ int prealloc_shrinker(struct shrinker *shrinker) if (!shrinker->nr_deferred) return -ENOMEM; + /* + * There is a window between prealloc_shrinker() + * and register_shrinker_prepared(). We don't want + * to clear bit of a shrinker in such the state + * in shrink_slab_memcg(), since this will impose + * restrictions on a code registering a shrinker + * (they would have to guarantee, their LRU lists + * are empty till shrinker is completely registered). + * So, we differ the situation, when 1)a shrinker + * is semi-registered (id is assigned, but it has + * not yet linked to shrinker_list) and 2)shrinker + * is not registered (id is not assigned). 
+ */ + INIT_LIST_HEAD(&shrinker->list); + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { if (prealloc_memcg_shrinker(shrinker)) goto free_deferred; @@ -543,6 +558,63 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, return freed; } +#ifdef CONFIG_MEMCG_KMEM +static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, int priority) +{ + struct memcg_shrinker_map *map; + unsigned long freed = 0; + int ret, i; + + if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)) + return 0; + + if (!down_read_trylock(&shrinker_rwsem)) + return 0; + + map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map, + true); + if (unlikely(!map)) + goto unlock; + + for_each_set_bit(i, map->map, shrinker_nr_max) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; + struct shrinker *shrinker; + + shrinker = idr_find(&shrinker_idr, i); + if (unlikely(!shrinker)) { + clear_bit(i, map->map); + continue; + } + + /* See comment in prealloc_shrinker() */ + if (unlikely(list_empty(&shrinker->list))) + continue; + + ret = do_shrink_slab(&sc, shrinker, priority); + freed += ret; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? 
: 1; + break; + } + } +unlock: + up_read(&shrinker_rwsem); + return freed; +} +#else /* CONFIG_MEMCG_KMEM */ +static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, int priority) +{ + return 0; +} +#endif /* CONFIG_MEMCG_KMEM */ + /** * shrink_slab - shrink slab caches * @gfp_mask: allocation context @@ -572,8 +644,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct shrinker *shrinker; unsigned long freed = 0; - if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) - return 0; + if (memcg && !mem_cgroup_is_root(memcg)) + return shrink_slab_memcg(gfp_mask, nid, memcg, priority); if (!down_read_trylock(&shrinker_rwsem)) goto out; @@ -585,13 +657,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, .memcg = memcg, }; - /* - * If kernel memory accounting is disabled, we ignore - * SHRINKER_MEMCG_AWARE flag and call all shrinkers - * passing NULL for memcg. - */ - if (memcg_kmem_enabled() && - !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE)) + if (!!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE)) continue; if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) From aeed1d325d429ac9699c4bf62d17156d60905519 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 17 Aug 2018 15:48:17 -0700 Subject: [PATCH 082/111] mm/vmscan.c: generalize shrink_slab() calls in shrink_node() The patch makes shrink_slab() be called for root_mem_cgroup in the same way as it's called for the rest of cgroups. This simplifies the logic and improves the readability. 
[ktkhai@virtuozzo.com: wrote changelog] Link: http://lkml.kernel.org/r/153063068338.1818.11496084754797453962.stgit@localhost.localdomain Signed-off-by: Vladimir Davydov Signed-off-by: Kirill Tkhai Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d7a5b8566869..2aa3cb760189 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -627,10 +627,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, * unaware shrinkers will receive a node id of 0 instead. * - * @memcg specifies the memory cgroup to target. If it is not NULL, - * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan - * objects from the memory cgroup specified. Otherwise, only unaware - * shrinkers are called. + * @memcg specifies the memory cgroup to target. Unaware shrinkers + * are called only if it is the root cgroup. * * @priority is sc->priority, we take the number of objects and >> by priority * in order to get the scan target. 
@@ -644,7 +642,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct shrinker *shrinker; unsigned long freed = 0; - if (memcg && !mem_cgroup_is_root(memcg)) + if (!mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); if (!down_read_trylock(&shrinker_rwsem)) @@ -657,9 +655,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, .memcg = memcg, }; - if (!!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE)) - continue; - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) sc.nid = 0; @@ -689,6 +684,7 @@ void drop_slab_node(int nid) struct mem_cgroup *memcg = NULL; freed = 0; + memcg = mem_cgroup_iter(NULL, NULL, NULL); do { freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); @@ -2708,9 +2704,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) shrink_node_memcg(pgdat, memcg, sc, &lru_pages); node_lru_pages += lru_pages; - if (memcg) - shrink_slab(sc->gfp_mask, pgdat->node_id, - memcg, sc->priority); + shrink_slab(sc->gfp_mask, pgdat->node_id, + memcg, sc->priority); /* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, @@ -2734,10 +2729,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); - if (global_reclaim(sc)) - shrink_slab(sc->gfp_mask, pgdat->node_id, NULL, - sc->priority); - if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; From 9b996468cfdba09f688f52dba4287de596194613 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:48:21 -0700 Subject: [PATCH 083/111] mm: add SHRINK_EMPTY shrinker methods return value We need to distinguish the situations when shrinker has very small amount of objects (see vfs_pressure_ratio() called from super_cache_count()), and when it has no objects at all. 
Currently, in the both of these cases, shrinker::count_objects() returns 0. The patch introduces new SHRINK_EMPTY return value, which will be used for "no objects at all" case. It's is a refactoring mostly, as SHRINK_EMPTY is replaced by 0 by all callers of do_shrink_slab() in this patch, and all the magic will happen in further. Link: http://lkml.kernel.org/r/153063069574.1818.11037751256699341813.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 3 +++ include/linux/shrinker.h | 7 +++++-- mm/vmscan.c | 12 +++++++++--- mm/workingset.c | 3 +++ 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/fs/super.c b/fs/super.c index f5f96e52e0cd..7429588d6b49 100644 --- a/fs/super.c +++ b/fs/super.c @@ -144,6 +144,9 @@ static unsigned long super_cache_count(struct shrinker *shrink, total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); + if (!total_objects) + return SHRINK_EMPTY; + total_objects = vfs_pressure_ratio(total_objects); return total_objects; } diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 7ca9c18cf130..b154fd2b084c 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -34,12 +34,15 @@ struct shrink_control { }; #define SHRINK_STOP (~0UL) +#define SHRINK_EMPTY (~0UL - 1) /* * A callback you can register to apply pressure to ageable caches. * * @count_objects should return the number of freeable items in the cache. 
If - * there are no objects to free or the number of freeable items cannot be - * determined, it should return 0. No deadlock checks should be done during the + * there are no objects to free, it should return SHRINK_EMPTY, while 0 is + * returned in cases of the number of freeable items cannot be determined + * or shrinker should skip this cache for this time (e.g., their number + * is below shrinkable limit). No deadlock checks should be done during the * count callback - the shrinker relies on aggregating scan counts that couldn't * be executed due to potential deadlocks to be run at a later call when the * deadlock condition is no longer pending. diff --git a/mm/vmscan.c b/mm/vmscan.c index 2aa3cb760189..8199e1b9a204 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -456,8 +456,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, long scanned = 0, next_deferred; freeable = shrinker->count_objects(shrinker, shrinkctl); - if (freeable == 0) - return 0; + if (freeable == 0 || freeable == SHRINK_EMPTY) + return freeable; /* * copy the current shrinker scan count into a local variable @@ -596,6 +596,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, continue; ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; freed += ret; if (rwsem_is_contended(&shrinker_rwsem)) { @@ -641,6 +643,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, { struct shrinker *shrinker; unsigned long freed = 0; + int ret; if (!mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); @@ -658,7 +661,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) sc.nid = 0; - freed += do_shrink_slab(&sc, shrinker, priority); + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + freed += ret; /* * Bail out if someone want to register a new shrinker to * prevent the regsitration from being stalled for long 
periods diff --git a/mm/workingset.c b/mm/workingset.c index cd0b2ae615e4..bc72ad029b3e 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -399,6 +399,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, } max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); + if (!nodes) + return SHRINK_EMPTY; + if (nodes <= max_nodes) return 0; return nodes - max_nodes; From f90280d6b7963fa8925258ed66b4f567fe73dfea Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:48:25 -0700 Subject: [PATCH 084/111] mm/vmscan.c: clear shrinker bit if there are no objects related to memcg To avoid further unneed calls of do_shrink_slab() for shrinkers, which already do not have any charged objects in a memcg, their bits have to be cleared. This patch introduces a lockless mechanism to do that without races without parallel list lru add. After do_shrink_slab() returns SHRINK_EMPTY the first time, we clear the bit and call it once again. Then we restore the bit, if the new return value is different. Note, that single smp_mb__after_atomic() in shrink_slab_memcg() covers two situations: 1)list_lru_add() shrink_slab_memcg list_add_tail() for_each_set_bit() <--- read bit do_shrink_slab() <--- missed list update (no barrier) set_bit() do_shrink_slab() <--- seen list update This situation, when the first do_shrink_slab() sees set bit, but it doesn't see list update (i.e., race with the first element queueing), is rare. So we don't add before the first call of do_shrink_slab() instead of this to do not slow down generic case. Also, it's need the second call as seen in below in (2). 2)list_lru_add() shrink_slab_memcg() list_add_tail() ... set_bit() ... ... for_each_set_bit() do_shrink_slab() do_shrink_slab() clear_bit() ... ... ... list_lru_add() ... list_add_tail() clear_bit() set_bit() do_shrink_slab() The barriers guarantee that the second do_shrink_slab() in the right side task sees list update if really cleared the bit. This case is drawn in the code comment. 
[Results/performance of the patchset] After the whole patchset is applied, the test below shows a significant increase in performance: $echo 1 > /sys/fs/cgroup/memory/memory.use_hierarchy $mkdir /sys/fs/cgroup/memory/ct $echo 4000M > /sys/fs/cgroup/memory/ct/memory.kmem.limit_in_bytes $for i in `seq 0 4000`; do mkdir /sys/fs/cgroup/memory/ct/$i; echo $$ > /sys/fs/cgroup/memory/ct/$i/cgroup.procs; mkdir -p s/$i; mount -t tmpfs $i s/$i; touch s/$i/file; done Then, 5 sequential calls of drop caches: $time echo 3 > /proc/sys/vm/drop_caches 1)Before: 0.00user 13.78system 0:13.78elapsed 99%CPU 0.00user 5.59system 0:05.60elapsed 99%CPU 0.00user 5.48system 0:05.48elapsed 99%CPU 0.00user 8.35system 0:08.35elapsed 99%CPU 0.00user 8.34system 0:08.35elapsed 99%CPU 2)After 0.00user 1.10system 0:01.10elapsed 99%CPU 0.00user 0.00system 0:00.01elapsed 64%CPU 0.00user 0.01system 0:00.01elapsed 82%CPU 0.00user 0.00system 0:00.01elapsed 64%CPU 0.00user 0.01system 0:00.01elapsed 82%CPU The results show that performance increases by at least 548 times. Shakeel Butt tested this patchset with fork-bomb on his configuration: > I created 255 memcgs, 255 ext4 mounts and made each memcg create a > file containing few KiBs on corresponding mount. Then in a separate > memcg of 200 MiB limit ran a fork-bomb.
> > I ran the "perf record -ag -- sleep 60" and below are the results: > > Without the patch series: > Samples: 4M of event 'cycles', Event count (approx.): 3279403076005 > + 36.40% fb.sh [kernel.kallsyms] [k] shrink_slab > + 18.97% fb.sh [kernel.kallsyms] [k] list_lru_count_one > + 6.75% fb.sh [kernel.kallsyms] [k] super_cache_count > + 0.49% fb.sh [kernel.kallsyms] [k] down_read_trylock > + 0.44% fb.sh [kernel.kallsyms] [k] mem_cgroup_iter > + 0.27% fb.sh [kernel.kallsyms] [k] up_read > + 0.21% fb.sh [kernel.kallsyms] [k] osq_lock > + 0.13% fb.sh [kernel.kallsyms] [k] shmem_unused_huge_count > + 0.08% fb.sh [kernel.kallsyms] [k] shrink_node_memcg > + 0.08% fb.sh [kernel.kallsyms] [k] shrink_node > > With the patch series: > Samples: 4M of event 'cycles', Event count (approx.): 2756866824946 > + 47.49% fb.sh [kernel.kallsyms] [k] down_read_trylock > + 30.72% fb.sh [kernel.kallsyms] [k] up_read > + 9.51% fb.sh [kernel.kallsyms] [k] mem_cgroup_iter > + 1.69% fb.sh [kernel.kallsyms] [k] shrink_node_memcg > + 1.35% fb.sh [kernel.kallsyms] [k] mem_cgroup_protected > + 1.05% fb.sh [kernel.kallsyms] [k] queued_spin_lock_slowpath > + 0.85% fb.sh [kernel.kallsyms] [k] _raw_spin_lock > + 0.78% fb.sh [kernel.kallsyms] [k] lruvec_lru_size > + 0.57% fb.sh [kernel.kallsyms] [k] shrink_node > + 0.54% fb.sh [kernel.kallsyms] [k] queue_work_on > + 0.46% fb.sh [kernel.kallsyms] [k] shrink_slab_memcg [ktkhai@virtuozzo.com: v9] Link: http://lkml.kernel.org/r/153112561772.4097.11011071937553113003.stgit@localhost.localdomain Link: http://lkml.kernel.org/r/153063070859.1818.11870882950920963480.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin 
Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ mm/vmscan.c | 26 ++++++++++++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 55c010a58535..6a921890739f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -430,6 +430,8 @@ void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) rcu_read_lock(); map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); + /* Pairs with smp mb in shrink_slab() */ + smp_mb__before_atomic(); set_bit(shrinker_id, map->map); rcu_read_unlock(); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 8199e1b9a204..93fdd0375b64 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -596,8 +596,30 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, continue; ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) - ret = 0; + if (ret == SHRINK_EMPTY) { + clear_bit(i, map->map); + /* + * After the shrinker reported that it had no objects to + * free, but before we cleared the corresponding bit in + * the memcg shrinker map, a new object might have been + * added. To make sure, we have the bit set in this + * case, we invoke the shrinker one more time and reset + * the bit if it reports that it is not empty anymore. 
+ * The memory barrier here pairs with the barrier in + * memcg_set_shrinker_bit(): + * + * list_lru_add() shrink_slab_memcg() + * list_add_tail() clear_bit() + * + * set_bit() do_shrink_slab() + */ + smp_mb__after_atomic(); + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + else + memcg_set_shrinker_bit(memcg, nid, i); + } freed += ret; if (rwsem_is_contended(&shrinker_rwsem)) { From ac7fb3ad2717c56cee70456ffe9d808230db528e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:48:30 -0700 Subject: [PATCH 085/111] mm/vmscan.c: move check for SHRINKER_NUMA_AWARE to do_shrink_slab() In case of shrink_slab_memcg() we do not zero nid, when shrinker is not numa-aware. This is not a real problem, since currently all memcg-aware shrinkers are numa-aware too (we have two: super_block shrinker and workingset shrinker), but something may change in the future. Link: http://lkml.kernel.org/r/153320759911.18959.8842396230157677671.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Andrew Morton Cc: Vladimir Davydov Cc: Michal Hocko Cc: Andrey Ryabinin Cc: "Huang, Ying" Cc: Tetsuo Handa Cc: Matthew Wilcox Cc: Shakeel Butt Cc: Josef Bacik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 93fdd0375b64..8fcc86f1d7bc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -455,6 +455,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, : SHRINK_BATCH; long scanned = 0, next_deferred; + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0 || freeable == SHRINK_EMPTY) return freeable; @@ -680,9 +683,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, .memcg = memcg, }; - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - sc.nid = 0; - ret = do_shrink_slab(&sc, shrinker, priority); if (ret 
== SHRINK_EMPTY) ret = 0; From 7e010df53c80197b23119e7d7b95892aa13629df Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:48:34 -0700 Subject: [PATCH 086/111] mm: use special value SHRINKER_REGISTERING instead of list_empty() check The patch introduces a special value SHRINKER_REGISTERING to use instead of list_empty() to distinguish a registering shrinker from an unregistered shrinker. Why do we need that at all? Shrinker registration is split in two parts. The first one is prealloc_shrinker(), which allocates shrinker memory and reserves ID in shrinker_idr. This function can fail. The second is register_shrinker_prepared(), and it finalizes the registration. This function actually makes shrinker available to be used from shrink_slab(), and it can't fail. One shrinker may be based on more than one LRU list. So, we never clear the bit in memcg shrinker maps, when (one of) corresponding LRU list becomes empty, since other LRU lists may be not empty. See superblock shrinker for example: it is based on two LRU lists: s_inode_lru and s_dentry_lru. We do not want to clear shrinker bit, when there are no inodes in s_inode_lru, as s_dentry_lru may contain dentries. Instead of that, we use special algorithm to detect shrinkers having no elements in any of its LRU lists, and this is made in shrink_slab_memcg(). See the comment in this function for the details. Also, in shrink_slab_memcg() we clear shrinker bit in the map, when we meet unregistered shrinker (bit is set, while there is no shrinker in IDR). Otherwise, we would have done that at the moment of shrinker unregistration for all memcgs (and this looks worse, since iteration over all memcg may take much time). Also this would have imposed restrictions on shrinker unregistration order for its users: they would have had to guarantee, there are no new elements after unregister_shrinker() (otherwise, a new added element would have set a bit).
So, if we meet a set bit in map and no shrinker in IDR when we're iterating over the map in shrink_slab_memcg(), this means the corresponding shrinker is unregistered, and we must clear the bit. Another case is shrinker registration. We want two things there: 1) do_shrink_slab() can be called only for completely registered shrinkers; 2) shrinker internal lists may be populated in any order with register_shrinker_prepared() (let's talk on the example with sb). Both of: a)list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru); [cpu0] memcg_set_shrinker_bit(); [cpu0] ... register_shrinker_prepared(); [cpu1] and b)register_shrinker_prepared(); [cpu0] ... list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru); [cpu1] memcg_set_shrinker_bit(); [cpu1] are legitimate. We don't want to impose restriction here and to force people to use only (b) variant. We don't want to force people to ensure that there are no elements in LRU lists before the shrinker is completely registered. Internal users of LRU lists and shrinker code are two different subsystems, and they have to stay self-contained and independent of each other. In (a) case we have the bit set before shrinker is completely registered. We don't want do_shrink_slab() to be called at this moment, so we have to detect such registering shrinkers. Before this patch list_empty() (shrinker is not linked to the list) check was used for that. So, in (a) there could be a bit set, but we don't call do_shrink_slab() unless shrinker is linked to the list. It's just an indicator, I just overloaded linking to the list. This was not the best solution, since it's better not to touch the shrinker memory from shrink_slab_memcg() before it's completely registered (this also will be useful in the future to make shrink_slab() completely lockless). So, this patch introduces a better way to detect registering shrinker, which allows not to dereference shrinker memory. It's just a ~0UL value, which we insert into the IDR during ID allocation.
After shrinker is ready to be used, we insert actual shrinker pointer in the IDR, and it becomes available to shrink_slab_memcg(). We can't use NULL instead of this new value for this purpose as: shrink_slab_memcg() already uses NULL to detect unregistered shrinkers, and we don't want the function sees NULL and clears the bit, otherwise (a) won't work. This is the only thing the patch makes: the better way to detect registering shrinker. Nothing else this patch makes. Also this gives a better assembler, but it's minor side of the patch: Before: callq mov %rax,%r15 test %rax,%rax je mov 0x20(%rax),%rax lea 0x20(%r15),%rdx cmp %rax,%rdx je mov 0x8(%rsp),%edx mov %r15,%rsi lea 0x10(%rsp),%rdi callq After: callq mov %rax,%r15 lea -0x1(%rax),%rax cmp $0xfffffffffffffffd,%rax ja mov 0x8(%rsp),%edx mov %r15,%rsi lea 0x10(%rsp),%rdi callq ffffffff810cefd0 [ktkhai@virtuozzo.com: add #ifdef CONFIG_MEMCG_KMEM around idr_replace()] Link: http://lkml.kernel.org/r/758b8fec-7573-47eb-b26a-7b2847ae7b8c@virtuozzo.com Link: http://lkml.kernel.org/r/153355467546.11522.4518015068123480218.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Andrew Morton Cc: Vladimir Davydov Cc: Michal Hocko Cc: Andrey Ryabinin Cc: "Huang, Ying" Cc: Tetsuo Handa Cc: Matthew Wilcox Cc: Shakeel Butt Cc: Josef Bacik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8fcc86f1d7bc..4375b1e9bd56 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -170,6 +170,20 @@ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG_KMEM + +/* + * We allow subsystems to populate their shrinker-related + * LRU lists before register_shrinker_prepared() is called + * for the shrinker, since we don't want to impose + * restrictions on their internal registration order. 
+ * In this case shrink_slab_memcg() may find corresponding + * bit is set in the shrinkers map. + * + * This value is used by the function to detect registering + * shrinkers and to skip do_shrink_slab() calls for them. + */ +#define SHRINKER_REGISTERING ((struct shrinker *)~0UL) + static DEFINE_IDR(shrinker_idr); static int shrinker_nr_max; @@ -179,7 +193,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) down_write(&shrinker_rwsem); /* This may call shrinker, so it must use down_read_trylock() */ - id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); + id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -364,21 +378,6 @@ int prealloc_shrinker(struct shrinker *shrinker) if (!shrinker->nr_deferred) return -ENOMEM; - /* - * There is a window between prealloc_shrinker() - * and register_shrinker_prepared(). We don't want - * to clear bit of a shrinker in such the state - * in shrink_slab_memcg(), since this will impose - * restrictions on a code registering a shrinker - * (they would have to guarantee, their LRU lists - * are empty till shrinker is completely registered). - * So, we differ the situation, when 1)a shrinker - * is semi-registered (id is assigned, but it has - * not yet linked to shrinker_list) and 2)shrinker - * is not registered (id is not assigned). 
- */ - INIT_LIST_HEAD(&shrinker->list); - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { if (prealloc_memcg_shrinker(shrinker)) goto free_deferred; @@ -408,6 +407,9 @@ void register_shrinker_prepared(struct shrinker *shrinker) { down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); +#ifdef CONFIG_MEMCG_KMEM + idr_replace(&shrinker_idr, shrinker, shrinker->id); +#endif up_write(&shrinker_rwsem); } @@ -589,15 +591,12 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct shrinker *shrinker; shrinker = idr_find(&shrinker_idr, i); - if (unlikely(!shrinker)) { - clear_bit(i, map->map); + if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) { + if (!shrinker) + clear_bit(i, map->map); continue; } - /* See comment in prealloc_shrinker() */ - if (unlikely(list_empty(&shrinker->list))) - continue; - ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { clear_bit(i, map->map); From f2fc10e0b3fe7d1aecbd2cab6bf0007b6771e16d Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 17 Aug 2018 15:48:38 -0700 Subject: [PATCH 087/111] mm/sparse.c: add a static variable nr_present_sections Patch series "mm/sparse: Optimize memmap allocation during sparse_init()", v6. In sparse_init(), two temporary pointer arrays, usemap_map and map_map are allocated with the size of NR_MEM_SECTIONS. They are used to store each memory section's usemap and mem map if marked as present. In 5-level paging mode, this will cost 512M memory though they will be released at the end of sparse_init(). System with few memory, like kdump kernel which usually only has about 256M, will fail to boot because of allocation failure if CONFIG_X86_5LEVEL=y. In this patchset, optimize the memmap allocation code to only use usemap_map and map_map with the size of nr_present_sections. This makes kdump kernel boot up with normal crashkernel='' setting when CONFIG_X86_5LEVEL=y. 
This patch (of 5): nr_present_sections is used to record how many memory sections are marked as present during system boot up, and will be used in the later patch. Link: http://lkml.kernel.org/r/20180228032657.32385-2-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Dave Hansen Reviewed-by: Andrew Morton Reviewed-by: Pavel Tatashin Reviewed-by: Oscar Salvador Cc: Pasha Tatashin Cc: Kirill A. Shutemov Cc: Pankaj Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/sparse.c b/mm/sparse.c index b1b14a9c4041..99a6383e98bc 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -200,6 +200,12 @@ static inline int next_present_section_nr(int section_nr) (section_nr <= __highest_present_section_nr)); \ section_nr = next_present_section_nr(section_nr)) +/* + * Record how many memory sections are marked as present + * during system bootup. + */ +static int __initdata nr_present_sections; + /* Record a memory area against a node. */ void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -229,6 +235,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) ms->section_mem_map = sparse_encode_early_nid(nid) | SECTION_IS_ONLINE; section_mark_present(ms); + nr_present_sections++; } } } From 07a34a8c36521c37119259d937d1389c3f5f6db9 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 17 Aug 2018 15:48:42 -0700 Subject: [PATCH 088/111] mm/sparsemem.c: defer the ms->section_mem_map clearing In sparse_init(), if CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER=y, system will allocate one continuous memory chunk for mem maps on one node and populate the relevant page tables to map memory section one by one. If fail to populate for a certain mem section, print warning and its ->section_mem_map will be cleared to cancel the marking of being present. Like this, the number of mem sections marked as present could become less during sparse_init() execution. 
Here just defer the ms->section_mem_map clearing if failed to populate its page tables until the last for_each_present_section_nr() loop. This is in preparation for later optimizing the mem map allocation. [akpm@linux-foundation.org: remove now-unused local `ms', per Oscar] Link: http://lkml.kernel.org/r/20180228032657.32385-3-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Dave Hansen Reviewed-by: Pavel Tatashin Reviewed-by: Oscar Salvador Cc: Pasha Tatashin Cc: Kirill A. Shutemov Cc: Pankaj Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse-vmemmap.c | 4 ---- mm/sparse.c | 12 ++++++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index bd0276d5f66b..68bb65b2d34d 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -292,18 +292,14 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, } for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - struct mem_section *ms; - if (!present_section_nr(pnum)) continue; map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL); if (map_map[pnum]) continue; - ms = __nr_to_section(pnum); pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); - ms->section_mem_map = 0; } if (vmemmap_buf_start) { diff --git a/mm/sparse.c b/mm/sparse.c index 99a6383e98bc..eb31274aae8b 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -446,7 +446,6 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, ms = __nr_to_section(pnum); pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); - ms->section_mem_map = 0; } } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ @@ -474,7 +473,6 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); - ms->section_mem_map = 0; return NULL; } #endif @@ -578,17 +576,23 @@ void __init sparse_init(void) #endif 
for_each_present_section_nr(0, pnum) { + struct mem_section *ms; + ms = __nr_to_section(pnum); usemap = usemap_map[pnum]; - if (!usemap) + if (!usemap) { + ms->section_mem_map = 0; continue; + } #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER map = map_map[pnum]; #else map = sparse_early_mem_map_alloc(pnum); #endif - if (!map) + if (!map) { + ms->section_mem_map = 0; continue; + } sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap); From 9258631b33374f20d856032c3542b76ad7f5a312 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 17 Aug 2018 15:48:45 -0700 Subject: [PATCH 089/111] mm/sparse.c: add a new parameter 'data_unit_size' for alloc_usemap_and_memmap It's used to pass the size of map data unit into alloc_usemap_and_memmap, and is preparation for next patch. Link: http://lkml.kernel.org/r/20180228032657.32385-4-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Pavel Tatashin Reviewed-by: Oscar Salvador Cc: Pasha Tatashin Cc: Kirill A. Shutemov Cc: Pankaj Gupta Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index eb31274aae8b..eb188eb6b82d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -484,10 +484,12 @@ void __weak __meminit vmemmap_populate_print_last(void) /** * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap * @map: usemap_map for pageblock flags or mmap_map for vmemmap + * @unit_size: size of map unit */ static void __init alloc_usemap_and_memmap(void (*alloc_func) (void *, unsigned long, unsigned long, - unsigned long, int), void *data) + unsigned long, int), void *data, + int data_unit_size) { unsigned long pnum; unsigned long map_count; @@ -564,7 +566,8 @@ void __init sparse_init(void) if (!usemap_map) panic("can not allocate usemap_map\n"); alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, - (void *)usemap_map); + (void *)usemap_map, + 
sizeof(usemap_map[0])); #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER size2 = sizeof(struct page *) * NR_MEM_SECTIONS; @@ -572,7 +575,8 @@ void __init sparse_init(void) if (!map_map) panic("can not allocate map_map\n"); alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, - (void *)map_map); + (void *)map_map, + sizeof(map_map[0])); #endif for_each_present_section_nr(0, pnum) { From c98aff649349d9147915a19d378c9c3c1bd85de0 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 17 Aug 2018 15:48:49 -0700 Subject: [PATCH 090/111] mm/sparse: optimize memmap allocation during sparse_init() In sparse_init(), two temporary pointer arrays, usemap_map and map_map are allocated with the size of NR_MEM_SECTIONS. They are used to store each memory section's usemap and mem map if marked as present. With the help of these two arrays, a continuous memory chunk is allocated for usemap and memmap for the memory sections on one node. This avoids excessive memory fragmentation. In the diagram below, '1' indicates a present memory section and '0' an absent one. The number 'n' could be much smaller than NR_MEM_SECTIONS on most systems. |1|1|1|1|0|0|0|0|1|1|0|0|...|1|0||1|0|...|1||0|1|...|0| ------------------------------------------------------- 0 1 2 3 4 5 i i+1 n-1 n If we fail to populate the page tables to map one section's memmap, its ->section_mem_map will finally be cleared to indicate that it's not present. After use, these two arrays will be released at the end of sparse_init(). In 4-level paging mode, each array costs 4M, which is negligible. In 5-level paging mode, however, they cost 256M each, 512M altogether. A kdump kernel usually reserves very little memory, e.g. 256M. So, even though they are only temporarily allocated, this is still not acceptable. In fact, there's no need to allocate them with the size of NR_MEM_SECTIONS.
Since the ->section_mem_map clearing has been deferred to the end, the number of present memory sections is kept the same during sparse_init() until we finally clear out the memory section's ->section_mem_map if its usemap or memmap is not correctly handled. Thus, whenever the for_each_present_section_nr() loop is taken in the middle, the i-th present memory section is always the same one. Here only allocate usemap_map and map_map with the size of 'nr_present_sections'. For the i-th present memory section, install its usemap and memmap to usemap_map[i] and map_map[i] during allocation. Then in the last for_each_present_section_nr() loop which clears the failed memory section's ->section_mem_map, fetch usemap and memmap from usemap_map[] and map_map[] array and set them into mem_section[] accordingly. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20180628062857.29658-5-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Pavel Tatashin Cc: Pasha Tatashin Cc: Dave Hansen Cc: Kirill A.
Shutemov Cc: Oscar Salvador Cc: Pankaj Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse-vmemmap.c | 6 ++++-- mm/sparse.c | 46 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 68bb65b2d34d..95e2c7638a5c 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -281,6 +281,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, unsigned long pnum; unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; void *vmemmap_buf_start; + int nr_consumed_maps = 0; size = ALIGN(size, PMD_SIZE); vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count, @@ -295,8 +296,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (!present_section_nr(pnum)) continue; - map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL); - if (map_map[pnum]) + map_map[nr_consumed_maps] = + sparse_mem_map_populate(pnum, nodeid, NULL); + if (map_map[nr_consumed_maps++]) continue; pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); diff --git a/mm/sparse.c b/mm/sparse.c index eb188eb6b82d..2ea8b3dbd0df 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -381,6 +381,7 @@ static void __init sparse_early_usemaps_alloc_node(void *data, unsigned long pnum; unsigned long **usemap_map = (unsigned long **)data; int size = usemap_size(); + int nr_consumed_maps = 0; usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), size * usemap_count); @@ -392,9 +393,10 @@ static void __init sparse_early_usemaps_alloc_node(void *data, for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) continue; - usemap_map[pnum] = usemap; + usemap_map[nr_consumed_maps] = usemap; usemap += size; - check_usemap_section_nr(nodeid, usemap_map[pnum]); + check_usemap_section_nr(nodeid, usemap_map[nr_consumed_maps]); + nr_consumed_maps++; } } @@ -419,29 +421,34 @@ void 
__init sparse_mem_maps_populate_node(struct page **map_map, void *map; unsigned long pnum; unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; + int nr_consumed_maps; size = PAGE_ALIGN(size); map = memblock_virt_alloc_try_nid_raw(size * map_count, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nodeid); if (map) { + nr_consumed_maps = 0; for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) continue; - map_map[pnum] = map; + map_map[nr_consumed_maps] = map; map += size; + nr_consumed_maps++; } return; } /* fallback */ + nr_consumed_maps = 0; for (pnum = pnum_begin; pnum < pnum_end; pnum++) { struct mem_section *ms; if (!present_section_nr(pnum)) continue; - map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL); - if (map_map[pnum]) + map_map[nr_consumed_maps] = + sparse_mem_map_populate(pnum, nodeid, NULL); + if (map_map[nr_consumed_maps++]) continue; ms = __nr_to_section(pnum); pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", @@ -521,6 +528,7 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func) /* new start, update count etc*/ nodeid_begin = nodeid; pnum_begin = pnum; + data += map_count * data_unit_size; map_count = 1; } /* ok, last chunk */ @@ -539,6 +547,7 @@ void __init sparse_init(void) unsigned long *usemap; unsigned long **usemap_map; int size; + int nr_consumed_maps = 0; #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER int size2; struct page **map_map; @@ -561,7 +570,7 @@ void __init sparse_init(void) * powerpc need to call sparse_init_one_section right after each * sparse_early_mem_map_alloc, so allocate usemap_map at first. 
*/ - size = sizeof(unsigned long *) * NR_MEM_SECTIONS; + size = sizeof(unsigned long *) * nr_present_sections; usemap_map = memblock_virt_alloc(size, 0); if (!usemap_map) panic("can not allocate usemap_map\n"); @@ -570,7 +579,7 @@ void __init sparse_init(void) sizeof(usemap_map[0])); #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - size2 = sizeof(struct page *) * NR_MEM_SECTIONS; + size2 = sizeof(struct page *) * nr_present_sections; map_map = memblock_virt_alloc(size2, 0); if (!map_map) panic("can not allocate map_map\n"); @@ -579,27 +588,46 @@ void __init sparse_init(void) sizeof(map_map[0])); #endif + /* + * The number of present sections stored in nr_present_sections + * are kept the same since mem sections are marked as present in + * memory_present(). In this for loop, we need check which sections + * failed to allocate memmap or usemap, then clear its + * ->section_mem_map accordingly. During this process, we need + * increase 'nr_consumed_maps' whether its allocation of memmap + * or usemap failed or not, so that after we handle the i-th + * memory section, can get memmap and usemap of (i+1)-th section + * correctly. 
+ */ for_each_present_section_nr(0, pnum) { struct mem_section *ms; + + if (nr_consumed_maps >= nr_present_sections) { + pr_err("nr_consumed_maps goes beyond nr_present_sections\n"); + break; + } ms = __nr_to_section(pnum); - usemap = usemap_map[pnum]; + usemap = usemap_map[nr_consumed_maps]; if (!usemap) { ms->section_mem_map = 0; + nr_consumed_maps++; continue; } #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - map = map_map[pnum]; + map = map_map[nr_consumed_maps]; #else map = sparse_early_mem_map_alloc(pnum); #endif if (!map) { ms->section_mem_map = 0; + nr_consumed_maps++; continue; } sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap); + nr_consumed_maps++; } vmemmap_populate_print_last(); From 50c150f26261e723523f077a67378736fa7511a4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 17 Aug 2018 15:48:53 -0700 Subject: [PATCH 091/111] Revert "mm: always flush VMA ranges affected by zap_page_range" There was a bug in Linux that could cause madvise (and mprotect?) system calls to return to userspace without the TLB having been flushed for all the pages involved. This could happen when multiple threads of a process made simultaneous madvise and/or mprotect calls. This was noticed in the summer of 2017, at which time two solutions were created: 56236a59556c ("mm: refactor TLB gathering API") 99baac21e458 ("mm: fix MADV_[FREE|DONTNEED] TLB flush miss problem") and 4647706ebeee ("mm: always flush VMA ranges affected by zap_page_range") We need only one of these solutions, and the former appears to be a little more efficient than the latter, so revert that one. This reverts 4647706ebeee6e50 ("mm: always flush VMA ranges affected by zap_page_range") Link: http://lkml.kernel.org/r/20180706131019.51e3a5f0@imladris.surriel.com Signed-off-by: Rik van Riel Acked-by: Mel Gorman Cc: Andy Lutomirski Cc: Michal Hocko Cc: Minchan Kim Cc: "Kirill A. 
Shutemov" Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Cc: Nicholas Piggin Cc: Nadav Amit Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ae2ec887508b..19f47d7b9b86 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1613,20 +1613,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { + for ( ; vma && vma->vm_start < end; vma = vma->vm_next) unmap_single_vma(&tlb, vma, start, end, NULL); - - /* - * zap_page_range does not specify whether mmap_sem should be - * held for read or write. That allows parallel zap_page_range - * operations to unmap a PTE and defer a flush meaning that - * this call observes pte_none and fails to flush the TLB. - * Rather than adding a complex API, ensure that no stale - * TLB entries exist when this call returns. - */ - flush_tlb_range(vma, start, end); - } - mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } From 6518202970c1052148daaef9a8096711775e43a2 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 17 Aug 2018 15:48:57 -0700 Subject: [PATCH 092/111] mm/cma: remove unsupported gfp_mask parameter from cma_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cma_alloc() doesn't really support gfp flags other than __GFP_NOWARN, so convert gfp_mask parameter to boolean no_warn parameter. This will help to avoid giving false feeling that this function supports standard gfp flags and callers can pass __GFP_ZERO to get zeroed buffer, what has already been an issue: see commit dd65a941f6ba ("arm64: dma-mapping: clear buffers allocated with FORCE_CONTIGUOUS flag"). 
Link: http://lkml.kernel.org/r/20180709122019eucas1p2340da484acfcc932537e6014f4fd2c29~-sqTPJKij2939229392eucas1p2j@eucas1p2.samsung.com Signed-off-by: Marek Szyprowski Acked-by: Michal Hocko Acked-by: Michał Nazarewicz Acked-by: Laura Abbott Acked-by: Vlastimil Babka Reviewed-by: Christoph Hellwig Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_hv_builtin.c | 2 +- drivers/s390/char/vmcp.c | 2 +- drivers/staging/android/ion/ion_cma_heap.c | 2 +- include/linux/cma.h | 2 +- kernel/dma/contiguous.c | 3 ++- mm/cma.c | 8 ++++---- mm/cma_debug.c | 2 +- 7 files changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index d4a3f4da409b..fc6bb9630a9c 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -77,7 +77,7 @@ struct page *kvm_alloc_hpt_cma(unsigned long nr_pages) VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES), - GFP_KERNEL); + false); } EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma); diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 948ce82a7725..0fa1b6b1491a 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -68,7 +68,7 @@ static void vmcp_response_alloc(struct vmcp_session *session) * anymore the system won't work anyway. 
*/ if (order > 2) - page = cma_alloc(vmcp_cma, nr_pages, 0, GFP_KERNEL); + page = cma_alloc(vmcp_cma, nr_pages, 0, false); if (page) { session->response = (char *)page_to_phys(page); session->cma_alloc = 1; diff --git a/drivers/staging/android/ion/ion_cma_heap.c b/drivers/staging/android/ion/ion_cma_heap.c index 49718c96bf9e..3fafd013d80a 100644 --- a/drivers/staging/android/ion/ion_cma_heap.c +++ b/drivers/staging/android/ion/ion_cma_heap.c @@ -39,7 +39,7 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer, if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - pages = cma_alloc(cma_heap->cma, nr_pages, align, GFP_KERNEL); + pages = cma_alloc(cma_heap->cma, nr_pages, align, false); if (!pages) return -ENOMEM; diff --git a/include/linux/cma.h b/include/linux/cma.h index bf90f0bb42bd..190184b5ff32 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -33,7 +33,7 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, const char *name, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, - gfp_t gfp_mask); + bool no_warn); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index d987dcd1bd56..19ea5d70150c 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -191,7 +191,8 @@ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask); + return cma_alloc(dev_get_cma_area(dev), count, align, + gfp_mask & __GFP_NOWARN); } /** diff --git a/mm/cma.c b/mm/cma.c index 5809bbe360d7..4cb76121a3ab 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -395,13 +395,13 @@ static inline void cma_debug_show_areas(struct cma *cma) { } * @cma: 
Contiguous memory region for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). - * @gfp_mask: GFP mask to use during compaction + * @no_warn: Avoid printing message about failed allocation * * This function allocates part of contiguous memory on specific * contiguous memory area. */ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, - gfp_t gfp_mask) + bool no_warn) { unsigned long mask, offset; unsigned long pfn = -1; @@ -447,7 +447,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, - gfp_mask); + GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); @@ -466,7 +466,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, trace_cma_alloc(pfn, page, count, align); - if (ret && !(gfp_mask & __GFP_NOWARN)) { + if (ret && !no_warn) { pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", __func__, count, ret); cma_debug_show_areas(cma); diff --git a/mm/cma_debug.c b/mm/cma_debug.c index f23467291cfb..ad6723e9d110 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -139,7 +139,7 @@ static int cma_alloc_mem(struct cma *cma, int count) if (!mem) return -ENOMEM; - p = cma_alloc(cma, count, 0, GFP_KERNEL); + p = cma_alloc(cma, count, 0, false); if (!p) { kfree(mem); return -ENOMEM; From d834c5ab83febf9624ad3b16c3c348aa1e02014c Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 17 Aug 2018 15:49:00 -0700 Subject: [PATCH 093/111] kernel/dma: remove unsupported gfp_mask parameter from dma_alloc_from_contiguous() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CMA memory allocator doesn't support standard gfp flags for memory allocation, so there is no point having it as a 
parameter for dma_alloc_from_contiguous() function. Replace it by a boolean no_warn argument, which covers all the underlaying cma_alloc() function supports. This will help to avoid giving false feeling that this function supports standard gfp flags and callers can pass __GFP_ZERO to get zeroed buffer, what has already been an issue: see commit dd65a941f6ba ("arm64: dma-mapping: clear buffers allocated with FORCE_CONTIGUOUS flag"). Link: http://lkml.kernel.org/r/20180709122020eucas1p21a71b092975cb4a3b9954ffc63f699d1~-sqUFoa-h2939329393eucas1p2Y@eucas1p2.samsung.com Signed-off-by: Marek Szyprowski Acked-by: Michał Nazarewicz Acked-by: Vlastimil Babka Reviewed-by: Christoph Hellwig Cc: Laura Abbott Cc: Michal Hocko Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/dma-mapping.c | 5 +++-- arch/arm64/mm/dma-mapping.c | 4 ++-- arch/xtensa/kernel/pci-dma.c | 2 +- drivers/iommu/amd_iommu.c | 2 +- drivers/iommu/intel-iommu.c | 3 ++- include/linux/dma-contiguous.h | 4 ++-- kernel/dma/contiguous.c | 7 +++---- kernel/dma/direct.c | 3 ++- 8 files changed, 16 insertions(+), 14 deletions(-) diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ba0e786c952e..66566472c153 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -594,7 +594,7 @@ static void *__alloc_from_contiguous(struct device *dev, size_t size, struct page *page; void *ptr = NULL; - page = dma_alloc_from_contiguous(dev, count, order, gfp); + page = dma_alloc_from_contiguous(dev, count, order, gfp & __GFP_NOWARN); if (!page) return NULL; @@ -1299,7 +1299,8 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size, unsigned long order = get_order(size); struct page *page; - page = dma_alloc_from_contiguous(dev, count, order, gfp); + page = dma_alloc_from_contiguous(dev, count, order, + gfp & __GFP_NOWARN); if (!page) goto error; diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 
61e93f0b5482..072c51fb07d7 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -355,7 +355,7 @@ static int __init atomic_pool_init(void) if (dev_get_cma_area(NULL)) page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, GFP_KERNEL); + pool_size_order, false); else page = alloc_pages(GFP_DMA32, pool_size_order); @@ -573,7 +573,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size, struct page *page; page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, - get_order(size), gfp); + get_order(size), gfp & __GFP_NOWARN); if (!page) return NULL; diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c index 392b4a80ebc2..a02dc563d290 100644 --- a/arch/xtensa/kernel/pci-dma.c +++ b/arch/xtensa/kernel/pci-dma.c @@ -137,7 +137,7 @@ static void *xtensa_dma_alloc(struct device *dev, size_t size, if (gfpflags_allow_blocking(flag)) page = dma_alloc_from_contiguous(dev, count, get_order(size), - flag); + flag & __GFP_NOWARN); if (!page) page = alloc_pages(flag, get_order(size)); diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 596b95c50051..60b2eab29cd8 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2620,7 +2620,7 @@ static void *alloc_coherent(struct device *dev, size_t size, return NULL; page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, - get_order(size), flag); + get_order(size), flag & __GFP_NOWARN); if (!page) return NULL; } diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 115ff26e9ced..6a237d18fabf 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3758,7 +3758,8 @@ static void *intel_alloc_coherent(struct device *dev, size_t size, if (gfpflags_allow_blocking(flags)) { unsigned int count = size >> PAGE_SHIFT; - page = dma_alloc_from_contiguous(dev, count, order, flags); + page = dma_alloc_from_contiguous(dev, count, order, + flags & __GFP_NOWARN); if (page && iommu_no_mapping(dev) && 
page_to_phys(page) + size > dev->coherent_dma_mask) { dma_release_from_contiguous(dev, page, count); diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index 3c5a4cb3eb95..f247e8aa5e3d 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -112,7 +112,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size, } struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int order, gfp_t gfp_mask); + unsigned int order, bool no_warn); bool dma_release_from_contiguous(struct device *dev, struct page *pages, int count); @@ -145,7 +145,7 @@ int dma_declare_contiguous(struct device *dev, phys_addr_t size, static inline struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int order, gfp_t gfp_mask) + unsigned int order, bool no_warn) { return NULL; } diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 19ea5d70150c..286d82329eb0 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -178,7 +178,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, * @dev: Pointer to device for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). - * @gfp_mask: GFP flags to use for this allocation. + * @no_warn: Avoid printing message about failed allocation. * * This function allocates memory buffer for specified device. It uses * device specific contiguous memory area if available or the default @@ -186,13 +186,12 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, * function. 
*/ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int align, gfp_t gfp_mask) + unsigned int align, bool no_warn) { if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - return cma_alloc(dev_get_cma_area(dev), count, align, - gfp_mask & __GFP_NOWARN); + return cma_alloc(dev_get_cma_area(dev), count, align, no_warn); } /** diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index c2860c5a9e96..1c35b7b945d0 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -78,7 +78,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, again: /* CMA can be used only in the context which permits sleeping */ if (gfpflags_allow_blocking(gfp)) { - page = dma_alloc_from_contiguous(dev, count, page_order, gfp); + page = dma_alloc_from_contiguous(dev, count, page_order, + gfp & __GFP_NOWARN); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_release_from_contiguous(dev, page, count); page = NULL; From 9bfe5ded054b8e28a94c78580f233d6879a00146 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 17 Aug 2018 15:49:04 -0700 Subject: [PATCH 094/111] mm, oom: remove sleep from under oom_lock Tetsuo has pointed out that since 27ae357fa82b ("mm, oom: fix concurrent munlock and oom reaper unmap, v3") we have a strong synchronization between the oom_killer and victim's exiting because both have to take the oom_lock. Therefore the original heuristic to sleep for a short time in out_of_memory doesn't serve the original purpose. Moreover Tetsuo has noticed that the short sleep can be more harmful than actually useful. Hammering the system with many processes can lead to a starvation when the task holding the oom_lock can block for a long time (minutes) and block any further progress because the oom_reaper depends on the oom_lock as well. Drop the short sleep from out_of_memory when we hold the lock. Keep the sleep when the trylock fails to throttle the concurrent OOM paths a bit. 
This should be solved in a more reasonable way (e.g. sleep proportional to the time spent in the active reclaiming etc.) but this is much more complex thing to achieve. This is a quick fixup to remove a stale code. Link: http://lkml.kernel.org/r/20180709074706.30635-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Reviewed-by: Andrew Morton Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 84081e77bc51..cd6520f7553d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1077,15 +1077,9 @@ bool out_of_memory(struct oom_control *oc) dump_header(oc, NULL); panic("Out of memory and no killable processes...\n"); } - if (oc->chosen && oc->chosen != (void *)-1UL) { + if (oc->chosen && oc->chosen != (void *)-1UL) oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : "Memory cgroup out of memory"); - /* - * Give the killed process a good chance to exit before trying - * to allocate memory again. - */ - schedule_timeout_killable(1); - } return !!oc->chosen; } From 40d18ebffb3974272a920c41f2d74431152cae98 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 17 Aug 2018 15:49:07 -0700 Subject: [PATCH 095/111] mm/hugetlb: remove gigantic page support for HIGHMEM This reverts ee8f248d266e ("hugetlb: add phys addr to struct huge_bootmem_page"). At one time powerpc used this field and supporting code. However that was removed with commit 79cc38ded1e1 ("powerpc/mm/hugetlb: Add support for reserving gigantic huge pages via kernel command line"). There are no users of this field and supporting code, so remove it. Link: http://lkml.kernel.org/r/20180711195913.1294-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: "Aneesh Kumar K . 
V" Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Cannon Matthews Cc: Becky Bruce Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 --- mm/hugetlb.c | 9 +-------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 36fa6a2a82e3..c39d9170a8a0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -348,9 +348,6 @@ struct hstate { struct huge_bootmem_page { struct list_head list; struct hstate *hstate; -#ifdef CONFIG_HIGHMEM - phys_addr_t phys; -#endif }; struct page *alloc_huge_page(struct vm_area_struct *vma, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f1bcaae0d73a..4cea30ac5033 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2139,16 +2139,9 @@ static void __init gather_bootmem_prealloc(void) struct huge_bootmem_page *m; list_for_each_entry(m, &huge_boot_pages, list) { + struct page *page = virt_to_page(m); struct hstate *h = m->hstate; - struct page *page; -#ifdef CONFIG_HIGHMEM - page = pfn_to_page(m->phys >> PAGE_SHIFT); - memblock_free_late(__pa(m), - sizeof(struct huge_bootmem_page)); -#else - page = virt_to_page(m); -#endif WARN_ON(page_count(page) != 1); prep_compound_huge_page(page, h->order); WARN_ON(PageReserved(page)); From a195d3f5b74f3f45a6742f9063b5e95a2522b46d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 17 Aug 2018 15:49:10 -0700 Subject: [PATCH 096/111] mm/oom_kill.c: document oom_lock Add comments describing oom_lock's scope. 
Requested-by: David Rientjes Link: http://lkml.kernel.org/r/20180711120121.25635-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Andrew Morton Cc: David Rientjes Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index cd6520f7553d..412f43453a68 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -53,6 +53,14 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; +/* + * Serializes oom killer invocations (out_of_memory()) from all contexts to + * prevent from over eager oom killing (e.g. when the oom killer is invoked + * from different domains). + * + * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled + * and mark_oom_victim + */ DEFINE_MUTEX(oom_lock); #ifdef CONFIG_NUMA From d8a759b5703519d37fa5b752f825cbfc06b57906 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Fri, 17 Aug 2018 15:49:14 -0700 Subject: [PATCH 097/111] mm, page_alloc: double zone's batchsize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To improve page allocator's performance for order-0 pages, each CPU has a Per-CPU-Pageset(PCP) per zone. Whenever an order-0 page is needed, PCP will be checked first before asking pages from Buddy. When PCP is used up, a batch of pages will be fetched from Buddy to improve performance and the size of batch can affect performance. The zone's batch size was last doubled by commit ba56e91c9401("mm: page_alloc: increase size of per-cpu-pages") over ten years ago. Since then, CPUs have evolved a lot and CPU cache sizes have also increased. Dave Hansen is concerned the current batch size doesn't fit well with modern hardware and suggested that I do two things: first, use a page allocator intensive benchmark, e.g.
will-it-scale/page_fault1 to find out how performance changes with different batch sizes on various machines and then choose a new default batch size; second, see how this new batch size works with other workloads. In the first test, we saw performance gains on high-core-count systems and little to no effect on older systems with more modest core counts. In this phase's test data, two candidates: 63 and 127 are chosen. In the second step, ebizzy, oltp, kbuild, pigz, netperf, vm-scalability and more will-it-scale sub-tests are tested to see how these two candidates work with these workloads and decide a new default according to their results. Most test results are flat. will-it-scale/page_fault2 process mode has 10%-18% performance increase on 4-sockets Skylake and Broadwell. vm-scalability/lru-file-mmap-read has 17%-47% performance increase for 4-sockets servers while for 2-sockets servers, it caused 3%-8% performance drop. Further analysis showed that, with a larger pcp->batch and thus larger pcp->high (the relationship of pcp->high=6 * pcp->batch is maintained in this patch), zone lock contention shifted to LRU add side lock contention and that caused performance drop. This performance drop might be mitigated by others' work on optimizing LRU lock. Another downside of increasing pcp->batch is, when PCP is used up and needs to fetch a batch of pages from Buddy, since batch is increased, that time can be longer than before. My understanding is, this doesn't affect slowpath where direct reclaim and compaction dominate. For fastpath, throughput is a win (according to will-it-scale/page_fault1) but worst latency can be larger now. Overall, I think doubling the batch size from 31 to 63 is relatively safe and provides a good performance boost for high-core-count systems. The two phases' test results are listed below (all tests are done with THP disabled).
Phase one(will-it-scale/page_fault1) test results: Skylake-EX: increased batch size has a good effect on zone->lock contention, though LRU contention will rise at the same time and limited the final performance increase. batch score change zone_contention lru_contention total_contention 31 15345900 +0.00% 64% 8% 72% 53 17903847 +16.67% 32% 38% 70% 63 17992886 +17.25% 24% 45% 69% 73 18022825 +17.44% 10% 61% 71% 119 18023401 +17.45% 4% 66% 70% 127 18029012 +17.48% 3% 66% 69% 137 18036075 +17.53% 4% 66% 70% 165 18035964 +17.53% 2% 67% 69% 188 18101105 +17.95% 2% 67% 69% 223 18130951 +18.15% 2% 67% 69% 255 18118898 +18.07% 2% 67% 69% 267 18101559 +17.96% 2% 67% 69% 299 18160468 +18.34% 2% 68% 70% 320 18139845 +18.21% 2% 67% 69% 393 18160869 +18.34% 2% 68% 70% 424 18170999 +18.41% 2% 68% 70% 458 18144868 +18.24% 2% 68% 70% 467 18142366 +18.22% 2% 68% 70% 498 18154549 +18.30% 1% 68% 69% 511 18134525 +18.17% 1% 69% 70% Broadwell-EX: similar pattern as Skylake-EX. batch score change zone_contention lru_contention total_contention 31 16703983 +0.00% 67% 7% 74% 53 18195393 +8.93% 43% 28% 71% 63 18288885 +9.49% 38% 33% 71% 73 18344329 +9.82% 35% 37% 72% 119 18535529 +10.96% 24% 46% 70% 127 18513596 +10.83% 23% 48% 71% 137 18514327 +10.84% 23% 48% 71% 165 18511840 +10.82% 22% 49% 71% 188 18593478 +11.31% 17% 53% 70% 223 18601667 +11.36% 17% 52% 69% 255 18774825 +12.40% 12% 58% 70% 267 18754781 +12.28% 9% 60% 69% 299 18892265 +13.10% 7% 63% 70% 320 18873812 +12.99% 8% 62% 70% 393 18891174 +13.09% 6% 64% 70% 424 18975108 +13.60% 6% 64% 70% 458 18932364 +13.34% 8% 62% 70% 467 18960891 +13.51% 5% 65% 70% 498 18944526 +13.41% 5% 64% 69% 511 18960839 +13.51% 5% 64% 69% Skylake-EP: although increased batch reduced zone->lock contention, but the effect is not as good as EX: zone->lock contention is still as high as 20% with a very high batch value instead of 1% on Skylake-EX or 5% on Broadwell-EX. 
Also, total_contention actually decreased with a higher batch but that doesn't translate to performance increase. batch score change zone_contention lru_contention total_contention 31 9554867 +0.00% 66% 3% 69% 53 9855486 +3.15% 63% 3% 66% 63 9980145 +4.45% 62% 4% 66% 73 10092774 +5.63% 62% 5% 67% 119 10310061 +7.90% 45% 19% 64% 127 10342019 +8.24% 42% 19% 61% 137 10358182 +8.41% 42% 21% 63% 165 10397060 +8.81% 37% 24% 61% 188 10341808 +8.24% 34% 26% 60% 223 10349135 +8.31% 31% 27% 58% 255 10327189 +8.08% 28% 29% 57% 267 10344204 +8.26% 27% 29% 56% 299 10325043 +8.06% 25% 30% 55% 320 10310325 +7.91% 25% 31% 56% 393 10293274 +7.73% 21% 31% 52% 424 10311099 +7.91% 21% 32% 53% 458 10321375 +8.02% 21% 32% 53% 467 10303881 +7.84% 21% 32% 53% 498 10332462 +8.14% 20% 33% 53% 511 10325016 +8.06% 20% 32% 52% Broadwell-EP: zone->lock and lru lock had an agreement to make sure performance doesn't increase and they successfully managed to keep total contention at 70%. batch score change zone_contention lru_contention total_contention 31 10121178 +0.00% 19% 50% 69% 53 10142366 +0.21% 6% 63% 69% 63 10117984 -0.03% 11% 58% 69% 73 10123330 +0.02% 7% 63% 70% 119 10108791 -0.12% 2% 67% 69% 127 10166074 +0.44% 3% 66% 69% 137 10141574 +0.20% 3% 66% 69% 165 10154499 +0.33% 2% 68% 70% 188 10124921 +0.04% 2% 67% 69% 223 10137399 +0.16% 2% 67% 69% 255 10143289 +0.22% 0% 68% 68% 267 10123535 +0.02% 1% 68% 69% 299 10140952 +0.20% 0% 68% 68% 320 10163170 +0.41% 0% 68% 68% 393 10000633 -1.19% 0% 69% 69% 424 10087998 -0.33% 0% 69% 69% 458 10187116 +0.65% 0% 69% 69% 467 10146790 +0.25% 0% 69% 69% 498 10197958 +0.76% 0% 69% 69% 511 10152326 +0.31% 0% 69% 69% Haswell-EP: similar to Broadwell-EP. 
batch score change zone_contention lru_contention total_contention 31 10442205 +0.00% 14% 48% 62% 53 10442255 +0.00% 5% 57% 62% 63 10452059 +0.09% 6% 57% 63% 73 10482349 +0.38% 5% 59% 64% 119 10454644 +0.12% 3% 60% 63% 127 10431514 -0.10% 3% 59% 62% 137 10423785 -0.18% 3% 60% 63% 165 10481216 +0.37% 2% 61% 63% 188 10448755 +0.06% 2% 61% 63% 223 10467144 +0.24% 2% 61% 63% 255 10480215 +0.36% 2% 61% 63% 267 10484279 +0.40% 2% 61% 63% 299 10466450 +0.23% 2% 61% 63% 320 10452578 +0.10% 2% 61% 63% 393 10499678 +0.55% 1% 62% 63% 424 10481454 +0.38% 1% 62% 63% 458 10473562 +0.30% 1% 62% 63% 467 10484269 +0.40% 0% 62% 62% 498 10505599 +0.61% 0% 62% 62% 511 10483395 +0.39% 0% 62% 62% Westmere-EP: contention is pretty small so not interesting. Note too high a batch value could hurt performance. batch score change zone_contention lru_contention total_contention 31 4831523 +0.00% 2% 3% 5% 53 4834086 +0.05% 2% 4% 6% 63 4834262 +0.06% 2% 3% 5% 73 4832851 +0.03% 2% 4% 6% 119 4830534 -0.02% 1% 3% 4% 127 4827461 -0.08% 1% 4% 5% 137 4827459 -0.08% 1% 3% 4% 165 4820534 -0.23% 0% 4% 4% 188 4817947 -0.28% 0% 3% 3% 223 4809671 -0.45% 0% 3% 3% 255 4802463 -0.60% 0% 4% 4% 267 4801634 -0.62% 0% 3% 3% 299 4798047 -0.69% 0% 3% 3% 320 4793084 -0.80% 0% 3% 3% 393 4785877 -0.94% 0% 3% 3% 424 4782911 -1.01% 0% 3% 3% 458 4779346 -1.08% 0% 3% 3% 467 4780306 -1.06% 0% 3% 3% 498 4780589 -1.05% 0% 3% 3% 511 4773724 -1.20% 0% 3% 3% Skylake-Desktop: similar to Westmere-EP, nothing interesting. 
batch score change zone_contention lru_contention total_contention 31 3906608 +0.00% 2% 3% 5% 53 3940164 +0.86% 2% 3% 5% 63 3937289 +0.79% 2% 3% 5% 73 3940201 +0.86% 2% 3% 5% 119 3933240 +0.68% 2% 3% 5% 127 3930514 +0.61% 2% 4% 6% 137 3938639 +0.82% 0% 3% 3% 165 3908755 +0.05% 0% 3% 3% 188 3905621 -0.03% 0% 3% 3% 223 3903015 -0.09% 0% 4% 4% 255 3889480 -0.44% 0% 3% 3% 267 3891669 -0.38% 0% 4% 4% 299 3898728 -0.20% 0% 4% 4% 320 3894547 -0.31% 0% 4% 4% 393 3875137 -0.81% 0% 4% 4% 424 3874521 -0.82% 0% 3% 3% 458 3880432 -0.67% 0% 4% 4% 467 3888715 -0.46% 0% 3% 3% 498 3888633 -0.46% 0% 4% 4% 511 3875305 -0.80% 0% 5% 5% Haswell-Desktop: zone->lock is pretty low as other desktops, though lru contention is higher than other desktops. batch score change zone_contention lru_contention total_contention 31 3511158 +0.00% 2% 5% 7% 53 3555445 +1.26% 2% 6% 8% 63 3561082 +1.42% 2% 6% 8% 73 3547218 +1.03% 2% 6% 8% 119 3571319 +1.71% 1% 7% 8% 127 3549375 +1.09% 0% 6% 6% 137 3560233 +1.40% 0% 6% 6% 165 3555176 +1.25% 2% 6% 8% 188 3551501 +1.15% 0% 8% 8% 223 3531462 +0.58% 0% 7% 7% 255 3570400 +1.69% 0% 7% 7% 267 3532235 +0.60% 1% 8% 9% 299 3562326 +1.46% 0% 6% 6% 320 3553569 +1.21% 0% 8% 8% 393 3539519 +0.81% 0% 7% 7% 424 3549271 +1.09% 0% 8% 8% 458 3528885 +0.50% 0% 8% 8% 467 3526554 +0.44% 0% 7% 7% 498 3525302 +0.40% 0% 9% 9% 511 3527556 +0.47% 0% 8% 8% Sandybridge-Desktop: the 0% contention isn't accurate but caused by dropped fractional part. Since multiple contention path's contentions are all under 1% here, with some arithmetic operations like add, the final deviation could be as large as 3%. 
batch score change zone_contention lru_contention total_contention 31 1744495 +0.00% 0% 0% 0% 53 1755341 +0.62% 0% 0% 0% 63 1758469 +0.80% 0% 0% 0% 73 1759626 +0.87% 0% 0% 0% 119 1770417 +1.49% 0% 0% 0% 127 1768252 +1.36% 0% 0% 0% 137 1767848 +1.34% 0% 0% 0% 165 1765088 +1.18% 0% 0% 0% 188 1766918 +1.29% 0% 0% 0% 223 1767866 +1.34% 0% 0% 0% 255 1768074 +1.35% 0% 0% 0% 267 1763187 +1.07% 0% 0% 0% 299 1765620 +1.21% 0% 0% 0% 320 1767603 +1.32% 0% 0% 0% 393 1764612 +1.15% 0% 0% 0% 424 1758476 +0.80% 0% 0% 0% 458 1758593 +0.81% 0% 0% 0% 467 1757915 +0.77% 0% 0% 0% 498 1753363 +0.51% 0% 0% 0% 511 1755548 +0.63% 0% 0% 0% Phase two test results: Note: all percent change is against base(batch=31). ebizzy.throughput (higer is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 2410037±7% 2600451±2% +7.9% 2602878 +8.0% lkp-bdw-ex1 1493328 1489243 -0.3% 1492145 -0.1% lkp-skl-2sp2 1329674 1345891 +1.2% 1351056 +1.6% lkp-bdw-ep2 711511 711511 0.0% 710708 -0.1% lkp-wsm-ep2 75750 75528 -0.3% 75441 -0.4% lkp-skl-d01 264126 262791 -0.5% 264113 +0.0% lkp-hsw-d01 176601 176328 -0.2% 176368 -0.1% lkp-sb02 98937 98937 +0.0% 99030 +0.1% kbuild.buildtime (less is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 107.00 107.67 +0.6% 107.11 +0.1% lkp-bdw-ex1 97.33 97.33 +0.0% 97.42 +0.1% lkp-skl-2sp2 180.00 179.83 -0.1% 179.83 -0.1% lkp-bdw-ep2 178.17 179.17 +0.6% 177.50 -0.4% lkp-wsm-ep2 737.00 738.00 +0.1% 738.00 +0.1% lkp-skl-d01 642.00 653.00 +1.7% 653.00 +1.7% lkp-hsw-d01 1310.00 1316.00 +0.5% 1311.00 +0.1% netperf/TCP_STREAM.Throughput_total_Mbps (higher is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 948790 947144 -0.2% 948333 -0.0% lkp-bdw-ex1 904224 904366 +0.0% 904926 +0.1% lkp-skl-2sp2 239731 239607 -0.1% 239565 -0.1% lk-bdw-ep2 365764 365933 +0.0% 365951 +0.1% lkp-wsm-ep2 93736 93803 +0.1% 93808 +0.1% lkp-skl-d01 77314 77303 -0.0% 77375 +0.1% lkp-hsw-d01 58617 60387 +3.0% 60208 +2.7% lkp-sb02 29990 30137 +0.5% 30103 +0.4% oltp.transactions (higer is 
better) machine batch=31 batch=63 batch=127 lkp-bdw-ex1 9073276 9100377 +0.3% 9036344 -0.4% lkp-skl-2sp2 8898717 8852054 -0.5% 8894459 -0.0% lkp-bdw-ep2 13426155 13384654 -0.3% 13333637 -0.7% lkp-hsw-ep2 13146314 13232784 +0.7% 13193163 +0.4% lkp-wsm-ep2 5035355 5019348 -0.3% 5033418 -0.0% lkp-skl-d01 418485 4413339 -0.1% 4419039 +0.0% lkp-hsw-d01 3517817±5% 3396120±3% -3.5% 3455138±3% -1.8% pigz.throughput (higer is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 1.513e+08 1.507e+08 -0.4% 1.511e+08 -0.2% lkp-bdw-ex1 2.060e+08 2.052e+08 -0.4% 2.044e+08 -0.8% lkp-skl-2sp2 8.836e+08 8.845e+08 +0.1% 8.836e+08 -0.0% lkp-bdw-ep2 8.275e+08 8.464e+08 +2.3% 8.330e+08 +0.7% lkp-wsm-ep2 2.224e+08 2.221e+08 -0.2% 2.218e+08 -0.3% lkp-skl-d01 1.177e+08 1.177e+08 -0.0% 1.176e+08 -0.1% lkp-hsw-d01 1.154e+08 1.154e+08 +0.1% 1.154e+08 -0.0% lkp-sb02 0.633e+08 0.633e+08 +0.1% 0.633e+08 +0.0% will-it-scale.malloc1.processes (higher is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 620181 620484 +0.0% 620240 +0.0% lkp-bdw-ex1 1403610 1401201 -0.2% 1417900 +1.0% lkp-skl-2sp2 1288097 1284145 -0.3% 1283907 -0.3% lkp-bdw-ep2 1427879 1427675 -0.0% 1428266 +0.0% lkp-hsw-ep2 1362546 1353965 -0.6% 1354759 -0.6% lkp-wsm-ep2 2099657 2107576 +0.4% 2100226 +0.0% lkp-skl-d01 1476835 1476358 -0.0% 1474487 -0.2% lkp-hsw-d01 1308810 1303429 -0.4% 1301299 -0.6% lkp-sb02 589286 589284 -0.0% 588101 -0.2% will-it-scale.malloc1.threads (higher is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 21289 21125 -0.8% 21241 -0.2% lkp-bdw-ex1 28114 28089 -0.1% 28007 -0.4% lkp-skl-2sp2 91866 91946 +0.1% 92723 +0.9% lkp-bdw-ep2 37637 37501 -0.4% 37317 -0.9% lkp-hsw-ep2 43673 43590 -0.2% 43754 +0.2% lkp-wsm-ep2 28577 28298 -1.0% 28545 -0.1% lkp-skl-d01 175277 173343 -1.1% 173082 -1.3% lkp-hsw-d01 130303 129566 -0.6% 129250 -0.8% lkp-sb02 113742±3% 116911 +2.8% 116417±3% +2.4% will-it-scale.malloc2.processes (higer is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 1.206e+09 
1.206e+09 -0.0% 1.206e+09 +0.0% lkp-bdw-ex1 1.319e+09 1.319e+09 -0.0% 1.319e+09 +0.0% lkp-skl-2sp2 8.000e+08 8.021e+08 +0.3% 7.995e+08 -0.1% lkp-bdw-ep2 6.582e+08 6.634e+08 +0.8% 6.513e+08 -1.1% lkp-hsw-ep2 6.671e+08 6.669e+08 -0.0% 6.665e+08 -0.1% lkp-wsm-ep2 1.805e+08 1.806e+08 +0.0% 1.804e+08 -0.1% lkp-skl-d01 1.611e+08 1.611e+08 -0.0% 1.610e+08 -0.0% lkp-hsw-d01 1.333e+08 1.332e+08 -0.0% 1.332e+08 -0.0% lkp-sb02 82485104 82478206 -0.0% 82473546 -0.0% will-it-scale.malloc2.threads (higer is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 1.574e+09 1.574e+09 -0.0% 1.574e+09 -0.0% lkp-bdw-ex1 1.737e+09 1.737e+09 +0.0% 1.737e+09 -0.0% lkp-skl-2sp2 9.161e+08 9.162e+08 +0.0% 9.181e+08 +0.2% lkp-bdw-ep2 7.856e+08 8.015e+08 +2.0% 8.113e+08 +3.3% lkp-hsw-ep2 6.908e+08 6.904e+08 -0.1% 6.907e+08 -0.0% lkp-wsm-ep2 2.409e+08 2.409e+08 +0.0% 2.409e+08 -0.0% lkp-skl-d01 1.199e+08 1.199e+08 -0.0% 1.199e+08 -0.0% lkp-hsw-d01 1.029e+08 1.029e+08 -0.0% 1.029e+08 +0.0% lkp-sb02 68081213 68061423 -0.0% 68076037 -0.0% will-it-scale.page_fault2.processes (higer is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 14509125±4% 16472364 +13.5% 17123117 +18.0% lkp-bdw-ex1 14736381 16196588 +9.9% 16364011 +11.0% lkp-skl-2sp2 6354925 6435444 +1.3% 6436644 +1.3% lkp-bdw-ep2 8749584 8834422 +1.0% 8827179 +0.9% lkp-hsw-ep2 8762591 8845920 +1.0% 8825697 +0.7% lkp-wsm-ep2 3036083 3030428 -0.2% 3021741 -0.5% lkp-skl-d01 2307834 2304731 -0.1% 2286142 -0.9% lkp-hsw-d01 1806237 1800786 -0.3% 1795943 -0.6% lkp-sb02 842616 837844 -0.6% 833921 -1.0% will-it-scale.page_fault2.threads machine batch=31 batch=63 batch=127 lkp-skl-4sp1 1623294 1615132±2% -0.5% 1656777 +2.1% lkp-bdw-ex1 1995714 2025948 +1.5% 2113753±3% +5.9% lkp-skl-2sp2 2346708 2415591 +2.9% 2416919 +3.0% lkp-bdw-ep2 2342564 2344882 +0.1% 2300206 -1.8% lkp-hsw-ep2 1820658 1831681 +0.6% 1844057 +1.3% lkp-wsm-ep2 1725482 1733774 +0.5% 1740517 +0.9% lkp-skl-d01 1832833 1823628 -0.5% 1806489 -1.4% lkp-hsw-d01 1427913 1427287 
-0.0% 1420226 -0.5% lkp-sb02 750626 748615 -0.3% 746621 -0.5% will-it-scale.page_fault3.processes (higher is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 24382726 24400317 +0.1% 24668774 +1.2% lkp-bdw-ex1 35399750 35683124 +0.8% 35829492 +1.2% lkp-skl-2sp2 28136820 28068248 -0.2% 28147989 +0.0% lkp-bdw-ep2 37269077 37459490 +0.5% 37373073 +0.3% lkp-hsw-ep2 36224967 36114085 -0.3% 36104908 -0.3% lkp-wsm-ep2 16820457 16911005 +0.5% 16968596 +0.9% lkp-skl-d01 7721138 7725904 +0.1% 7756740 +0.5% lkp-hsw-d01 7611979 7650928 +0.5% 7651323 +0.5% lkp-sb02 3781546 3796502 +0.4% 3796827 +0.4% will-it-scale.page_fault3.threads (higer is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 1865820±3% 1900917±2% +1.9% 1826245±4% -2.1% lkp-bdw-ex1 3094060 3148326 +1.8% 3150036 +1.8% lkp-skl-2sp2 3952940 3953898 +0.0% 3989360 +0.9% lkp-bdw-ep2 3420373±3% 3643964 +6.5% 3644910±5% +6.6% lkp-hsw-ep2 2609635±2% 2582310±3% -1.0% 2780459 +6.5% lkp-wsm-ep2 4395001 4417196 +0.5% 4432499 +0.9% lkp-skl-d01 5363977 5400003 +0.7% 5411370 +0.9% lkp-hsw-d01 5274131 5311294 +0.7% 5319359 +0.9% lkp-sb02 2917314 2913004 -0.1% 2935286 +0.6% will-it-scale.read1.processes (higer is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 73762279±14% 69322519±10% -6.0% 69349855±13% -6.0% (result unstable) lkp-bdw-ex1 1.701e+08 1.704e+08 +0.1% 1.705e+08 +0.2% lkp-skl-2sp2 63111570 63113953 +0.0% 63836573 +1.1% lkp-bdw-ep2 79247409 79424610 +0.2% 78012656 -1.6% lkp-hsw-ep2 67677026 68308800 +0.9% 67539106 -0.2% lkp-wsm-ep2 13339630 13939817 +4.5% 13766865 +3.2% lkp-skl-d01 10969487 10972650 +0.0% no data lkp-hsw-d01 9857342±2% 10080592±2% +2.3% 10131560 +2.8% lkp-sb02 5189076 5197473 +0.2% 5163253 -0.5% will-it-scale.read1.threads (higher is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 62468045±12% 73666726±7% +17.9% 79553123±12% +27.4% (result unstable) lkp-bdw-ex1 1.62e+08 1.624e+08 +0.3% 1.614e+08 -0.3% lkp-skl-2sp2 58319780 59181032 +1.5% 59821353 +2.6% lkp-bdw-ep2 
74057992 75698171 +2.2% 74990869 +1.3% lkp-hsw-ep2 63672959 63639652 -0.1% 64387051 +1.1% lkp-wsm-ep2 13489943 13526058 +0.3% 13259032 -1.7% lkp-skl-d01 10297906 10338796 +0.4% 10407328 +1.1% lkp-hsw-d01 9636721 9667376 +0.3% 9341147 -3.1% lkp-sb02 4801938 4804496 +0.1% 4802290 +0.0% will-it-scale.write1.processes (higer is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 1.111e+08 1.104e+08±2% -0.7% 1.122e+08±2% +1.0% lkp-bdw-ex1 1.392e+08 1.399e+08 +0.5% 1.397e+08 +0.4% lkp-skl-2sp2 59369233 58994841 -0.6% 58715168 -1.1% lkp-bdw-ep2 61820979 CPU throttle 63593123 +2.9% lkp-hsw-ep2 57897587 57435605 -0.8% 56347450 -2.7% lkp-wsm-ep2 7814203 7918017±2% +1.3% 7669068 -1.9% lkp-skl-d01 8886557 8971422 +1.0% 8818366 -0.8% lkp-hsw-d01 9171001±5% 9189915 +0.2% 9483909 +3.4% lkp-sb02 4475406 4475294 -0.0% 4501756 +0.6% will-it-scale.write1.threads (higer is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 1.058e+08 1.055e+08±2% -0.2% 1.065e+08 +0.7% lkp-bdw-ex1 1.316e+08 1.300e+08 -1.2% 1.308e+08 -0.6% lkp-skl-2sp2 54492421 56086678 +2.9% 55975657 +2.7% lkp-bdw-ep2 59360449 59003957 -0.6% 58101262 -2.1% lkp-hsw-ep2 53346346±2% 52530876 -1.5% 52902487 -0.8% lkp-wsm-ep2 7774006 7800092±2% +0.3% 7558833 -2.8% lkp-skl-d01 8346174 8235695 -1.3% no data lkp-hsw-d01 8636244 8655731 +0.2% 8658868 +0.3% lkp-sb02 4181820 4204107 +0.5% 4182992 +0.0% vm-scalability.anon-r-rand.throughput (higher is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 11933873±3% 12356544±2% +3.5% 12188624 +2.1% lkp-bdw-ex1 7114424±2% 7330949±2% +3.0% 7392419 +3.9% lkp-skl-2sp2 6773277±5% 6492332±8% -4.1% 6543962 -3.4% lkp-bdw-ep2 7133846±4% 7233508 +1.4% 7013518±3% -1.7% lkp-hsw-ep2 4576626 4527098 -1.1% 4551679 -0.5% lkp-wsm-ep2 2583599 2592492 +0.3% 2588039 +0.2% lkp-hsw-d01 998199±2% 1028311 +3.0% 1006460±2% +0.8% lkp-sb02 570572 567854 -0.5% 568449 -0.4% vm-scalability.anon-r-rand-mt.throughput (higher is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 1789419 1787830 
-0.1% 1788208 -0.1% lkp-bdw-ex1 3492595±2% 3554966±2% +1.8% 3558835±3% +1.9% lkp-skl-2sp2 3856238±2% 3975403±4% +3.1% 3994600 +3.6% lkp-bdw-ep2 3726963±11% 3809292±6% +2.2% 3871924±4% +3.9% lkp-hsw-ep2 2131760±3% 2033578±4% -4.6% 2130727±6% -0.0% lkp-wsm-ep2 2369731 2368384 -0.1% 2370252 +0.0% lkp-skl-d01 1207128 1206220 -0.1% 1205801 -0.1% lkp-hsw-d01 964317 992329±2% +2.9% 992099±2% +2.9% lkp-sb02 567137 567346 +0.0% 566144 -0.2% vm-scalability.lru-file-mmap-read.throughput (higher is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 19560469±6% 23018999 +17.7% 23418800 +19.7% lkp-bdw-ex1 17769135±14% 26141676±3% +47.1% 26284723±5% +47.9% lkp-skl-2sp2 14056512 13578884 -3.4% 13146214 -6.5% lkp-bdw-ep2 15336542 14737654 -3.9% 14088159 -8.1% lkp-hsw-ep2 16275498 15756296 -3.2% 15018090 -7.7% lkp-wsm-ep2 11272160 11237231 -0.3% 11310047 +0.3% lkp-skl-d01 7322119 7324569 +0.0% 7184148 -1.9% lkp-hsw-d01 6449234 6404542 -0.7% 6356141 -1.4% lkp-sb02 3517943 3520668 +0.1% 3527309 +0.3% vm-scalability.lru-file-mmap-read-rand.throughput (higher is better) machine batch=31 batch=63 batch=127 lkp-skl-4sp1 1689052 1697553 +0.5% 1698726 +0.6% lkp-bdw-ex1 1675246 1699764 +1.5% 1712226 +2.2% lkp-skl-2sp2 1800533 1799749 -0.0% 1800581 +0.0% lkp-bdw-ep2 1807422 1807758 +0.0% 1804932 -0.1% lkp-hsw-ep2 1809807 1808781 -0.1% 1807811 -0.1% lkp-wsm-ep2 1800198 1802434 +0.1% 1801236 +0.1% lkp-skl-d01 696689 695537 -0.2% 694106 -0.4% lkp-hsw-d01 698364 698666 +0.0% 696686 -0.2% lkp-sb02 258939 258787 -0.1% 258199 -0.3% Link: http://lkml.kernel.org/r/20180711055855.29072-1-aaron.lu@intel.com Signed-off-by: Aaron Lu Suggested-by: Dave Hansen Acked-by: Michal Hocko Acked-by: Jesper Dangaard Brouer Cc: Huang Ying Cc: Kemi Wang Cc: Tim Chen Cc: Andi Kleen Cc: Vlastimil Babka Cc: Mel Gorman Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c 
b/mm/page_alloc.c index e1517bb143dc..15ea511fb41c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5564,13 +5564,12 @@ static int zone_batchsize(struct zone *zone) /* * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. But no more than 1/2 of a meg. - * - * OK, so we don't know how big the cache is. So guess. + * size of the zone. */ batch = zone->managed_pages / 1024; - if (batch * PAGE_SIZE > 512 * 1024) - batch = (512 * 1024) / PAGE_SIZE; + /* But no more than a meg. */ + if (batch * PAGE_SIZE > 1024 * 1024) + batch = (1024 * 1024) / PAGE_SIZE; batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; From 330d6e489a0ab49136561d7f792b1d81bcdbb83c Mon Sep 17 00:00:00 2001 From: Cannon Matthews Date: Fri, 17 Aug 2018 15:49:17 -0700 Subject: [PATCH 098/111] mm/hugetlb.c: don't zero 1GiB bootmem pages When using 1GiB pages during early boot, use the new memblock_virt_alloc_try_nid_raw() to allocate memory without zeroing it. Zeroing out hundreds or thousands of GiB in a single core memset() call is very slow, and can make early boot last upwards of 20-30 minutes on multi TiB machines. The memory does not need to be zero'd as the hugetlb pages are always zero'd on page fault. Tested: Booted with ~3800 1G pages, and it booted successfully in roughly the same amount of time as with 0, as opposed to the 25+ minutes it would take before. 
Link: http://lkml.kernel.org/r/20180711213313.92481-1-cannonmatthews@google.com Signed-off-by: Cannon Matthews Acked-by: Mike Kravetz Acked-by: Michal Hocko Cc: Andres Lagar-Cavilla Cc: Peter Feiner Cc: David Matlack Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4cea30ac5033..47566bb0b4b1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2101,7 +2101,7 @@ int __alloc_bootmem_huge_page(struct hstate *h) for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { void *addr; - addr = memblock_virt_alloc_try_nid_nopanic( + addr = memblock_virt_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), 0, BOOTMEM_ALLOC_ACCESSIBLE, node); if (addr) { @@ -2119,6 +2119,7 @@ int __alloc_bootmem_huge_page(struct hstate *h) found: BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); /* Put them into a private list first because mem_map is not up yet */ + INIT_LIST_HEAD(&m->list); list_add(&m->list, &huge_boot_pages); m->hstate = h; return 1; From 35fd1eb1e8212c02f6eae24335a9e5b80f9519b4 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 17 Aug 2018 15:49:21 -0700 Subject: [PATCH 099/111] mm/sparse: abstract sparse buffer allocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "sparse_init rewrite", v6. In sparse_init() we allocate two large buffers to temporarily hold usemap and memmap for the whole machine. However, we can avoid doing that if we change sparse_init() to operate on a per-node basis instead of doing it on the whole machine beforehand. As shown by Baoquan http://lkml.kernel.org/r/20180628062857.29658-1-bhe@redhat.com The buffers are large enough to stop the machine from booting on small memory systems. Another benefit of these changes is that they also obsolete CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER.
This patch (of 5): When struct pages are allocated for sparse-vmemmap VA layout, we first try to allocate one large buffer, and then if that fails allocate struct pages for each section as we go. The code that allocates buffer uses global variables and is spread across several call sites. Clean up the code by introducing three functions to handle the global buffer: sparse_buffer_init() initialize the buffer sparse_buffer_fini() free the remaining part of the buffer sparse_buffer_alloc() alloc from the buffer, and if buffer is empty return NULL Define these functions in sparse.c instead of sparse-vmemmap.c because later we will use them for non-vmemmap sparse allocations as well. [akpm@linux-foundation.org: use PTR_ALIGN()] [akpm@linux-foundation.org: s/BUG_ON/WARN_ON/] Link: http://lkml.kernel.org/r/20180712203730.8703-2-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Tested-by: Michael Ellerman [powerpc] Reviewed-by: Oscar Salvador Tested-by: Oscar Salvador Cc: Pasha Tatashin Cc: Steven Sistare Cc: Daniel Jordan Cc: "Kirill A.
Shutemov" Cc: Michal Hocko Cc: Dan Williams Cc: Jan Kara Cc: Jérôme Glisse Cc: Souptick Joarder Cc: Baoquan He Cc: Greg Kroah-Hartman Cc: Vlastimil Babka Cc: Wei Yang Cc: Dave Hansen Cc: David Rientjes Cc: Ingo Molnar Cc: Abdul Haleem Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ mm/sparse-vmemmap.c | 40 ++++++---------------------------------- mm/sparse.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 54 insertions(+), 35 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2fb32d1561eb..4ace5d50a892 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2671,6 +2671,10 @@ void sparse_mem_maps_populate_node(struct page **map_map, unsigned long map_count, int nodeid); +unsigned long __init section_map_size(void); +void sparse_buffer_init(unsigned long size, int nid); +void sparse_buffer_fini(void); +void *sparse_buffer_alloc(unsigned long size); struct page *sparse_mem_map_populate(unsigned long pnum, int nid, struct vmem_altmap *altmap); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 95e2c7638a5c..b05c7663c640 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -43,12 +43,9 @@ static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long goal) { return memblock_virt_alloc_try_nid_raw(size, align, goal, - BOOTMEM_ALLOC_ACCESSIBLE, node); + BOOTMEM_ALLOC_ACCESSIBLE, node); } -static void *vmemmap_buf; -static void *vmemmap_buf_end; - void * __meminit vmemmap_alloc_block(unsigned long size, int node) { /* If the main allocator is up use that, fallback to bootmem. 
*/ @@ -76,18 +73,10 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) /* need to make sure size is all the same during early stage */ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) { - void *ptr; - - if (!vmemmap_buf) - return vmemmap_alloc_block(size, node); - - /* take the from buf */ - ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size); - if (ptr + size > vmemmap_buf_end) - return vmemmap_alloc_block(size, node); - - vmemmap_buf = ptr + size; + void *ptr = sparse_buffer_alloc(size); + if (!ptr) + ptr = vmemmap_alloc_block(size, node); return ptr; } @@ -279,19 +268,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, unsigned long map_count, int nodeid) { unsigned long pnum; - unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; - void *vmemmap_buf_start; int nr_consumed_maps = 0; - size = ALIGN(size, PMD_SIZE); - vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count, - PMD_SIZE, __pa(MAX_DMA_ADDRESS)); - - if (vmemmap_buf_start) { - vmemmap_buf = vmemmap_buf_start; - vmemmap_buf_end = vmemmap_buf_start + size * map_count; - } - + sparse_buffer_init(section_map_size() * map_count, nodeid); for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) continue; @@ -303,12 +282,5 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); } - - if (vmemmap_buf_start) { - /* need to free left buf */ - memblock_free_early(__pa(vmemmap_buf), - vmemmap_buf_end - vmemmap_buf); - vmemmap_buf = NULL; - vmemmap_buf_end = NULL; - } + sparse_buffer_fini(); } diff --git a/mm/sparse.c b/mm/sparse.c index 2ea8b3dbd0df..9a0a5f598469 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -400,7 +400,14 @@ static void __init sparse_early_usemaps_alloc_node(void *data, } } -#ifndef CONFIG_SPARSEMEM_VMEMMAP +#ifdef CONFIG_SPARSEMEM_VMEMMAP +unsigned long __init 
section_map_size(void) + +{ + return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE); +} + +#else struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, struct vmem_altmap *altmap) { @@ -457,6 +464,42 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ +static void *sparsemap_buf __meminitdata; +static void *sparsemap_buf_end __meminitdata; + +void __init sparse_buffer_init(unsigned long size, int nid) +{ + WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */ + sparsemap_buf = + memblock_virt_alloc_try_nid_raw(size, PAGE_SIZE, + __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); + sparsemap_buf_end = sparsemap_buf + size; +} + +void __init sparse_buffer_fini(void) +{ + unsigned long size = sparsemap_buf_end - sparsemap_buf; + + if (sparsemap_buf && size > 0) + memblock_free_early(__pa(sparsemap_buf), size); + sparsemap_buf = NULL; +} + +void * __meminit sparse_buffer_alloc(unsigned long size) +{ + void *ptr = NULL; + + if (sparsemap_buf) { + ptr = PTR_ALIGN(sparsemap_buf, size); + if (ptr + size > sparsemap_buf_end) + ptr = NULL; + else + sparsemap_buf = ptr + size; + } + return ptr; +} + #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER static void __init sparse_early_mem_maps_alloc_node(void *data, unsigned long pnum_begin, From e131c06b14b8601e2b1dbc7ec9cc6418c293a067 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 17 Aug 2018 15:49:26 -0700 Subject: [PATCH 100/111] mm/sparse: use the new sparse buffer functions in non-vmemmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit non-vmemmap sparse also allocated large contiguous chunk of memory, and if fails falls back to smaller allocations. 
Use the same functions to allocate buffer as the vmemmap-sparse Link: http://lkml.kernel.org/r/20180712203730.8703-3-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Tested-by: Michael Ellerman [powerpc] Reviewed-by: Oscar Salvador Tested-by: Oscar Salvador Cc: Pasha Tatashin Cc: Abdul Haleem Cc: Baoquan He Cc: Daniel Jordan Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jan Kara Cc: Jérôme Glisse Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Souptick Joarder Cc: Steven Sistare Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 41 ++++++++++++++--------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index 9a0a5f598469..db4867b62fff 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -408,13 +408,20 @@ unsigned long __init section_map_size(void) } #else +unsigned long __init section_map_size(void) +{ + return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); +} + struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, struct vmem_altmap *altmap) { - struct page *map; - unsigned long size; + unsigned long size = section_map_size(); + struct page *map = sparse_buffer_alloc(size); + + if (map) + return map; - size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); map = memblock_virt_alloc_try_nid(size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); @@ -425,42 +432,22 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, unsigned long pnum_end, unsigned long map_count, int nodeid) { - void *map; unsigned long pnum; - unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; - int nr_consumed_maps; + unsigned long size = section_map_size(); + int nr_consumed_maps = 0; - size = PAGE_ALIGN(size); - map = memblock_virt_alloc_try_nid_raw(size * map_count, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nodeid); - if (map) { - 
nr_consumed_maps = 0; - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - map_map[nr_consumed_maps] = map; - map += size; - nr_consumed_maps++; - } - return; - } - - /* fallback */ - nr_consumed_maps = 0; + sparse_buffer_init(size * map_count, nodeid); for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - struct mem_section *ms; - if (!present_section_nr(pnum)) continue; map_map[nr_consumed_maps] = sparse_mem_map_populate(pnum, nodeid, NULL); if (map_map[nr_consumed_maps++]) continue; - ms = __nr_to_section(pnum); pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); } + sparse_buffer_fini(); } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ From afda57bc13410459fc957e93341ade7bebca36e2 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 17 Aug 2018 15:49:30 -0700 Subject: [PATCH 101/111] mm/sparse: move buffer init/fini to the common place MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that both variants of sparse memory use the same buffers to populate memory map, we can move sparse_buffer_init()/sparse_buffer_fini() to the common place. Link: http://lkml.kernel.org/r/20180712203730.8703-4-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Tested-by: Michael Ellerman [powerpc] Tested-by: Oscar Salvador Reviewed-by: Andrew Morton Cc: Pasha Tatashin Cc: Abdul Haleem Cc: Baoquan He Cc: Daniel Jordan Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jan Kara Cc: Jérôme Glisse Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Souptick Joarder Cc: Steven Sistare Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 --- mm/sparse-vmemmap.c | 2 -- mm/sparse.c | 14 +++++++------- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 4ace5d50a892..48040510df05 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2671,9 +2671,6 @@ void sparse_mem_maps_populate_node(struct page **map_map, unsigned long map_count, int nodeid); -unsigned long __init section_map_size(void); -void sparse_buffer_init(unsigned long size, int nid); -void sparse_buffer_fini(void); void *sparse_buffer_alloc(unsigned long size); struct page *sparse_mem_map_populate(unsigned long pnum, int nid, struct vmem_altmap *altmap); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index b05c7663c640..cd15f3d252c3 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -270,7 +270,6 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, unsigned long pnum; int nr_consumed_maps = 0; - sparse_buffer_init(section_map_size() * map_count, nodeid); for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) continue; @@ -282,5 +281,4 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); } - sparse_buffer_fini(); } diff --git a/mm/sparse.c b/mm/sparse.c index db4867b62fff..20ca292d8f11 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -401,14 +401,14 @@ static void __init sparse_early_usemaps_alloc_node(void *data, } #ifdef CONFIG_SPARSEMEM_VMEMMAP -unsigned long __init section_map_size(void) +static unsigned long __init section_map_size(void) { return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE); } #else -unsigned long __init section_map_size(void) +static unsigned long __init section_map_size(void) { return 
PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); } @@ -433,10 +433,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, unsigned long map_count, int nodeid) { unsigned long pnum; - unsigned long size = section_map_size(); int nr_consumed_maps = 0; - sparse_buffer_init(size * map_count, nodeid); for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) continue; @@ -447,14 +445,13 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); } - sparse_buffer_fini(); } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ static void *sparsemap_buf __meminitdata; static void *sparsemap_buf_end __meminitdata; -void __init sparse_buffer_init(unsigned long size, int nid) +static void __init sparse_buffer_init(unsigned long size, int nid) { WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */ sparsemap_buf = @@ -464,7 +461,7 @@ void __init sparse_buffer_init(unsigned long size, int nid) sparsemap_buf_end = sparsemap_buf + size; } -void __init sparse_buffer_fini(void) +static void __init sparse_buffer_fini(void) { unsigned long size = sparsemap_buf_end - sparsemap_buf; @@ -494,8 +491,11 @@ static void __init sparse_early_mem_maps_alloc_node(void *data, unsigned long map_count, int nodeid) { struct page **map_map = (struct page **)data; + + sparse_buffer_init(section_map_size() * map_count, nodeid); sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, map_count, nodeid); + sparse_buffer_fini(); } #else static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) From 85c77f79139062901727cc3bd87a65212c8c0a32 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 17 Aug 2018 15:49:33 -0700 Subject: [PATCH 102/111] mm/sparse: add new sparse_init_nid() and sparse_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sparse_init() requires to temporary allocate two 
large buffers: usemap_map and map_map. Baoquan He has identified that these buffers are so large that Linux is not bootable on small memory machines, such as a kdump boot. The buffers are especially large when CONFIG_X86_5LEVEL is set, as they are scaled to the maximum physical memory size. Baoquan provided a fix, which reduces the sizes of these buffers, but it is much better to get rid of them entirely. Add a new way to initialize sparse memory: sparse_init_nid(), which only operates within one memory node, and thus allocates memory either in one large contiguous block or section by section. This eliminates the need for temporary buffers. For simplified bisecting and review, temporarily name the new sparse_init() new_sparse_init(); the new interface is going to be enabled, and the old code removed, in the next patch. Link: http://lkml.kernel.org/r/20180712203730.8703-5-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Oscar Salvador Tested-by: Oscar Salvador Tested-by: Michael Ellerman [powerpc] Cc: Pasha Tatashin Cc: Abdul Haleem Cc: Baoquan He Cc: Daniel Jordan Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jan Kara Cc: Jérôme Glisse Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Souptick Joarder Cc: Steven Sistare Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/mm/sparse.c b/mm/sparse.c index 20ca292d8f11..248d5d7bbf55 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -200,6 +200,11 @@ static inline int next_present_section_nr(int section_nr) (section_nr <= __highest_present_section_nr)); \ section_nr = next_present_section_nr(section_nr)) +static inline unsigned long first_present_section_nr(void) +{ + return next_present_section_nr(-1); +} + /* * Record how many memory sections are marked as present * during system bootup.
@@ -668,6 +673,86 @@ void __init sparse_init(void) memblock_free_early(__pa(usemap_map), size); } +/* + * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end) + * And number of present sections in this node is map_count. + */ +static void __init sparse_init_nid(int nid, unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count) +{ + unsigned long pnum, usemap_longs, *usemap; + struct page *map; + + usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS); + usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid), + usemap_size() * + map_count); + if (!usemap) { + pr_err("%s: node[%d] usemap allocation failed", __func__, nid); + goto failed; + } + sparse_buffer_init(map_count * section_map_size(), nid); + for_each_present_section_nr(pnum_begin, pnum) { + if (pnum >= pnum_end) + break; + + map = sparse_mem_map_populate(pnum, nid, NULL); + if (!map) { + pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.", + __func__, nid); + pnum_begin = pnum; + goto failed; + } + check_usemap_section_nr(nid, usemap); + sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap); + usemap += usemap_longs; + } + sparse_buffer_fini(); + return; +failed: + /* We failed to allocate, mark all the following pnums as not present */ + for_each_present_section_nr(pnum_begin, pnum) { + struct mem_section *ms; + + if (pnum >= pnum_end) + break; + ms = __nr_to_section(pnum); + ms->section_mem_map = 0; + } +} + +/* + * Allocate the accumulated non-linear sections, allocate a mem_map + * for each and record the physical to section mapping. 
+ */ +void __init new_sparse_init(void) +{ + unsigned long pnum_begin = first_present_section_nr(); + int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin)); + unsigned long pnum_end, map_count = 1; + + /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ + set_pageblock_order(); + + for_each_present_section_nr(pnum_begin + 1, pnum_end) { + int nid = sparse_early_nid(__nr_to_section(pnum_end)); + + if (nid == nid_begin) { + map_count++; + continue; + } + /* Init node with sections in range [pnum_begin, pnum_end) */ + sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count); + nid_begin = nid; + pnum_begin = pnum_end; + map_count = 1; + } + /* cover the last node */ + sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count); + vmemmap_populate_print_last(); +} + #ifdef CONFIG_MEMORY_HOTPLUG /* Mark all memory sections within the pfn range as online */ From 2a3cb8baef71e4dad4a6ec17f5f0db9e05f46a01 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 17 Aug 2018 15:49:37 -0700 Subject: [PATCH 103/111] mm/sparse: delete old sparse_init and enable new one MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename new_sparse_init() to sparse_init(), which enables it. Delete the old sparse_init() and all the code that became obsolete with it. [pasha.tatashin@oracle.com: remove unused sparse_mem_maps_populate_node()] Link: http://lkml.kernel.org/r/20180716174447.14529-6-pasha.tatashin@oracle.com Link: http://lkml.kernel.org/r/20180712203730.8703-6-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Tested-by: Michael Ellerman [powerpc] Tested-by: Oscar Salvador Reviewed-by: Oscar Salvador Cc: Pasha Tatashin Cc: Abdul Haleem Cc: Baoquan He Cc: Daniel Jordan Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jan Kara Cc: Jérôme Glisse Cc: "Kirill A.
Shutemov" Cc: Michal Hocko Cc: Souptick Joarder Cc: Steven Sistare Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 -- mm/Kconfig | 4 - mm/sparse-vmemmap.c | 21 ---- mm/sparse.c | 237 +------------------------------------------- 4 files changed, 1 insertion(+), 267 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 48040510df05..a3cae495f9ce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2665,12 +2665,6 @@ extern int randomize_va_space; const char * arch_vma_name(struct vm_area_struct *vma); void print_vma_addr(char *prefix, unsigned long rip); -void sparse_mem_maps_populate_node(struct page **map_map, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long map_count, - int nodeid); - void *sparse_buffer_alloc(unsigned long size); struct page *sparse_mem_map_populate(unsigned long pnum, int nid, struct vmem_altmap *altmap); diff --git a/mm/Kconfig b/mm/Kconfig index 08d8399bb93b..adfeae4decb4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -118,10 +118,6 @@ config SPARSEMEM_EXTREME config SPARSEMEM_VMEMMAP_ENABLE bool -config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - def_bool y - depends on SPARSEMEM && X86_64 - config SPARSEMEM_VMEMMAP bool "Sparse Memory virtual memmap" depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index cd15f3d252c3..8301293331a2 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -261,24 +261,3 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid, return map; } - -void __init sparse_mem_maps_populate_node(struct page **map_map, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long map_count, int nodeid) -{ - unsigned long pnum; - int nr_consumed_maps = 0; - - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - - map_map[nr_consumed_maps] = - sparse_mem_map_populate(pnum, nodeid, NULL); 
- if (map_map[nr_consumed_maps++]) - continue; - pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", - __func__); - } -} diff --git a/mm/sparse.c b/mm/sparse.c index 248d5d7bbf55..10b07eea9a6e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -205,12 +205,6 @@ static inline unsigned long first_present_section_nr(void) return next_present_section_nr(-1); } -/* - * Record how many memory sections are marked as present - * during system bootup. - */ -static int __initdata nr_present_sections; - /* Record a memory area against a node. */ void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -240,7 +234,6 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) ms->section_mem_map = sparse_encode_early_nid(nid) | SECTION_IS_ONLINE; section_mark_present(ms); - nr_present_sections++; } } } @@ -377,37 +370,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #endif /* CONFIG_MEMORY_HOTREMOVE */ -static void __init sparse_early_usemaps_alloc_node(void *data, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long usemap_count, int nodeid) -{ - void *usemap; - unsigned long pnum; - unsigned long **usemap_map = (unsigned long **)data; - int size = usemap_size(); - int nr_consumed_maps = 0; - - usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), - size * usemap_count); - if (!usemap) { - pr_warn("%s: allocation failed\n", __func__); - return; - } - - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - usemap_map[nr_consumed_maps] = usemap; - usemap += size; - check_usemap_section_nr(nodeid, usemap_map[nr_consumed_maps]); - nr_consumed_maps++; - } -} - #ifdef CONFIG_SPARSEMEM_VMEMMAP static unsigned long __init section_map_size(void) - { return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE); } @@ -432,25 +396,6 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, 
BOOTMEM_ALLOC_ACCESSIBLE, nid); return map; } -void __init sparse_mem_maps_populate_node(struct page **map_map, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long map_count, int nodeid) -{ - unsigned long pnum; - int nr_consumed_maps = 0; - - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - map_map[nr_consumed_maps] = - sparse_mem_map_populate(pnum, nodeid, NULL); - if (map_map[nr_consumed_maps++]) - continue; - pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", - __func__); - } -} #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ static void *sparsemap_buf __meminitdata; @@ -489,190 +434,10 @@ void * __meminit sparse_buffer_alloc(unsigned long size) return ptr; } -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER -static void __init sparse_early_mem_maps_alloc_node(void *data, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long map_count, int nodeid) -{ - struct page **map_map = (struct page **)data; - - sparse_buffer_init(section_map_size() * map_count, nodeid); - sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, - map_count, nodeid); - sparse_buffer_fini(); -} -#else -static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) -{ - struct page *map; - struct mem_section *ms = __nr_to_section(pnum); - int nid = sparse_early_nid(ms); - - map = sparse_mem_map_populate(pnum, nid, NULL); - if (map) - return map; - - pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", - __func__); - return NULL; -} -#endif - void __weak __meminit vmemmap_populate_print_last(void) { } -/** - * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap - * @map: usemap_map for pageblock flags or mmap_map for vmemmap - * @unit_size: size of map unit - */ -static void __init alloc_usemap_and_memmap(void (*alloc_func) - (void *, unsigned long, unsigned long, - unsigned long, int), void *data, - int 
data_unit_size) -{ - unsigned long pnum; - unsigned long map_count; - int nodeid_begin = 0; - unsigned long pnum_begin = 0; - - for_each_present_section_nr(0, pnum) { - struct mem_section *ms; - - ms = __nr_to_section(pnum); - nodeid_begin = sparse_early_nid(ms); - pnum_begin = pnum; - break; - } - map_count = 1; - for_each_present_section_nr(pnum_begin + 1, pnum) { - struct mem_section *ms; - int nodeid; - - ms = __nr_to_section(pnum); - nodeid = sparse_early_nid(ms); - if (nodeid == nodeid_begin) { - map_count++; - continue; - } - /* ok, we need to take cake of from pnum_begin to pnum - 1*/ - alloc_func(data, pnum_begin, pnum, - map_count, nodeid_begin); - /* new start, update count etc*/ - nodeid_begin = nodeid; - pnum_begin = pnum; - data += map_count * data_unit_size; - map_count = 1; - } - /* ok, last chunk */ - alloc_func(data, pnum_begin, __highest_present_section_nr+1, - map_count, nodeid_begin); -} - -/* - * Allocate the accumulated non-linear sections, allocate a mem_map - * for each and record the physical to section mapping. - */ -void __init sparse_init(void) -{ - unsigned long pnum; - struct page *map; - unsigned long *usemap; - unsigned long **usemap_map; - int size; - int nr_consumed_maps = 0; -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - int size2; - struct page **map_map; -#endif - - /* see include/linux/mmzone.h 'struct mem_section' definition */ - BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); - - /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ - set_pageblock_order(); - - /* - * map is using big page (aka 2M in x86 64 bit) - * usemap is less one page (aka 24 bytes) - * so alloc 2M (with 2M align) and 24 bytes in turn will - * make next 2M slip to one more 2M later. - * then in big system, the memory will have a lot of holes... - * here try to allocate 2M pages continuously. - * - * powerpc need to call sparse_init_one_section right after each - * sparse_early_mem_map_alloc, so allocate usemap_map at first. 
- */ - size = sizeof(unsigned long *) * nr_present_sections; - usemap_map = memblock_virt_alloc(size, 0); - if (!usemap_map) - panic("can not allocate usemap_map\n"); - alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, - (void *)usemap_map, - sizeof(usemap_map[0])); - -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - size2 = sizeof(struct page *) * nr_present_sections; - map_map = memblock_virt_alloc(size2, 0); - if (!map_map) - panic("can not allocate map_map\n"); - alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, - (void *)map_map, - sizeof(map_map[0])); -#endif - - /* - * The number of present sections stored in nr_present_sections - * are kept the same since mem sections are marked as present in - * memory_present(). In this for loop, we need check which sections - * failed to allocate memmap or usemap, then clear its - * ->section_mem_map accordingly. During this process, we need - * increase 'nr_consumed_maps' whether its allocation of memmap - * or usemap failed or not, so that after we handle the i-th - * memory section, can get memmap and usemap of (i+1)-th section - * correctly. 
- */ - for_each_present_section_nr(0, pnum) { - struct mem_section *ms; - - if (nr_consumed_maps >= nr_present_sections) { - pr_err("nr_consumed_maps goes beyond nr_present_sections\n"); - break; - } - ms = __nr_to_section(pnum); - usemap = usemap_map[nr_consumed_maps]; - if (!usemap) { - ms->section_mem_map = 0; - nr_consumed_maps++; - continue; - } - -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - map = map_map[nr_consumed_maps]; -#else - map = sparse_early_mem_map_alloc(pnum); -#endif - if (!map) { - ms->section_mem_map = 0; - nr_consumed_maps++; - continue; - } - - sparse_init_one_section(__nr_to_section(pnum), pnum, map, - usemap); - nr_consumed_maps++; - } - - vmemmap_populate_print_last(); - -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - memblock_free_early(__pa(map_map), size2); -#endif - memblock_free_early(__pa(usemap_map), size); -} - /* * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end) * And number of present sections in this node is map_count. @@ -726,7 +491,7 @@ failed: * Allocate the accumulated non-linear sections, allocate a mem_map * for each and record the physical to section mapping. */ -void __init new_sparse_init(void) +void __init sparse_init(void) { unsigned long pnum_begin = first_present_section_nr(); int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin)); From 14fef28414c365a979311821bbf1018a8290cc0f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 17 Aug 2018 15:49:41 -0700 Subject: [PATCH 104/111] mm, swap: make CONFIG_THP_SWAP depend on CONFIG_SWAP CONFIG_THP_SWAP should depend on CONFIG_SWAP, because it's unreasonable to optimize swapping for THP (Transparent Huge Page) without basic swapping support. In original code, when CONFIG_SWAP=n and CONFIG_THP_SWAP=y, split_swap_cluster() will not be built because it is in swapfile.c, but it will be called in huge_memory.c. 
This doesn't trigger a build error in practice because the call site is enclosed by PageSwapCache(), which is defined to be constant 0 when CONFIG_SWAP=n. But this is fragile and should be fixed. The comments are fixed too to reflect the latest progress. Link: http://lkml.kernel.org/r/20180713021228.439-1-ying.huang@intel.com Fixes: 38d8b4e6bdc8 ("mm, THP, swap: delay splitting THP during swap out") Signed-off-by: "Huang, Ying" Reviewed-by: Dan Williams Reviewed-by: Naoya Horiguchi Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Cc: Rik van Riel Cc: Dave Hansen Cc: Zi Yan Cc: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index adfeae4decb4..a550635ea5c3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -419,10 +419,11 @@ config ARCH_WANTS_THP_SWAP config THP_SWAP def_bool y - depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP + depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP help Swap transparent huge pages in one piece, without splitting. - XXX: For now this only does clustered swap space allocation. + XXX: For now, swap cluster backing transparent huge page + will be split after swapout. For selection by architectures with reasonable THP sizes. From 87a5ffc163966b2eb675c9c863c0caccab3183f6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 17 Aug 2018 15:49:45 -0700 Subject: [PATCH 105/111] mm/list_lru.c: use list_lru_walk_one() in list_lru_walk_node() Patch series "mm/list_lru: Add list_lru_shrink_walk_irq() and a user". This series removes the local_irq_disable() around list_lru_shrink_walk() (as used by mm/workingset) by adding list_lru_shrink_walk_irq(). Vladimir Davydov preferred this over `irq' argument which I added to struct list_lru. 
The initial post (of this series) received a Reviewed-by tag from Vladimir Davydov which I added to each patch of the series. The series applies on top of akpm's tree which has Kirill's shrink_slab series and does not clash with it (akpm asked me to wait a week or so and repost it then). I tested the code paths by triggering the OOM-killer via memory overcommit and lockdep did not complain (nor did I see any warnings). This patch (of 4): list_lru_walk_node() invokes __list_lru_walk_one() with -1 as the memcg_idx parameter. The same can be achieved by calling list_lru_walk_one() and passing NULL as the memcg argument, which then gets converted into -1. This is a preparation step for lifting the spin_lock() call to the caller of __list_lru_walk_one(). Invoke list_lru_walk_one() instead of __list_lru_walk_one() when possible. Link: http://lkml.kernel.org/r/20180716111921.5365-2-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Vladimir Davydov Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index c9bdde9c03d1..522b98ca76b3 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -287,8 +287,8 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, long isolated = 0; int memcg_idx; - isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg, - nr_to_walk); + isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, + nr_to_walk); if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { for_each_memcg_cache_index(memcg_idx) { isolated += __list_lru_walk_one(lru, nid, memcg_idx, From 6cfe57a96ba33139c57056185d5eee9018e3dedc Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 17 Aug 2018 15:49:48 -0700 Subject: [PATCH 106/111] mm/list_lru.c: move locking from __list_lru_walk_one() to its caller Move the locking inside __list_lru_walk_one() to its caller.
This is a preparation step in order to introduce list_lru_walk_one_irq() which does spin_lock_irq() instead of spin_lock() for the locking. Link: http://lkml.kernel.org/r/20180716111921.5365-3-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Vladimir Davydov Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 522b98ca76b3..286f92a0963e 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -219,7 +219,6 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, struct list_head *item, *n; unsigned long isolated = 0; - spin_lock(&nlru->lock); l = list_lru_from_memcg_idx(nlru, memcg_idx); restart: list_for_each_safe(item, n, &l->list) { @@ -265,8 +264,6 @@ restart: BUG(); } } - - spin_unlock(&nlru->lock); return isolated; } @@ -275,8 +272,14 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { - return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg), - isolate, cb_arg, nr_to_walk); + struct list_lru_node *nlru = &lru->node[nid]; + unsigned long ret; + + spin_lock(&nlru->lock); + ret = __list_lru_walk_one(lru, nid, memcg_cache_id(memcg), + isolate, cb_arg, nr_to_walk); + spin_unlock(&nlru->lock); + return ret; } EXPORT_SYMBOL_GPL(list_lru_walk_one); @@ -291,8 +294,13 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, nr_to_walk); if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { for_each_memcg_cache_index(memcg_idx) { + struct list_lru_node *nlru = &lru->node[nid]; + + spin_lock(&nlru->lock); isolated += __list_lru_walk_one(lru, nid, memcg_idx, isolate, cb_arg, nr_to_walk); + spin_unlock(&nlru->lock); + if (*nr_to_walk <= 0) break; } From 6e018968f8d384d84484eba8e4c90489a25d7095 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 17 Aug 2018 
15:49:51 -0700 Subject: [PATCH 107/111] mm/list_lru.c: pass struct list_lru_node* as an argument to __list_lru_walk_one() __list_lru_walk_one() is invoked with struct list_lru *lru, int nid as the first two argument. Those two are only used to retrieve struct list_lru_node. Since this is already done by the caller of the function for the locking, we can pass struct list_lru_node* directly and avoid the dance around it. Link: http://lkml.kernel.org/r/20180716111921.5365-4-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Vladimir Davydov Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 286f92a0963e..f5c6a2d1ea66 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -209,12 +209,11 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid) EXPORT_SYMBOL_GPL(list_lru_count_node); static unsigned long -__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, +__list_lru_walk_one(struct list_lru_node *nlru, int memcg_idx, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { - struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; struct list_head *item, *n; unsigned long isolated = 0; @@ -276,8 +275,8 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, unsigned long ret; spin_lock(&nlru->lock); - ret = __list_lru_walk_one(lru, nid, memcg_cache_id(memcg), - isolate, cb_arg, nr_to_walk); + ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg, + nr_to_walk); spin_unlock(&nlru->lock); return ret; } @@ -297,8 +296,9 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, struct list_lru_node *nlru = &lru->node[nid]; spin_lock(&nlru->lock); - isolated += __list_lru_walk_one(lru, nid, memcg_idx, - isolate, cb_arg, nr_to_walk); + isolated += __list_lru_walk_one(nlru, memcg_idx, + isolate, 
cb_arg, + nr_to_walk); spin_unlock(&nlru->lock); if (*nr_to_walk <= 0) From 6b51e88199ca4f75ff647eff28efd30bfcb08dc4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 17 Aug 2018 15:49:55 -0700 Subject: [PATCH 108/111] mm/list_lru: introduce list_lru_shrink_walk_irq() Provide list_lru_shrink_walk_irq() and let it behave like list_lru_walk_one() except that it locks the spinlock with spin_lock_irq(). This is used by scan_shadow_nodes() because its lock nests within the i_pages lock which is acquired with IRQs disabled. This change allows the use of proper locking primitives instead of a hand-crafted local_irq_disable() plus spin_lock(). There is no EXPORT_SYMBOL provided because the current user is in-kernel only. Add list_lru_shrink_walk_irq() which acquires the spinlock with the proper locking primitives. Link: http://lkml.kernel.org/r/20180716111921.5365-5-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Vladimir Davydov Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_lru.h | 25 +++++++++++++++++++++++++ mm/list_lru.c | 15 +++++++++++++++ mm/workingset.c | 8 ++------ 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index d9c16f2f2f00..aa5efd9351eb 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -166,6 +166,23 @@ unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); +/** + * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items. + * @lru: the lru pointer. + * @nid: the node id to scan from. + * @memcg: the cgroup to scan from. + * @isolate: callback function that is resposible for deciding what to do with + * the item currently being scanned + * @cb_arg: opaque type that will be passed to @isolate + * @nr_to_walk: how many items to scan. 
+ * + * Same as @list_lru_walk_one except that the spinlock is acquired with + * spin_lock_irq(). + */ +unsigned long list_lru_walk_one_irq(struct list_lru *lru, + int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk); unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); @@ -178,6 +195,14 @@ list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc, &sc->nr_to_scan); } +static inline unsigned long +list_lru_shrink_walk_irq(struct list_lru *lru, struct shrink_control *sc, + list_lru_walk_cb isolate, void *cb_arg) +{ + return list_lru_walk_one_irq(lru, sc->nid, sc->memcg, isolate, cb_arg, + &sc->nr_to_scan); +} + static inline unsigned long list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate, void *cb_arg, unsigned long nr_to_walk) diff --git a/mm/list_lru.c b/mm/list_lru.c index f5c6a2d1ea66..5b30625fd365 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -282,6 +282,21 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, } EXPORT_SYMBOL_GPL(list_lru_walk_one); +unsigned long +list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + struct list_lru_node *nlru = &lru->node[nid]; + unsigned long ret; + + spin_lock_irq(&nlru->lock); + ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg, + nr_to_walk); + spin_unlock_irq(&nlru->lock); + return ret; +} + unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) diff --git a/mm/workingset.c b/mm/workingset.c index bc72ad029b3e..4516dd790129 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -483,13 +483,9 @@ out: static unsigned long scan_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { - unsigned long ret; - /* list_lru lock nests inside the 
IRQ-safe i_pages lock */ - local_irq_disable(); - ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); - local_irq_enable(); - return ret; + return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate, + NULL); } static struct shrinker workingset_shadow_shrinker = { From ddbf369c0a33924f76d092985bd20d9310f43d7f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 17 Aug 2018 15:49:58 -0700 Subject: [PATCH 109/111] mm, vmacache: hash addresses based on pmd When perf profiling a wide variety of different workloads, it was found that vmacache_find() had higher than expected cost: up to 0.08% of cpu utilization in some cases. This was found to rival other core VM functions such as alloc_pages_vma() with thp enabled and default mempolicy, and the conditionals in __get_vma_policy(). VMACACHE_HASH() determines which of the four per-task_struct slots a vma is cached for a particular address. This currently depends on the pfn, so pfn 5212 occupies a different vmacache slot than its neighboring pfn 5213. vmacache_find() iterates through all four of current's vmacache slots when looking up an address. Hashing based on pfn, an address has ~1/VMACACHE_SIZE chance of being cached in the first vmacache slot, or about 25%, *if* the vma is cached. This patch hashes an address by its pmd instead of pte to optimize for workloads with good spatial locality. This results in a higher probability of vmas being cached in the first slot that is checked: normally ~70% on the same workloads instead of 25%. 
[rientjes@google.com: various updates] Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1807231532290.109445@chino.kir.corp.google.com Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1807091749150.114630@chino.kir.corp.google.com Signed-off-by: David Rientjes Reviewed-by: Andrew Morton Cc: Davidlohr Bueso Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmacache.h | 6 ------ mm/vmacache.c | 38 +++++++++++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h index a5b3aa8d281f..3e9a963edd6a 100644 --- a/include/linux/vmacache.h +++ b/include/linux/vmacache.h @@ -5,12 +5,6 @@ #include #include -/* - * Hash based on the page number. Provides a good hit rate for - * workloads with good locality and those with random accesses as well. - */ -#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK) - static inline void vmacache_flush(struct task_struct *tsk) { memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas)); diff --git a/mm/vmacache.c b/mm/vmacache.c index db7596eb6132..ea517bef7dc5 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -6,6 +6,18 @@ #include #include #include +#include + +/* + * Hash based on the pmd of addr if configured with MMU, which provides a good + * hit rate for workloads with spatial locality. Otherwise, use pages. + */ +#ifdef CONFIG_MMU +#define VMACACHE_SHIFT PMD_SHIFT +#else +#define VMACACHE_SHIFT PAGE_SHIFT +#endif +#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK) /* * Flush vma caches for threads that share a given mm. 
@@ -87,6 +99,7 @@ static bool vmacache_valid(struct mm_struct *mm) struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) { + int idx = VMACACHE_HASH(addr); int i; count_vm_vmacache_event(VMACACHE_FIND_CALLS); @@ -95,16 +108,20 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) return NULL; for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[i]; + struct vm_area_struct *vma = current->vmacache.vmas[idx]; - if (!vma) - continue; - if (WARN_ON_ONCE(vma->vm_mm != mm)) - break; - if (vma->vm_start <= addr && vma->vm_end > addr) { - count_vm_vmacache_event(VMACACHE_FIND_HITS); - return vma; + if (vma) { +#ifdef CONFIG_DEBUG_VM_VMACACHE + if (WARN_ON_ONCE(vma->vm_mm != mm)) + break; +#endif + if (vma->vm_start <= addr && vma->vm_end > addr) { + count_vm_vmacache_event(VMACACHE_FIND_HITS); + return vma; + } } + if (++idx == VMACACHE_SIZE) + idx = 0; } return NULL; @@ -115,6 +132,7 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, unsigned long start, unsigned long end) { + int idx = VMACACHE_HASH(start); int i; count_vm_vmacache_event(VMACACHE_FIND_CALLS); @@ -123,12 +141,14 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, return NULL; for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[i]; + struct vm_area_struct *vma = current->vmacache.vmas[idx]; if (vma && vma->vm_start == start && vma->vm_end == end) { count_vm_vmacache_event(VMACACHE_FIND_HITS); return vma; } + if (++idx == VMACACHE_SIZE) + idx = 0; } return NULL; From 5241d4727479aad77af50b80757c38268bfa4560 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 17 Aug 2018 15:50:01 -0700 Subject: [PATCH 110/111] fs/userfaultfd.c: remove redundant pointer uwq Pointer uwq is being assigned but is never used hence it is redundant and can be removed. 
Cleans up clang warning: warning: variable 'uwq' set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/20180717090802.18357-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index bad9cea37f12..15c265d450bf 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1849,17 +1849,14 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) { struct userfaultfd_ctx *ctx = f->private_data; wait_queue_entry_t *wq; - struct userfaultfd_wait_queue *uwq; unsigned long pending = 0, total = 0; spin_lock(&ctx->fault_pending_wqh.lock); list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) { - uwq = container_of(wq, struct userfaultfd_wait_queue, wq); pending++; total++; } list_for_each_entry(wq, &ctx->fault_wqh.head, entry) { - uwq = container_of(wq, struct userfaultfd_wait_queue, wq); total++; } spin_unlock(&ctx->fault_pending_wqh.lock); From 1e9264192961aa519595170aa8b0f7651a2ad28e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 17 Aug 2018 15:50:07 -0700 Subject: [PATCH 111/111] mm/hmm.c: remove unused variables align_start and align_end Variables align_start and align_size are being assigned but are never used, hence they are redundant and can be removed. 
Cleans up clang warnings: warning: variable 'align_start' set but not used [-Wunused-but-set-variable] warning: variable 'align_size' set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/20180714161124.3923-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hmm.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index caf9df27599e..76e7a058b32f 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -973,10 +973,7 @@ static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); static void hmm_devmem_radix_release(struct resource *resource) { - resource_size_t key, align_start, align_size; - - align_start = resource->start & ~(PA_SECTION_SIZE - 1); - align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); + resource_size_t key; mutex_lock(&hmm_devmem_lock); for (key = resource->start;