diff --git a/MAINTAINERS b/MAINTAINERS index 38df53f828e1..9bff63cf326e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2975,6 +2975,7 @@ F: kernel/cpuset.c CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG) M: Johannes Weiner M: Michal Hocko +M: Vladimir Davydov L: cgroups@vger.kernel.org L: linux-mm@kvack.org S: Maintained diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c index d8117be729a2..730d394ce5f0 100644 --- a/arch/mips/mm/dma-default.c +++ b/arch/mips/mm/dma-default.c @@ -145,7 +145,7 @@ static void *mips_dma_alloc_coherent(struct device *dev, size_t size, gfp = massage_gfp_flags(dev, gfp); - if (IS_ENABLED(CONFIG_DMA_CMA) && !(gfp & GFP_ATOMIC)) + if (IS_ENABLED(CONFIG_DMA_CMA) && gfpflags_allow_blocking(gfp)) page = dma_alloc_from_contiguous(dev, count, get_order(size)); if (!page) diff --git a/arch/sh/include/uapi/asm/unistd_64.h b/arch/sh/include/uapi/asm/unistd_64.h index e6820c86e8c7..47ebd5b5ed55 100644 --- a/arch/sh/include/uapi/asm/unistd_64.h +++ b/arch/sh/include/uapi/asm/unistd_64.h @@ -278,7 +278,7 @@ #define __NR_fsetxattr 256 #define __NR_getxattr 257 #define __NR_lgetxattr 258 -#define __NR_fgetxattr 269 +#define __NR_fgetxattr 259 #define __NR_listxattr 260 #define __NR_llistxattr 261 #define __NR_flistxattr 262 diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 2804aed3f416..25425d3f2575 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -303,6 +303,10 @@ static int memory_subsys_offline(struct device *dev) if (mem->state == MEM_OFFLINE) return 0; + /* Can't offline block with non-present sections */ + if (mem->section_count != sections_per_block) + return -EINVAL; + return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); } diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 73c64daa0f55..60f03b78914e 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -592,10 +592,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) } unlock_page(page); } - if (PageDirty(page) || PageWriteback(page)) - *uptodate = true; - else - *uptodate = PageUptodate(page); + *uptodate = PageUptodate(page); EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate); return page; } else { diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 5c0c6b58157f..9aebffb40505 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -476,10 +476,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) } unlock_page(page); } - if (PageDirty(page) || PageWriteback(page)) - *uptodate = true; - else - *uptodate = PageUptodate(page); + *uptodate = PageUptodate(page); dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); return page; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a03f6f433075..3123408da935 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -367,13 +367,11 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - status = posix_acl_create(dir, &mode, &default_acl, &acl); + status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (status) { mlog_errno(status); goto leave; } - /* update inode->i_mode after mask with "umask". */ - inode->i_mode = mode; handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, S_ISDIR(mode), diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index d0a1f99e24e3..4894c6888bc6 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -25,7 +25,7 @@ #ifdef CONFIG_DEBUG_KMEMLEAK -extern void kmemleak_init(void) __ref; +extern void kmemleak_init(void) __init; extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) __ref; extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 0adedca24c5b..0e1b1540597a 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -99,7 +99,7 @@ static inline int try_stop_cpus(const struct cpumask *cpumask, * grabbing every spinlock (and more). So the "read" side to such a * lock is anything which disables preemption. */ -#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) +#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) /** * stop_machine: freeze the machine on all CPUs and run this function @@ -118,7 +118,7 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); -#else /* CONFIG_STOP_MACHINE && CONFIG_SMP */ +#else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ static inline int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) @@ -137,5 +137,5 @@ static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, return stop_machine(fn, data, cpus); } -#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */ +#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ #endif /* _LINUX_STOP_MACHINE */ diff --git a/init/Kconfig b/init/Kconfig index c24b6f767bf0..235c7a2c0d20 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2030,13 +2030,6 @@ config INIT_ALL_POSSIBLE it was better to provide this option than to break all the archs and have several arch maintainers pursuing me down dark alleys. -config STOP_MACHINE - bool - default y - depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU - help - Need stop_machine() primitive. - source "block/Kconfig" config PREEMPT_NOTIFIERS diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 867bc20e1ef1..a3bbaee77c58 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -531,7 +531,7 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -#ifdef CONFIG_STOP_MACHINE +#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { @@ -631,4 +631,4 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, return ret ?: done.ret; } -#endif /* CONFIG_STOP_MACHINE */ +#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8ed2ffd963c5..7340353f8aea 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -957,8 +957,9 @@ EXPORT_SYMBOL(congestion_wait); * jiffies for either a BDI to exit congestion of the given @sync queue * or a write to complete. * - * In the absence of zone congestion, cond_resched() is called to yield - * the processor if necessary but otherwise does not sleep. + * In the absence of zone congestion, a short sleep or a cond_resched is + * performed to yield the processor and to allow other subsystems to make + * a forward progress. * * The return value is 0 if the sleep is for the full timeout. Otherwise, * it is the number of jiffies that were still remaining when the function @@ -978,7 +979,19 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) */ if (atomic_read(&nr_wb_congested[sync]) == 0 || !test_bit(ZONE_CONGESTED, &zone->flags)) { - cond_resched(); + + /* + * Memory allocation/reclaim might be called from a WQ + * context and the current implementation of the WQ + * concurrency control doesn't recognize that a particular + * WQ is congested if the worker thread is looping without + * ever sleeping. Therefore we have to do a short sleep + * here rather than calling cond_resched(). + */ + if (current->flags & PF_WQ_WORKER) + schedule_timeout(1); + else + cond_resched(); /* In case we scheduled, work out time remaining */ ret = timeout - (jiffies - start); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 827bb02a43a4..ef6963b577fd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -372,8 +372,10 @@ retry_locked: spin_unlock(&resv->lock); trg = kmalloc(sizeof(*trg), GFP_KERNEL); - if (!trg) + if (!trg) { + kfree(nrg); return -ENOMEM; + } spin_lock(&resv->lock); list_add(&trg->link, &resv->region_cache); @@ -483,8 +485,16 @@ static long region_del(struct resv_map *resv, long f, long t) retry: spin_lock(&resv->lock); list_for_each_entry_safe(rg, trg, head, link) { - if (rg->to <= f) + /* + * Skip regions before the range to be deleted. file_region + * ranges are normally of the form [from, to). However, there + * may be a "placeholder" entry in the map which is of the form + * (from, to) with from == to. Check for placeholder entries + * at the beginning of the range to be deleted. + */ + if (rg->to <= f && (rg->to != rg->from || rg->to != f)) continue; + if (rg->from >= t) break; @@ -1886,7 +1896,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = __alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; - + if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { + SetPagePrivate(page); + h->resv_huge_pages--; + } spin_lock(&hugetlb_lock); list_move(&page->lru, &h->hugepage_activelist); /* Fall through */ @@ -3693,12 +3706,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); + } else { + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; } - ptep = huge_pte_alloc(mm, address, huge_page_size(h)); - if (!ptep) - return VM_FAULT_OOM; - mapping = vma->vm_file->f_mapping; idx = vma_hugecache_offset(h, vma, address); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c92a65b2b4ab..e234c21a5e6c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2128,7 +2128,7 @@ done_restock: */ do { if (page_counter_read(&memcg->memory) > memcg->high) { - current->memcg_nr_pages_over_high += nr_pages; + current->memcg_nr_pages_over_high += batch; set_notify_resume(current); break; } @@ -5512,11 +5512,11 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) * mem_cgroup_replace_page - migrate a charge to another page * @oldpage: currently charged page * @newpage: page to transfer the charge to - * @lrucare: either or both pages might be on the LRU already * * Migrate the charge from @oldpage to @newpage. * * Both pages must be locked, @newpage->mapping must be set up. + * Either or both pages might be on the LRU already. */ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) { diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d13a33918fa2..c12680993ff3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -608,6 +608,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, continue; if (unlikely(p->flags & PF_KTHREAD)) continue; + if (is_global_init(p)) + continue; if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) continue; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 17a3c66639a9..9d666df5ef95 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3647,8 +3647,9 @@ static void show_migration_types(unsigned char type) { static const char types[MIGRATE_TYPES] = { [MIGRATE_UNMOVABLE] = 'U', - [MIGRATE_RECLAIMABLE] = 'E', [MIGRATE_MOVABLE] = 'M', + [MIGRATE_RECLAIMABLE] = 'E', + [MIGRATE_HIGHATOMIC] = 'H', #ifdef CONFIG_CMA [MIGRATE_CMA] = 'C', #endif diff --git a/mm/shmem.c b/mm/shmem.c index 9187eee4128b..2afcdbbdb685 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -843,14 +843,14 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) list_add_tail(&info->swaplist, &shmem_swaplist); if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { + spin_lock(&info->lock); + shmem_recalc_inode(inode); + info->swapped++; + spin_unlock(&info->lock); + swap_shmem_alloc(swap); shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); - spin_lock(&info->lock); - info->swapped++; - shmem_recalc_inode(inode); - spin_unlock(&info->lock); - mutex_unlock(&shmem_swaplist_mutex); BUG_ON(page_mapped(page)); swap_writepage(page, wbc); @@ -1078,7 +1078,7 @@ repeat: if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; - goto failed; + goto unlock; } if (page && sgp == SGP_WRITE) @@ -1246,11 +1246,15 @@ clear: /* Perhaps the file has been truncated since we checked */ if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + if (alloced) { + ClearPageDirty(page); + delete_from_page_cache(page); + spin_lock(&info->lock); + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + } error = -EINVAL; - if (alloced) - goto trunc; - else - goto failed; + goto unlock; } *pagep = page; return 0; @@ -1258,23 +1262,13 @@ clear: /* * Error recovery. */ -trunc: - info = SHMEM_I(inode); - ClearPageDirty(page); - delete_from_page_cache(page); - spin_lock(&info->lock); - info->alloced--; - inode->i_blocks -= BLOCKS_PER_PAGE; - spin_unlock(&info->lock); decused: - sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) percpu_counter_add(&sbinfo->used_blocks, -1); unacct: shmem_unacct_blocks(info->flags, 1); failed: - if (swap.val && error != -EINVAL && - !shmem_confirm_swap(mapping, index, swap)) + if (swap.val && !shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; unlock: if (page) { diff --git a/mm/vmstat.c b/mm/vmstat.c index 879a2be23325..0d5712b0206c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -921,8 +921,8 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, #ifdef CONFIG_PROC_FS static char * const migratetype_names[MIGRATE_TYPES] = { "Unmovable", - "Reclaimable", "Movable", + "Reclaimable", "HighAtomic", #ifdef CONFIG_CMA "CMA", @@ -1379,6 +1379,7 @@ static const struct file_operations proc_vmstat_file_operations = { #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SMP +static struct workqueue_struct *vmstat_wq; static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; static cpumask_var_t cpu_stat_off; @@ -1391,7 +1392,7 @@ static void vmstat_update(struct work_struct *w) * to occur in the future. Keep on running the * update worker thread. */ - schedule_delayed_work_on(smp_processor_id(), + queue_delayed_work_on(smp_processor_id(), vmstat_wq, this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); } else { @@ -1460,7 +1461,7 @@ static void vmstat_shepherd(struct work_struct *w) if (need_update(cpu) && cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) - schedule_delayed_work_on(cpu, + queue_delayed_work_on(cpu, vmstat_wq, &per_cpu(vmstat_work, cpu), 0); put_online_cpus(); @@ -1549,6 +1550,7 @@ static int __init setup_vmstat(void) start_shepherd_timer(); cpu_notifier_register_done(); + vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); #endif #ifdef CONFIG_PROC_FS proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);