Merge branch 'akpm' (patches from Andrew)

Merge yet more updates from Andrew Morton:
 "Subsystems affected by this patch series: mm (memcg, migration,
  pagemap, gup, madvise, vmalloc), ia64, and misc"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (31 commits)
  mm: remove duplicate include statement in mmu.c
  mm: remove the filename in the top of file comment in vmalloc.c
  mm: cleanup the gfp_mask handling in __vmalloc_area_node
  mm: remove alloc_vm_area
  x86/xen: open code alloc_vm_area in arch_gnttab_valloc
  xen/xenbus: use apply_to_page_range directly in xenbus_map_ring_pv
  drm/i915: use vmap in i915_gem_object_map
  drm/i915: stop using kmap in i915_gem_object_map
  drm/i915: use vmap in shmem_pin_map
  zsmalloc: switch from alloc_vm_area to get_vm_area
  mm: allow a NULL fn callback in apply_to_page_range
  mm: add a vmap_pfn function
  mm: add a VM_MAP_PUT_PAGES flag for vmap
  mm: update the documentation for vfree
  mm/madvise: introduce process_madvise() syscall: an external memory hinting API
  pid: move pidfd_get_pid() to pid.c
  mm/madvise: pass mm to do_madvise
  selftests/vm: 10x speedup for hmm-tests
  binfmt_elf: take the mmap lock around find_extend_vma()
  mm/gup_benchmark: take the mmap lock around GUP
  ...
Linus Torvalds, 2020-10-18 12:25:25 -07:00
commit 1912b04e0f
55 changed files with 601 additions and 448 deletions

View File

@ -479,3 +479,4 @@
547 common openat2 sys_openat2
548 common pidfd_getfd sys_pidfd_getfd
549 common faccessat2 sys_faccessat2
550 common process_madvise sys_process_madvise

View File

@ -17,7 +17,6 @@
#include <asm/cp15.h>
#include <asm/cputype.h>
#include <asm/sections.h>
#include <asm/cachetype.h>
#include <asm/fixmap.h>
#include <asm/sections.h>

View File

@ -453,3 +453,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise

View File

@ -38,7 +38,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
#define __NR_compat_syscalls 440
#define __NR_compat_syscalls 441
#endif
#define __ARCH_WANT_SYS_CLONE

View File

@ -887,6 +887,8 @@ __SYSCALL(__NR_openat2, sys_openat2)
__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
#define __NR_faccessat2 439
__SYSCALL(__NR_faccessat2, sys_faccessat2)
#define __NR_process_madvise 440
__SYSCALL(__NR_process_madvise, sys_process_madvise)
/*
* Please add new compat syscalls above this comment and update

View File

@ -40,7 +40,7 @@ obj-y += esi_stub.o # must be in kernel proper
endif
obj-$(CONFIG_INTEL_IOMMU) += pci-dma.o
obj-$(CONFIG_BINFMT_ELF) += elfcore.o
obj-$(CONFIG_ELF_CORE) += elfcore.o
# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31

View File

@ -360,3 +360,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise

View File

@ -439,3 +439,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise

View File

@ -445,3 +445,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise

View File

@ -378,3 +378,4 @@
437 n32 openat2 sys_openat2
438 n32 pidfd_getfd sys_pidfd_getfd
439 n32 faccessat2 sys_faccessat2
440 n32 process_madvise sys_process_madvise

View File

@ -354,3 +354,4 @@
437 n64 openat2 sys_openat2
438 n64 pidfd_getfd sys_pidfd_getfd
439 n64 faccessat2 sys_faccessat2
440 n64 process_madvise sys_process_madvise

View File

@ -427,3 +427,4 @@
437 o32 openat2 sys_openat2
438 o32 pidfd_getfd sys_pidfd_getfd
439 o32 faccessat2 sys_faccessat2
440 o32 process_madvise sys_process_madvise

View File

@ -437,3 +437,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise

View File

@ -529,3 +529,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise

View File

@ -442,3 +442,4 @@
437 common openat2 sys_openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise sys_process_madvise

View File

@ -442,3 +442,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise

View File

@ -485,3 +485,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise

View File

@ -444,3 +444,4 @@
437 i386 openat2 sys_openat2
438 i386 pidfd_getfd sys_pidfd_getfd
439 i386 faccessat2 sys_faccessat2
440 i386 process_madvise sys_process_madvise

View File

@ -361,6 +361,7 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
#
# x32-specific system call numbers start at 512 to avoid cache impact

View File

@ -25,6 +25,7 @@
static struct gnttab_vm_area {
struct vm_struct *area;
pte_t **ptes;
int idx;
} gnttab_shared_vm_area, gnttab_status_vm_area;
int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
@ -90,19 +91,31 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
}
}
static int gnttab_apply(pte_t *pte, unsigned long addr, void *data)
{
struct gnttab_vm_area *area = data;
area->ptes[area->idx++] = pte;
return 0;
}
static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
{
area->ptes = kmalloc_array(nr_frames, sizeof(*area->ptes), GFP_KERNEL);
if (area->ptes == NULL)
return -ENOMEM;
area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes);
if (area->area == NULL) {
kfree(area->ptes);
return -ENOMEM;
}
area->area = get_vm_area(PAGE_SIZE * nr_frames, VM_IOREMAP);
if (!area->area)
goto out_free_ptes;
if (apply_to_page_range(&init_mm, (unsigned long)area->area->addr,
PAGE_SIZE * nr_frames, gnttab_apply, area))
goto out_free_vm_area;
return 0;
out_free_vm_area:
free_vm_area(area->area);
out_free_ptes:
kfree(area->ptes);
return -ENOMEM;
}
static void arch_gnttab_vfree(struct gnttab_vm_area *area)
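
For illustration only (not part of the patch above): every alloc_vm_area() caller in this series is converted with the same recipe that the Xen grant-table code shows first. Reserve the virtual range with get_vm_area(), then let apply_to_page_range() allocate the intermediate page tables while a small callback records the PTE pointers. A minimal sketch with hypothetical names (pte_collector, collect_pte, my_alloc_vm_area), assuming the pte_fn_t signature used in the hunk above:

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Sketch only: open-coded replacement for the removed alloc_vm_area(). */
struct pte_collector {
        pte_t **ptes;
        unsigned int idx;
};

static int collect_pte(pte_t *pte, unsigned long addr, void *data)
{
        struct pte_collector *c = data;

        c->ptes[c->idx++] = pte;        /* remember each PTE slot for the caller */
        return 0;
}

static struct vm_struct *my_alloc_vm_area(size_t size, pte_t **ptes)
{
        struct pte_collector c = { .ptes = ptes };
        struct vm_struct *area;

        area = get_vm_area(size, VM_IOREMAP);   /* reserve kernel VA, no mappings yet */
        if (!area)
                return NULL;

        /* Allocates the page tables and hands each PTE to collect_pte(). */
        if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
                                size, collect_pte, &c)) {
                free_vm_area(area);
                return NULL;
        }
        return area;
}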

View File

@ -410,3 +410,4 @@
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise

View File

@ -25,6 +25,7 @@ config DRM_I915
select CRC32
select SND_HDA_I915 if SND_HDA_CORE
select CEC_CORE if CEC_NOTIFIER
select VMAP_PFN
help
Choose this option if you have a system that has "Intel Graphics
Media Accelerator" or "HD Graphics" integrated graphics,

View File

@ -162,8 +162,6 @@ static void unmap_object(struct drm_i915_gem_object *obj, void *ptr)
{
if (is_vmalloc_addr(ptr))
vunmap(ptr);
else
kunmap(kmap_to_page(ptr));
}
struct sg_table *
@ -234,34 +232,21 @@ unlock:
return err;
}
static inline pte_t iomap_pte(resource_size_t base,
dma_addr_t offset,
pgprot_t prot)
{
return pte_mkspecial(pfn_pte((base + offset) >> PAGE_SHIFT, prot));
}
/* The 'mapping' part of i915_gem_object_pin_map() below */
static void *i915_gem_object_map(struct drm_i915_gem_object *obj,
enum i915_map_type type)
static void *i915_gem_object_map_page(struct drm_i915_gem_object *obj,
enum i915_map_type type)
{
unsigned long n_pte = obj->base.size >> PAGE_SHIFT;
struct sg_table *sgt = obj->mm.pages;
pte_t *stack[32], **mem;
struct vm_struct *area;
unsigned long n_pages = obj->base.size >> PAGE_SHIFT, i;
struct page *stack[32], **pages = stack, *page;
struct sgt_iter iter;
pgprot_t pgprot;
void *vaddr;
if (!i915_gem_object_has_struct_page(obj) && type != I915_MAP_WC)
return NULL;
if (GEM_WARN_ON(type == I915_MAP_WC &&
!static_cpu_has(X86_FEATURE_PAT)))
return NULL;
/* A single page can always be kmapped */
if (n_pte == 1 && type == I915_MAP_WB) {
struct page *page = sg_page(sgt->sgl);
switch (type) {
default:
MISSING_CASE(type);
fallthrough; /* to use PAGE_KERNEL anyway */
case I915_MAP_WB:
/*
* On 32b, highmem uses a finite set of indirect PTEs (i.e.
* vmap) to provide virtual mappings of the high pages.
@ -277,33 +262,10 @@ static void *i915_gem_object_map(struct drm_i915_gem_object *obj,
* forever.
*
* So if the page is beyond the 32b boundary, make an explicit
* vmap. On 64b, this check will be optimised away as we can
* directly kmap any page on the system.
* vmap.
*/
if (!PageHighMem(page))
return kmap(page);
}
mem = stack;
if (n_pte > ARRAY_SIZE(stack)) {
/* Too big for stack -- allocate temporary array instead */
mem = kvmalloc_array(n_pte, sizeof(*mem), GFP_KERNEL);
if (!mem)
return NULL;
}
area = alloc_vm_area(obj->base.size, mem);
if (!area) {
if (mem != stack)
kvfree(mem);
return NULL;
}
switch (type) {
default:
MISSING_CASE(type);
fallthrough; /* to use PAGE_KERNEL anyway */
case I915_MAP_WB:
if (n_pages == 1 && !PageHighMem(sg_page(obj->mm.pages->sgl)))
return page_address(sg_page(obj->mm.pages->sgl));
pgprot = PAGE_KERNEL;
break;
case I915_MAP_WC:
@ -311,30 +273,50 @@ static void *i915_gem_object_map(struct drm_i915_gem_object *obj,
break;
}
if (i915_gem_object_has_struct_page(obj)) {
struct sgt_iter iter;
struct page *page;
pte_t **ptes = mem;
for_each_sgt_page(page, iter, sgt)
**ptes++ = mk_pte(page, pgprot);
} else {
resource_size_t iomap;
struct sgt_iter iter;
pte_t **ptes = mem;
dma_addr_t addr;
iomap = obj->mm.region->iomap.base;
iomap -= obj->mm.region->region.start;
for_each_sgt_daddr(addr, iter, sgt)
**ptes++ = iomap_pte(iomap, addr, pgprot);
if (n_pages > ARRAY_SIZE(stack)) {
/* Too big for stack -- allocate temporary array instead */
pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
if (!pages)
return NULL;
}
if (mem != stack)
kvfree(mem);
i = 0;
for_each_sgt_page(page, iter, obj->mm.pages)
pages[i++] = page;
vaddr = vmap(pages, n_pages, 0, pgprot);
if (pages != stack)
kvfree(pages);
return vaddr;
}
return area->addr;
static void *i915_gem_object_map_pfn(struct drm_i915_gem_object *obj,
enum i915_map_type type)
{
resource_size_t iomap = obj->mm.region->iomap.base -
obj->mm.region->region.start;
unsigned long n_pfn = obj->base.size >> PAGE_SHIFT;
unsigned long stack[32], *pfns = stack, i;
struct sgt_iter iter;
dma_addr_t addr;
void *vaddr;
if (type != I915_MAP_WC)
return NULL;
if (n_pfn > ARRAY_SIZE(stack)) {
/* Too big for stack -- allocate temporary array instead */
pfns = kvmalloc_array(n_pfn, sizeof(*pfns), GFP_KERNEL);
if (!pfns)
return NULL;
}
i = 0;
for_each_sgt_daddr(addr, iter, obj->mm.pages)
pfns[i++] = (iomap + addr) >> PAGE_SHIFT;
vaddr = vmap_pfn(pfns, n_pfn, pgprot_writecombine(PAGE_KERNEL_IO));
if (pfns != stack)
kvfree(pfns);
return vaddr;
}
/* get, pin, and map the pages of the object into kernel space */
@ -386,7 +368,13 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
}
if (!ptr) {
ptr = i915_gem_object_map(obj, type);
if (GEM_WARN_ON(type == I915_MAP_WC &&
!static_cpu_has(X86_FEATURE_PAT)))
ptr = NULL;
else if (i915_gem_object_has_struct_page(obj))
ptr = i915_gem_object_map_page(obj, type);
else
ptr = i915_gem_object_map_pfn(obj, type);
if (!ptr) {
err = -ENOMEM;
goto err_unpin;

View File

@ -49,80 +49,40 @@ struct file *shmem_create_from_object(struct drm_i915_gem_object *obj)
return file;
}
static size_t shmem_npte(struct file *file)
{
return file->f_mapping->host->i_size >> PAGE_SHIFT;
}
static void __shmem_unpin_map(struct file *file, void *ptr, size_t n_pte)
{
unsigned long pfn;
vunmap(ptr);
for (pfn = 0; pfn < n_pte; pfn++) {
struct page *page;
page = shmem_read_mapping_page_gfp(file->f_mapping, pfn,
GFP_KERNEL);
if (!WARN_ON(IS_ERR(page))) {
put_page(page);
put_page(page);
}
}
}
void *shmem_pin_map(struct file *file)
{
const size_t n_pte = shmem_npte(file);
pte_t *stack[32], **ptes, **mem;
struct vm_struct *area;
unsigned long pfn;
struct page **pages;
size_t n_pages, i;
void *vaddr;
mem = stack;
if (n_pte > ARRAY_SIZE(stack)) {
mem = kvmalloc_array(n_pte, sizeof(*mem), GFP_KERNEL);
if (!mem)
return NULL;
}
area = alloc_vm_area(n_pte << PAGE_SHIFT, mem);
if (!area) {
if (mem != stack)
kvfree(mem);
n_pages = file->f_mapping->host->i_size >> PAGE_SHIFT;
pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
if (!pages)
return NULL;
}
ptes = mem;
for (pfn = 0; pfn < n_pte; pfn++) {
struct page *page;
page = shmem_read_mapping_page_gfp(file->f_mapping, pfn,
GFP_KERNEL);
if (IS_ERR(page))
for (i = 0; i < n_pages; i++) {
pages[i] = shmem_read_mapping_page_gfp(file->f_mapping, i,
GFP_KERNEL);
if (IS_ERR(pages[i]))
goto err_page;
**ptes++ = mk_pte(page, PAGE_KERNEL);
}
if (mem != stack)
kvfree(mem);
vaddr = vmap(pages, n_pages, VM_MAP_PUT_PAGES, PAGE_KERNEL);
if (!vaddr)
goto err_page;
mapping_set_unevictable(file->f_mapping);
return area->addr;
return vaddr;
err_page:
if (mem != stack)
kvfree(mem);
__shmem_unpin_map(file, area->addr, pfn);
while (--i >= 0)
put_page(pages[i]);
kvfree(pages);
return NULL;
}
void shmem_unpin_map(struct file *file, void *ptr)
{
mapping_clear_unevictable(file->f_mapping);
__shmem_unpin_map(file, ptr, shmem_npte(file));
vfree(ptr);
}
static int __shmem_rw(struct file *file, loff_t off,

View File

@ -73,16 +73,13 @@ struct map_ring_valloc {
struct xenbus_map_node *node;
/* Why do we need two arrays? See comment of __xenbus_map_ring */
union {
unsigned long addrs[XENBUS_MAX_RING_GRANTS];
pte_t *ptes[XENBUS_MAX_RING_GRANTS];
};
unsigned long addrs[XENBUS_MAX_RING_GRANTS];
phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS];
struct gnttab_map_grant_ref map[XENBUS_MAX_RING_GRANTS];
struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS];
unsigned int idx; /* HVM only. */
unsigned int idx;
};
static DEFINE_SPINLOCK(xenbus_valloc_lock);
@ -686,6 +683,14 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
#ifdef CONFIG_XEN_PV
static int map_ring_apply(pte_t *pte, unsigned long addr, void *data)
{
struct map_ring_valloc *info = data;
info->phys_addrs[info->idx++] = arbitrary_virt_to_machine(pte).maddr;
return 0;
}
static int xenbus_map_ring_pv(struct xenbus_device *dev,
struct map_ring_valloc *info,
grant_ref_t *gnt_refs,
@ -694,18 +699,15 @@ static int xenbus_map_ring_pv(struct xenbus_device *dev,
{
struct xenbus_map_node *node = info->node;
struct vm_struct *area;
int err = GNTST_okay;
int i;
bool leaked;
bool leaked = false;
int err = -ENOMEM;
area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, info->ptes);
area = get_vm_area(XEN_PAGE_SIZE * nr_grefs, VM_IOREMAP);
if (!area)
return -ENOMEM;
for (i = 0; i < nr_grefs; i++)
info->phys_addrs[i] =
arbitrary_virt_to_machine(info->ptes[i]).maddr;
if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
XEN_PAGE_SIZE * nr_grefs, map_ring_apply, info))
goto failed;
err = __xenbus_map_ring(dev, gnt_refs, nr_grefs, node->handles,
info, GNTMAP_host_map | GNTMAP_contains_pte,
&leaked);

View File

@ -310,7 +310,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
* Grow the stack manually; some architectures have a limit on how
* far ahead a user-space access may be in order to grow the stack.
*/
if (mmap_read_lock_killable(mm))
return -EINTR;
vma = find_extend_vma(mm, bprm->p);
mmap_read_unlock(mm);
if (!vma)
return -EFAULT;

View File

@ -842,13 +842,13 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
struct buffer_head *bh, *head;
gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
long offset;
struct mem_cgroup *memcg;
struct mem_cgroup *memcg, *old_memcg;
if (retry)
gfp |= __GFP_NOFAIL;
memcg = get_mem_cgroup_from_page(page);
memalloc_use_memcg(memcg);
old_memcg = set_active_memcg(memcg);
head = NULL;
offset = PAGE_SIZE;
@ -867,7 +867,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
set_bh_page(bh, page, offset);
}
out:
memalloc_unuse_memcg();
set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
return head;
/*

View File

@ -3989,7 +3989,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock)
if (force_nonblock)
return -EAGAIN;
ret = do_madvise(ma->addr, ma->len, ma->advice);
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
if (ret < 0)
req_set_fail_links(req);
io_req_complete(req, ret);

View File

@ -531,6 +531,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir);
const struct path *path = fsnotify_data_path(data, data_type);
unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
struct mem_cgroup *old_memcg;
struct inode *child = NULL;
bool name_event = false;
@ -580,7 +581,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
gfp |= __GFP_RETRY_MAYFAIL;
/* Whoever is interested in the event, pays for the allocation. */
memalloc_use_memcg(group->memcg);
old_memcg = set_active_memcg(group->memcg);
if (fanotify_is_perm_event(mask)) {
event = fanotify_alloc_perm_event(path, gfp);
@ -608,7 +609,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
event->pid = get_pid(task_tgid(current));
out:
memalloc_unuse_memcg();
set_active_memcg(old_memcg);
return event;
}

View File

@ -66,6 +66,7 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask,
int ret;
int len = 0;
int alloc_len = sizeof(struct inotify_event_info);
struct mem_cgroup *old_memcg;
if ((inode_mark->mask & FS_EXCL_UNLINK) &&
path && d_unlinked(path->dentry))
@ -87,9 +88,9 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask,
* trigger OOM killer in the target monitoring memcg as it may have
* security repercussion.
*/
memalloc_use_memcg(group->memcg);
old_memcg = set_active_memcg(group->memcg);
event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
memalloc_unuse_memcg();
set_active_memcg(old_memcg);
if (unlikely(!event)) {
/*

View File

@ -1531,18 +1531,6 @@ static inline bool memcg_kmem_enabled(void)
return static_branch_likely(&memcg_kmem_enabled_key);
}
static inline bool memcg_kmem_bypass(void)
{
if (in_interrupt())
return true;
/* Allow remote memcg charging in kthread contexts. */
if ((!current->mm || (current->flags & PF_KTHREAD)) &&
!current->active_memcg)
return true;
return false;
}
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
int order)
{

View File

@ -2579,7 +2579,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf, bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
extern int do_madvise(unsigned long start, size_t len_in, int behavior);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
#ifdef CONFIG_MMU
extern int __mm_populate(unsigned long addr, unsigned long len,

View File

@ -77,6 +77,7 @@ extern const struct file_operations pidfd_fops;
struct file;
extern struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
static inline struct pid *get_pid(struct pid *pid)
{

View File

@ -279,39 +279,38 @@ static inline void memalloc_nocma_restore(unsigned int flags)
#endif
#ifdef CONFIG_MEMCG
DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
/**
* memalloc_use_memcg - Starts the remote memcg charging scope.
* set_active_memcg - Starts the remote memcg charging scope.
* @memcg: memcg to charge.
*
* This function marks the beginning of the remote memcg charging scope. All the
* __GFP_ACCOUNT allocations till the end of the scope will be charged to the
* given memcg.
*
* NOTE: This function is not nesting safe.
* NOTE: This function can nest. Users must save the return value and
* reset the previous value after their own charging scope is over.
*/
static inline void memalloc_use_memcg(struct mem_cgroup *memcg)
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
WARN_ON_ONCE(current->active_memcg);
current->active_memcg = memcg;
}
struct mem_cgroup *old;
/**
* memalloc_unuse_memcg - Ends the remote memcg charging scope.
*
* This function marks the end of the remote memcg charging scope started by
* memalloc_use_memcg().
*/
static inline void memalloc_unuse_memcg(void)
{
current->active_memcg = NULL;
if (in_interrupt()) {
old = this_cpu_read(int_active_memcg);
this_cpu_write(int_active_memcg, memcg);
} else {
old = current->active_memcg;
current->active_memcg = memcg;
}
return old;
}
#else
static inline void memalloc_use_memcg(struct mem_cgroup *memcg)
{
}
static inline void memalloc_unuse_memcg(void)
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
return NULL;
}
#endif
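
Usage note (illustrative, mirroring the fs/buffer.c and fsnotify conversions earlier in this series): because set_active_memcg() now nests, a caller saves the returned previous memcg and restores it when its charging scope ends. A minimal sketch with a hypothetical helper name:

#include <linux/sched/mm.h>
#include <linux/slab.h>

/* Sketch only: charge one allocation to a specific memcg. */
static void *charged_kmalloc(struct mem_cgroup *memcg, size_t size)
{
        struct mem_cgroup *old_memcg;
        void *p;

        old_memcg = set_active_memcg(memcg);            /* open remote charging scope */
        p = kmalloc(size, GFP_KERNEL | __GFP_ACCOUNT);  /* charged to @memcg */
        set_active_memcg(old_memcg);                    /* restore the previous scope */

        return p;
}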

View File

@ -879,6 +879,8 @@ asmlinkage long sys_munlockall(void);
asmlinkage long sys_mincore(unsigned long start, size_t len,
unsigned char __user * vec);
asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec,
size_t vlen, int behavior, unsigned int flags);
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
unsigned long prot, unsigned long pgoff,
unsigned long flags);

View File

@ -24,6 +24,7 @@ struct notifier_block; /* in notifier.h */
#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
#define VM_NO_GUARD 0x00000040 /* don't add guard page */
#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
#define VM_MAP_PUT_PAGES 0x00000100 /* put pages and free array in vfree */
/*
* VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC.
@ -121,6 +122,7 @@ extern void vfree_atomic(const void *addr);
extern void *vmap(struct page **pages, unsigned int count,
unsigned long flags, pgprot_t prot);
void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot);
extern void vunmap(const void *addr);
extern int remap_vmalloc_range_partial(struct vm_area_struct *vma,
@ -167,6 +169,7 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size,
unsigned long flags,
unsigned long start, unsigned long end,
const void *caller);
void free_vm_area(struct vm_struct *area);
extern struct vm_struct *remove_vm_area(const void *addr);
extern struct vm_struct *find_vm_area(const void *addr);
@ -202,10 +205,6 @@ static inline void set_vm_flush_reset_perms(void *addr)
}
#endif
/* Allocate/destroy a 'vmalloc' VM area. */
extern struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes);
extern void free_vm_area(struct vm_struct *area);
/* for /dev/kmem */
extern long vread(char *buf, char *addr, unsigned long count);
extern long vwrite(char *buf, char *addr, unsigned long count);

View File

@ -857,9 +857,11 @@ __SYSCALL(__NR_openat2, sys_openat2)
__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
#define __NR_faccessat2 439
__SYSCALL(__NR_faccessat2, sys_faccessat2)
#define __NR_process_madvise 440
__SYSCALL(__NR_process_madvise, sys_process_madvise)
#undef __NR_syscalls
#define __NR_syscalls 440
#define __NR_syscalls 441
/*
* 32 bit systems traditionally used different

View File

@ -1474,25 +1474,6 @@ end:
return retval;
}
static struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
struct fd f;
struct pid *pid;
f = fdget(fd);
if (!f.file)
return ERR_PTR(-EBADF);
pid = pidfd_pid(f.file);
if (!IS_ERR(pid)) {
get_pid(pid);
*flags = f.file->f_flags;
}
fdput(f);
return pid;
}
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
int options, struct rusage *ru)
{

View File

@ -520,6 +520,25 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
return idr_get_next(&ns->idr, &nr);
}
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
struct fd f;
struct pid *pid;
f = fdget(fd);
if (!f.file)
return ERR_PTR(-EBADF);
pid = pidfd_pid(f.file);
if (!IS_ERR(pid)) {
get_pid(pid);
*flags = f.file->f_flags;
}
fdput(f);
return pid;
}
/**
* pidfd_create() - Create a new pid file descriptor.
*

View File

@ -280,6 +280,7 @@ COND_SYSCALL(mlockall);
COND_SYSCALL(munlockall);
COND_SYSCALL(mincore);
COND_SYSCALL(madvise);
COND_SYSCALL(process_madvise);
COND_SYSCALL(remap_file_pages);
COND_SYSCALL(mbind);
COND_SYSCALL_COMPAT(mbind);

View File

@ -816,6 +816,9 @@ config DEVICE_PRIVATE
memory; i.e., memory that is only accessible from the device (or
group of devices). You likely also want to select HMM_MIRROR.
config VMAP_PFN
bool
config FRAME_VECTOR
bool

View File

@ -72,6 +72,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
int nr;
struct page **pages;
int ret = 0;
bool needs_mmap_lock =
cmd != GUP_FAST_BENCHMARK && cmd != PIN_FAST_BENCHMARK;
if (gup->size > ULONG_MAX)
return -EINVAL;
@ -81,6 +83,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
if (!pages)
return -ENOMEM;
if (needs_mmap_lock && mmap_read_lock_killable(current->mm)) {
ret = -EINTR;
goto free_pages;
}
i = 0;
nr = gup->nr_pages_per_call;
start_time = ktime_get();
@ -120,9 +127,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
pages + i, NULL);
break;
default:
kvfree(pages);
ret = -EINVAL;
goto out;
goto unlock;
}
if (nr <= 0)
@ -150,8 +156,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
end_time = ktime_get();
gup->put_delta_usec = ktime_us_delta(end_time, start_time);
unlock:
if (needs_mmap_lock)
mmap_read_unlock(current->mm);
free_pages:
kvfree(pages);
out:
return ret;
}

View File

@ -17,6 +17,8 @@
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
@ -27,7 +29,6 @@
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
#include <linux/sched/mm.h>
#include <asm/tlb.h>
@ -258,6 +259,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
loff_t offset;
@ -294,10 +296,10 @@ static long madvise_willneed(struct vm_area_struct *vma,
get_file(file);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
mmap_read_unlock(current->mm);
mmap_read_unlock(mm);
vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
fput(file);
mmap_read_lock(current->mm);
mmap_read_lock(mm);
return 0;
}
@ -766,6 +768,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
int behavior)
{
struct mm_struct *mm = vma->vm_mm;
*prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
@ -773,8 +777,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
if (!userfaultfd_remove(vma, start, end)) {
*prev = NULL; /* mmap_lock has been dropped, prev is stale */
mmap_read_lock(current->mm);
vma = find_vma(current->mm, start);
mmap_read_lock(mm);
vma = find_vma(mm, start);
if (!vma)
return -ENOMEM;
if (start < vma->vm_start) {
@ -828,6 +832,7 @@ static long madvise_remove(struct vm_area_struct *vma,
loff_t offset;
int error;
struct file *f;
struct mm_struct *mm = vma->vm_mm;
*prev = NULL; /* tell sys_madvise we drop mmap_lock */
@ -855,13 +860,13 @@ static long madvise_remove(struct vm_area_struct *vma,
get_file(f);
if (userfaultfd_remove(vma, start, end)) {
/* mmap_lock was not released by userfaultfd_remove() */
mmap_read_unlock(current->mm);
mmap_read_unlock(mm);
}
error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
fput(f);
mmap_read_lock(current->mm);
mmap_read_lock(mm);
return error;
}
@ -984,6 +989,18 @@ madvise_behavior_valid(int behavior)
}
}
static bool
process_madvise_behavior_valid(int behavior)
{
switch (behavior) {
case MADV_COLD:
case MADV_PAGEOUT:
return true;
default:
return false;
}
}
/*
* The madvise(2) system call.
*
@ -1031,6 +1048,11 @@ madvise_behavior_valid(int behavior)
* MADV_DONTDUMP - the application wants to prevent pages in the given range
* from being included in its core dump.
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
* MADV_COLD - the application is not expected to use this memory soon,
* deactivate pages in this range so that they can be reclaimed
* easily if memory pressure happens.
* MADV_PAGEOUT - the application is not expected to use this memory soon,
* page out the pages in this range immediately.
*
* return values:
* zero - success
@ -1045,7 +1067,7 @@ madvise_behavior_valid(int behavior)
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
int do_madvise(unsigned long start, size_t len_in, int behavior)
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
@ -1083,10 +1105,10 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
write = madvise_need_mmap_write(behavior);
if (write) {
if (mmap_write_lock_killable(current->mm))
if (mmap_write_lock_killable(mm))
return -EINTR;
} else {
mmap_read_lock(current->mm);
mmap_read_lock(mm);
}
/*
@ -1094,7 +1116,7 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
* ranges, just ignore them, but return -ENOMEM at the end.
* - different from the way of handling in mlock etc.
*/
vma = find_vma_prev(current->mm, start, &prev);
vma = find_vma_prev(mm, start, &prev);
if (vma && start > vma->vm_start)
prev = vma;
@ -1131,19 +1153,92 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
if (prev)
vma = prev->vm_next;
else /* madvise_remove dropped mmap_lock */
vma = find_vma(current->mm, start);
vma = find_vma(mm, start);
}
out:
blk_finish_plug(&plug);
if (write)
mmap_write_unlock(current->mm);
mmap_write_unlock(mm);
else
mmap_read_unlock(current->mm);
mmap_read_unlock(mm);
return error;
}
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
return do_madvise(start, len_in, behavior);
return do_madvise(current->mm, start, len_in, behavior);
}
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
size_t, vlen, int, behavior, unsigned int, flags)
{
ssize_t ret;
struct iovec iovstack[UIO_FASTIOV], iovec;
struct iovec *iov = iovstack;
struct iov_iter iter;
struct pid *pid;
struct task_struct *task;
struct mm_struct *mm;
size_t total_len;
unsigned int f_flags;
if (flags != 0) {
ret = -EINVAL;
goto out;
}
ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
if (ret < 0)
goto out;
pid = pidfd_get_pid(pidfd, &f_flags);
if (IS_ERR(pid)) {
ret = PTR_ERR(pid);
goto free_iov;
}
task = get_pid_task(pid, PIDTYPE_PID);
if (!task) {
ret = -ESRCH;
goto put_pid;
}
if (task->mm != current->mm &&
!process_madvise_behavior_valid(behavior)) {
ret = -EINVAL;
goto release_task;
}
mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
if (IS_ERR_OR_NULL(mm)) {
ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
goto release_task;
}
total_len = iov_iter_count(&iter);
while (iov_iter_count(&iter)) {
iovec = iov_iter_iovec(&iter);
ret = do_madvise(mm, (unsigned long)iovec.iov_base,
iovec.iov_len, behavior);
if (ret < 0)
break;
iov_iter_advance(&iter, iovec.iov_len);
}
if (ret == 0)
ret = total_len - iov_iter_count(&iter);
mmput(mm);
return ret;
release_task:
put_task_struct(task);
put_pid:
put_pid(pid);
free_iov:
kfree(iov);
out:
return ret;
}
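
For context, a userspace view (not part of the kernel diff): process_madvise() takes a pidfd, an iovec array describing ranges in the target process, the advice value, and a flags argument that must currently be zero; the caller needs ptrace-attach rights to the target, only MADV_COLD and MADV_PAGEOUT are accepted for another process, and the return value is the number of bytes advised. A hedged sketch using raw syscall numbers, since libc wrappers may not exist yet (440 matches the tables added above; 434 is the pre-existing pidfd_open number and is an assumption not shown in this diff):

/* Sketch only: ask the kernel to page out one range of another process. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434
#endif
#ifndef __NR_process_madvise
#define __NR_process_madvise 440        /* matches the syscall tables above */
#endif
#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21                 /* reclaim these pages immediately */
#endif

int main(int argc, char **argv)
{
        struct iovec iov;
        long ret;
        int pidfd;

        if (argc < 4) {
                fprintf(stderr, "usage: %s <pid> <start-addr> <len>\n", argv[0]);
                return 1;
        }
        iov.iov_base = (void *)strtoul(argv[2], NULL, 0);  /* start address in the target */
        iov.iov_len  = strtoul(argv[3], NULL, 0);          /* length in bytes */

        pidfd = syscall(__NR_pidfd_open, atoi(argv[1]), 0);
        if (pidfd < 0) {
                perror("pidfd_open");
                return 1;
        }

        /* flags must be 0; returns the number of bytes advised, or -1 with errno set. */
        ret = syscall(__NR_process_madvise, pidfd, &iov, 1, MADV_PAGEOUT, 0);
        if (ret < 0)
                perror("process_madvise");
        else
                printf("advised %ld bytes\n", ret);
        return ret < 0;
}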

View File

@ -73,6 +73,9 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
@ -1061,23 +1064,56 @@ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
}
EXPORT_SYMBOL(get_mem_cgroup_from_page);
/**
* If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
*/
static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
static __always_inline struct mem_cgroup *active_memcg(void)
{
if (unlikely(current->active_memcg)) {
struct mem_cgroup *memcg;
if (in_interrupt())
return this_cpu_read(int_active_memcg);
else
return current->active_memcg;
}
rcu_read_lock();
static __always_inline struct mem_cgroup *get_active_memcg(void)
{
struct mem_cgroup *memcg;
rcu_read_lock();
memcg = active_memcg();
if (memcg) {
/* current->active_memcg must hold a ref. */
if (WARN_ON_ONCE(!css_tryget(&current->active_memcg->css)))
if (WARN_ON_ONCE(!css_tryget(&memcg->css)))
memcg = root_mem_cgroup;
else
memcg = current->active_memcg;
rcu_read_unlock();
return memcg;
}
rcu_read_unlock();
return memcg;
}
static __always_inline bool memcg_kmem_bypass(void)
{
/* Allow remote memcg charging from any context. */
if (unlikely(active_memcg()))
return false;
/* Memcg to charge can't be determined. */
if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
return true;
return false;
}
/**
* If active memcg is set, do not fall back to current->mm->memcg.
*/
static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
if (memcg_kmem_bypass())
return NULL;
if (unlikely(active_memcg()))
return get_active_memcg();
return get_mem_cgroup_from_mm(current->mm);
}
@ -2933,12 +2969,12 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg;
if (unlikely(!current->mm && !current->active_memcg))
if (memcg_kmem_bypass())
return NULL;
rcu_read_lock();
if (unlikely(current->active_memcg))
memcg = rcu_dereference(current->active_memcg);
if (unlikely(active_memcg()))
memcg = active_memcg();
else
memcg = mem_cgroup_from_task(current);
@ -3059,19 +3095,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
struct mem_cgroup *memcg;
int ret = 0;
if (memcg_kmem_bypass())
return 0;
memcg = get_mem_cgroup_from_current();
if (!mem_cgroup_is_root(memcg)) {
if (memcg && !mem_cgroup_is_root(memcg)) {
ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
if (!ret) {
page->mem_cgroup = memcg;
__SetPageKmemcg(page);
return 0;
}
css_put(&memcg->css);
}
css_put(&memcg->css);
return ret;
}
@ -5290,12 +5323,12 @@ static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
struct mem_cgroup *memcg;
struct mem_cgroup *memcg, *old_memcg;
long error = -ENOMEM;
memalloc_use_memcg(parent);
old_memcg = set_active_memcg(parent);
memcg = mem_cgroup_alloc();
memalloc_unuse_memcg();
set_active_memcg(old_memcg);
if (IS_ERR(memcg))
return ERR_CAST(memcg);

View File

@ -1673,16 +1673,6 @@ int unpoison_memory(unsigned long pfn)
}
EXPORT_SYMBOL(unpoison_memory);
static struct page *new_page(struct page *p, unsigned long private)
{
struct migration_target_control mtc = {
.nid = page_to_nid(p),
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
return alloc_migration_target(p, (unsigned long)&mtc);
}
/*
* Safely get reference count of an arbitrary page.
* Returns 0 for a free page, -EIO for a zero refcount page
@ -1797,6 +1787,10 @@ static int __soft_offline_page(struct page *page)
char const *msg_page[] = {"page", "hugepage"};
bool huge = PageHuge(page);
LIST_HEAD(pagelist);
struct migration_target_control mtc = {
.nid = NUMA_NO_NODE,
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
/*
* Check PageHWPoison again inside page lock because PageHWPoison
@ -1833,8 +1827,8 @@ static int __soft_offline_page(struct page *page)
}
if (isolate_page(hpage, &pagelist)) {
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (!ret) {
bool release = !huge;

View File

@ -2391,13 +2391,15 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
arch_enter_lazy_mmu_mode();
do {
if (create || !pte_none(*pte)) {
err = fn(pte++, addr, data);
if (err)
break;
}
} while (addr += PAGE_SIZE, addr != end);
if (fn) {
do {
if (create || !pte_none(*pte)) {
err = fn(pte++, addr, data);
if (err)
break;
}
} while (addr += PAGE_SIZE, addr != end);
}
*mask |= PGTBL_PTE_MODIFIED;
arch_leave_lazy_mmu_mode();
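
Aside (illustrative, not in the hunk above): with a NULL callback, apply_to_page_range() now simply ensures that page tables exist for the given kernel range, which is how the zsmalloc change later in this series pre-populates PTEs so zs_map_object() never allocates with GFP_KERNEL in a non-preemptible context. A sketch with a hypothetical helper name:

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Sketch only: reserve a 2-page VA window and pre-allocate its PTEs. */
static int reserve_mapping_window(struct vm_struct **out)
{
        struct vm_struct *vm = get_vm_area(PAGE_SIZE * 2, 0);

        if (!vm)
                return -ENOMEM;
        /* NULL callback: just make sure the page tables exist for this range. */
        if (apply_to_page_range(&init_mm, (unsigned long)vm->addr,
                                PAGE_SIZE * 2, NULL, NULL)) {
                free_vm_area(vm);
                return -ENOMEM;
        }
        *out = vm;
        return 0;
}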

View File

@ -1290,27 +1290,6 @@ found:
return 0;
}
static struct page *new_node_page(struct page *page, unsigned long private)
{
nodemask_t nmask = node_states[N_MEMORY];
struct migration_target_control mtc = {
.nid = page_to_nid(page),
.nmask = &nmask,
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
/*
* try to allocate from a different node but reuse this node if there
* are no other online nodes to be used (e.g. we are offlining a part
* of the only existing node)
*/
node_clear(mtc.nid, nmask);
if (nodes_empty(nmask))
node_set(mtc.nid, nmask);
return alloc_migration_target(page, (unsigned long)&mtc);
}
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
@ -1370,9 +1349,28 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
put_page(page);
}
if (!list_empty(&source)) {
/* Allocate a new page from the nearest neighbor node */
ret = migrate_pages(&source, new_node_page, NULL, 0,
MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
nodemask_t nmask = node_states[N_MEMORY];
struct migration_target_control mtc = {
.nmask = &nmask,
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
/*
* We have checked that migration range is on a single zone so
* we can use the nid of the first page to all the others.
*/
mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
/*
* try to allocate from a different node but reuse this node
* if there are no other online nodes to be used (e.g. we are
* offlining a part of the only existing node)
*/
node_clear(mtc.nid, nmask);
if (nodes_empty(nmask))
node_set(mtc.nid, nmask);
ret = migrate_pages(&source, alloc_migration_target, NULL,
(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
if (ret) {
list_for_each_entry(page, &source, lru) {
pr_warn("migrating pfn %lx failed ret:%d ",

View File

@ -1864,6 +1864,53 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
return nr_pages ? -EFAULT : 0;
}
static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
{
struct task_struct *task;
struct mm_struct *mm;
/*
* There is no need to check if current process has the right to modify
* the specified process when they are same.
*/
if (!pid) {
mmget(current->mm);
*mem_nodes = cpuset_mems_allowed(current);
return current->mm;
}
/* Find the mm_struct */
rcu_read_lock();
task = find_task_by_vpid(pid);
if (!task) {
rcu_read_unlock();
return ERR_PTR(-ESRCH);
}
get_task_struct(task);
/*
* Check if this process has the right to modify the specified
* process. Use the regular "ptrace_may_access()" checks.
*/
if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
mm = ERR_PTR(-EPERM);
goto out;
}
rcu_read_unlock();
mm = ERR_PTR(security_task_movememory(task));
if (IS_ERR(mm))
goto out;
*mem_nodes = cpuset_mems_allowed(task);
mm = get_task_mm(task);
out:
put_task_struct(task);
if (!mm)
mm = ERR_PTR(-EINVAL);
return mm;
}
/*
* Move a list of pages in the address space of the currently executing
* process.
@ -1873,7 +1920,6 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
const int __user *nodes,
int __user *status, int flags)
{
struct task_struct *task;
struct mm_struct *mm;
int err;
nodemask_t task_nodes;
@ -1885,36 +1931,9 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
/* Find the mm_struct */
rcu_read_lock();
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
rcu_read_unlock();
return -ESRCH;
}
get_task_struct(task);
/*
* Check if this process has the right to modify the specified
* process. Use the regular "ptrace_may_access()" checks.
*/
if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
err = -EPERM;
goto out;
}
rcu_read_unlock();
err = security_task_movememory(task);
if (err)
goto out;
task_nodes = cpuset_mems_allowed(task);
mm = get_task_mm(task);
put_task_struct(task);
if (!mm)
return -EINVAL;
mm = find_mm_struct(pid, &task_nodes);
if (IS_ERR(mm))
return PTR_ERR(mm);
if (nodes)
err = do_pages_move(mm, task_nodes, nr_pages, pages,
@ -1924,10 +1943,6 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
mmput(mm);
return err;
out:
put_task_struct(task);
return err;
}
SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,

View File

@ -558,6 +558,50 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
return 0;
}
/*
* vma_next() - Get the next VMA.
* @mm: The mm_struct.
* @vma: The current vma.
*
* If @vma is NULL, return the first vma in the mm.
*
* Returns: The next VMA after @vma.
*/
static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
struct vm_area_struct *vma)
{
if (!vma)
return mm->mmap;
return vma->vm_next;
}
/*
* munmap_vma_range() - munmap VMAs that overlap a range.
* @mm: The mm struct
* @start: The start of the range.
* @len: The length of the range.
* @pprev: pointer to the pointer that will be set to previous vm_area_struct
* @rb_link: the rb_node
* @rb_parent: the parent rb_node
*
* Find all the vm_area_struct that overlap from @start to
* @end and munmap them. Set @pprev to the previous vm_area_struct.
*
* Returns: -ENOMEM on munmap failure or 0 on success.
*/
static inline int
munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
struct vm_area_struct **pprev, struct rb_node ***link,
struct rb_node **parent, struct list_head *uf)
{
while (find_vma_links(mm, start, start + len, pprev, link, parent))
if (do_munmap(mm, start, len, uf))
return -ENOMEM;
return 0;
}
static unsigned long count_vma_pages_range(struct mm_struct *mm,
unsigned long addr, unsigned long end)
{
@ -1128,10 +1172,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (vm_flags & VM_SPECIAL)
return NULL;
if (prev)
next = prev->vm_next;
else
next = mm->mmap;
next = vma_next(mm, prev);
area = next;
if (area && area->vm_end == end) /* cases 6, 7, 8 */
next = next->vm_next;
@ -1707,13 +1748,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
return -ENOMEM;
}
/* Clear old maps */
while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
&rb_parent)) {
if (do_munmap(mm, addr, len, uf))
return -ENOMEM;
}
/* Clear old maps, set up prev, rb_link, rb_parent, and uf */
if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
return -ENOMEM;
/*
* Private writable mapping: check memory availability
*/
@ -2632,7 +2669,7 @@ static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
struct vm_area_struct *next = vma_next(mm, prev);
struct mmu_gather tlb;
lru_add_drain();
@ -2831,7 +2868,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
if (error)
return error;
}
vma = prev ? prev->vm_next : mm->mmap;
vma = vma_next(mm, prev);
if (unlikely(uf)) {
/*
@ -3049,14 +3086,9 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
if (error)
return error;
/*
* Clear old maps. this also does some error checking for us
*/
while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
&rb_parent)) {
if (do_munmap(mm, addr, len, uf))
return -ENOMEM;
}
/* Clear old maps, set up prev, rb_link, rb_parent, and uf */
if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
return -ENOMEM;
/* Check against address space limits *after* clearing old maps... */
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))

View File

@ -354,13 +354,6 @@ void vm_unmap_aliases(void)
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
{
BUG();
return NULL;
}
EXPORT_SYMBOL_GPL(alloc_vm_area);
void free_vm_area(struct vm_struct *area)
{
BUG();

View File

@ -1584,8 +1584,7 @@ static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
{
struct obj_cgroup *objcg;
if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT) ||
memcg_kmem_bypass())
if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
return PCPU_CHUNK_ROOT;
objcg = get_obj_cgroup_from_current();

View File

@ -280,9 +280,6 @@ static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
{
struct obj_cgroup *objcg;
if (memcg_kmem_bypass())
return NULL;
objcg = get_obj_cgroup_from_current();
if (!objcg)
return NULL;

View File

@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/vmalloc.c
*
* Copyright (C) 1993 Linus Torvalds
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
@ -2321,20 +2319,21 @@ static void __vfree(const void *addr)
}
/**
* vfree - release memory allocated by vmalloc()
* @addr: memory base address
* vfree - Release memory allocated by vmalloc()
* @addr: Memory base address
*
* Free the virtually continuous memory area starting at @addr, as
* obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
* NULL, no operation is performed.
* Free the virtually continuous memory area starting at @addr, as obtained
* from one of the vmalloc() family of APIs. This will usually also free the
* physical memory underlying the virtual allocation, but that memory is
* reference counted, so it will not be freed until the last user goes away.
*
* Must not be called in NMI context (strictly speaking, only if we don't
* have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
* conventions for vfree() arch-dependent would be a really bad idea)
* If @addr is NULL, no operation is performed.
*
* Context:
* May sleep if called *not* from interrupt context.
*
* NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
* Must not be called in NMI context (strictly speaking, it could be
* if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
* conventions for vfree() arch-dependent would be a really bad idea).
*/
void vfree(const void *addr)
{
@ -2376,8 +2375,11 @@ EXPORT_SYMBOL(vunmap);
* @flags: vm_area->flags
* @prot: page protection for the mapping
*
* Maps @count pages from @pages into contiguous kernel virtual
* space.
* Maps @count pages from @pages into contiguous kernel virtual space.
* If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
* (which must be kmalloc or vmalloc memory) and one reference per page in it
* are transferred from the caller to vmap(), and will be freed / dropped when
* vfree() is called on the return value.
*
* Return: the address of the area or %NULL on failure
*/
@ -2403,28 +2405,73 @@ void *vmap(struct page **pages, unsigned int count,
return NULL;
}
if (flags & VM_MAP_PUT_PAGES)
area->pages = pages;
return area->addr;
}
EXPORT_SYMBOL(vmap);
#ifdef CONFIG_VMAP_PFN
struct vmap_pfn_data {
unsigned long *pfns;
pgprot_t prot;
unsigned int idx;
};
static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
{
struct vmap_pfn_data *data = private;
if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
return -EINVAL;
*pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
return 0;
}
/**
* vmap_pfn - map an array of PFNs into virtually contiguous space
* @pfns: array of PFNs
* @count: number of pages to map
* @prot: page protection for the mapping
*
* Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
* the start address of the mapping.
*/
void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
{
struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
struct vm_struct *area;
area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
__builtin_return_address(0));
if (!area)
return NULL;
if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
count * PAGE_SIZE, vmap_pfn_apply, &data)) {
free_vm_area(area);
return NULL;
}
return area->addr;
}
EXPORT_SYMBOL_GPL(vmap_pfn);
#endif /* CONFIG_VMAP_PFN */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
struct page **pages;
unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
0 :
__GFP_HIGHMEM;
unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
unsigned int array_size = nr_pages * sizeof(struct page *), i;
struct page **pages;
nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));
gfp_mask |= __GFP_NOWARN;
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
node, area->caller);
pages = __vmalloc_node(array_size, 1, nested_gfp, node,
area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
@ -2442,9 +2489,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page *page;
if (node == NUMA_NO_NODE)
page = alloc_page(alloc_mask|highmem_mask);
page = alloc_page(gfp_mask);
else
page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
page = alloc_pages_node(node, gfp_mask, 0);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vfree() */
@ -3032,54 +3079,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
static int f(pte_t *pte, unsigned long addr, void *data)
{
pte_t ***p = data;
if (p) {
*(*p) = pte;
(*p)++;
}
return 0;
}
/**
* alloc_vm_area - allocate a range of kernel address space
* @size: size of the area
* @ptes: returns the PTEs for the address space
*
* Returns: NULL on failure, vm_struct on success
*
* This function reserves a range of kernel address space, and
* allocates pagetables to map that range. No actual mappings
* are created.
*
* If @ptes is non-NULL, pointers to the PTEs (in init_mm)
* allocated for the VM area are returned.
*/
struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
{
struct vm_struct *area;
area = get_vm_area_caller(size, VM_IOREMAP,
__builtin_return_address(0));
if (area == NULL)
return NULL;
/*
* This ensures that page tables are constructed for this region
* of kernel virtual address space and mapped into init_mm.
*/
if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
size, f, ptes ? &ptes : NULL)) {
free_vm_area(area);
return NULL;
}
return area;
}
EXPORT_SYMBOL_GPL(alloc_vm_area);
void free_vm_area(struct vm_struct *area)
{
struct vm_struct *ret;
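
For illustration (a hypothetical kernel-side caller, not part of the patch): the two interfaces added above replace the remaining alloc_vm_area() use cases. vmap() with VM_MAP_PUT_PAGES transfers the page array and one reference per page to vmalloc, so a single vfree() tears down the mapping, the pages, and the array; vmap_pfn() covers memory without struct pages, as in the i915 conversion earlier. A sketch of the first pattern under the hypothetical name map_fresh_pages:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Sketch only: map @n new pages; the caller later frees everything with vfree(). */
static void *map_fresh_pages(unsigned int n)
{
        struct page **pages;
        unsigned int i;
        void *vaddr;

        pages = kvmalloc_array(n, sizeof(*pages), GFP_KERNEL);
        if (!pages)
                return NULL;

        for (i = 0; i < n; i++) {
                pages[i] = alloc_page(GFP_KERNEL);
                if (!pages[i])
                        goto err;
        }

        /* On success vmap() owns @pages and one reference per page. */
        vaddr = vmap(pages, n, VM_MAP_PUT_PAGES, PAGE_KERNEL);
        if (vaddr)
                return vaddr;   /* vfree(vaddr) later drops the pages and the array */

err:
        while (i--)
                __free_page(pages[i]);
        kvfree(pages);
        return NULL;
}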

View File

@ -1122,10 +1122,16 @@ static inline int __zs_cpu_up(struct mapping_area *area)
*/
if (area->vm)
return 0;
area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
area->vm = get_vm_area(PAGE_SIZE * 2, 0);
if (!area->vm)
return -ENOMEM;
return 0;
/*
* Populate ptes in advance to avoid pte allocation with GFP_KERNEL
* in non-preemptible context of zs_map_object.
*/
return apply_to_page_range(&init_mm, (unsigned long)area->vm->addr,
PAGE_SIZE * 2, NULL, NULL);
}
static inline void __zs_cpu_down(struct mapping_area *area)

View File

@ -45,7 +45,7 @@ struct hmm_buffer {
#define TWOMEG (1 << 21)
#define HMM_BUFFER_SIZE (1024 << 12)
#define HMM_PATH_MAX 64
#define NTIMES 256
#define NTIMES 10
#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))