Merge branch 'akpm' (patches from Andrew Morton)

Merge more patches from Andrew Morton:
 "The rest of MM.  Plus one misc cleanup"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (35 commits)
  mm/Kconfig: add MMU dependency for MIGRATION.
  kernel: replace strict_strto*() with kstrto*()
  mm, thp: count thp_fault_fallback anytime thp fault fails
  thp: consolidate code between handle_mm_fault() and do_huge_pmd_anonymous_page()
  thp: do_huge_pmd_anonymous_page() cleanup
  thp: move maybe_pmd_mkwrite() out of mk_huge_pmd()
  mm: cleanup add_to_page_cache_locked()
  thp: account anon transparent huge pages into NR_ANON_PAGES
  truncate: drop 'oldsize' truncate_pagecache() parameter
  mm: make lru_add_drain_all() selective
  memcg: document cgroup dirty/writeback memory statistics
  memcg: add per cgroup writeback pages accounting
  memcg: check for proper lock held in mem_cgroup_update_page_stat
  memcg: remove MEMCG_NR_FILE_MAPPED
  memcg: reduce function dereference
  memcg: avoid overflow caused by PAGE_ALIGN
  memcg: rename RESOURCE_MAX to RES_COUNTER_MAX
  memcg: correct RESOURCE_MAX to ULLONG_MAX
  mm: memcg: do not trap chargers with full callstack on OOM
  mm: memcg: rework and document OOM waiting and wakeup
  ...
This commit is contained in:
Linus Torvalds 2013-09-12 15:44:27 -07:00
commit ac4de9543a
79 changed files with 972 additions and 918 deletions

View File

@ -490,6 +490,8 @@ pgpgin - # of charging events to the memory cgroup. The charging
pgpgout - # of uncharging events to the memory cgroup. The uncharging
event happens each time a page is unaccounted from the cgroup.
swap - # of bytes of swap usage
writeback - # of bytes of file/anon cache that are queued for syncing to
disk.
inactive_anon - # of bytes of anonymous and swap cache memory on inactive
LRU list.
active_anon - # of bytes of anonymous and swap cache memory on active

View File

@ -89,8 +89,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
const struct exception_table_entry *fixup;
int fault, si_code = SEGV_MAPERR;
siginfo_t info;
unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(cause > 0 ? FAULT_FLAG_WRITE : 0));
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
/* As of EV6, a load into $31/$f31 is a prefetch, and never faults
(or is suppressed by the PALcode). Support that for older CPUs
@ -115,7 +114,8 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
if (address >= TASK_SIZE)
goto vmalloc_fault;
#endif
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
retry:
down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
@ -142,6 +142,7 @@ retry:
} else {
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
flags |= FAULT_FLAG_WRITE;
}
/* If for any reason at all we couldn't handle the fault,

View File

@ -60,8 +60,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address)
siginfo_t info;
int fault, ret;
int write = regs->ecr_cause & ECR_C_PROTV_STORE; /* ST/EX */
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write ? FAULT_FLAG_WRITE : 0);
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
/*
* We fault-in kernel-space virtual memory on-demand. The
@ -89,6 +88,8 @@ void do_page_fault(struct pt_regs *regs, unsigned long address)
if (in_atomic() || !mm)
goto no_context;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
retry:
down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
@ -117,12 +118,12 @@ good_area:
if (write) {
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
flags |= FAULT_FLAG_WRITE;
} else {
if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
goto bad_area;
}
survive:
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
@ -201,10 +202,6 @@ no_context:
die("Oops", regs, address);
out_of_memory:
if (is_global_init(tsk)) {
yield();
goto survive;
}
up_read(&mm->mmap_sem);
if (user_mode(regs)) {

View File

@ -261,9 +261,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
struct task_struct *tsk;
struct mm_struct *mm;
int fault, sig, code;
int write = fsr & FSR_WRITE;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write ? FAULT_FLAG_WRITE : 0);
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
if (notify_page_fault(regs, fsr))
return 0;
@ -282,6 +280,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
if (in_atomic() || !mm)
goto no_context;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
if (fsr & FSR_WRITE)
flags |= FAULT_FLAG_WRITE;
/*
* As per x86, we may deadlock here. However, since the kernel only
* validly references user space from well defined areas of the code,
@ -349,6 +352,13 @@ retry:
if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
return 0;
/*
* If we are in kernel mode at this point, we
* have no context to handle this fault with.
*/
if (!user_mode(regs))
goto no_context;
if (fault & VM_FAULT_OOM) {
/*
* We ran out of memory, call the OOM killer, and return to
@ -359,13 +369,6 @@ retry:
return 0;
}
/*
* If we are in kernel mode at this point, we
* have no context to handle this fault with.
*/
if (!user_mode(regs))
goto no_context;
if (fault & VM_FAULT_SIGBUS) {
/*
* We had some memory, but were unable to

View File

@ -199,13 +199,6 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
if (esr & ESR_LNX_EXEC) {
vm_flags = VM_EXEC;
} else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) {
vm_flags = VM_WRITE;
mm_flags |= FAULT_FLAG_WRITE;
}
tsk = current;
mm = tsk->mm;
@ -220,6 +213,16 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
if (in_atomic() || !mm)
goto no_context;
if (user_mode(regs))
mm_flags |= FAULT_FLAG_USER;
if (esr & ESR_LNX_EXEC) {
vm_flags = VM_EXEC;
} else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) {
vm_flags = VM_WRITE;
mm_flags |= FAULT_FLAG_WRITE;
}
/*
* As per x86, we may deadlock here. However, since the kernel only
* validly references user space from well defined areas of the code,
@ -288,6 +291,13 @@ retry:
VM_FAULT_BADACCESS))))
return 0;
/*
* If we are in kernel mode at this point, we have no context to
* handle this fault with.
*/
if (!user_mode(regs))
goto no_context;
if (fault & VM_FAULT_OOM) {
/*
* We ran out of memory, call the OOM killer, and return to
@ -298,13 +308,6 @@ retry:
return 0;
}
/*
* If we are in kernel mode at this point, we have no context to
* handle this fault with.
*/
if (!user_mode(regs))
goto no_context;
if (fault & VM_FAULT_SIGBUS) {
/*
* We had some memory, but were unable to successfully fix up

View File

@ -86,6 +86,8 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs)
local_irq_enable();
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
retry:
down_read(&mm->mmap_sem);
@ -228,9 +230,9 @@ no_context:
*/
out_of_memory:
up_read(&mm->mmap_sem);
pagefault_out_of_memory();
if (!user_mode(regs))
goto no_context;
pagefault_out_of_memory();
return;
do_sigbus:

View File

@ -58,8 +58,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
struct vm_area_struct * vma;
siginfo_t info;
int fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
((writeaccess & 1) ? FAULT_FLAG_WRITE : 0);
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
D(printk(KERN_DEBUG
"Page fault for %lX on %X at %lX, prot %d write %d\n",
@ -117,6 +116,8 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
if (in_atomic() || !mm)
goto no_context;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
retry:
down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
@ -155,6 +156,7 @@ retry:
} else if (writeaccess == 1) {
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
flags |= FAULT_FLAG_WRITE;
} else {
if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
goto bad_area;

View File

@ -34,11 +34,11 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long _pme, lrai, lrad, fixup;
unsigned long flags = 0;
siginfo_t info;
pgd_t *pge;
pud_t *pue;
pte_t *pte;
int write;
int fault;
#if 0
@ -81,6 +81,9 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
if (in_atomic() || !mm)
goto no_context;
if (user_mode(__frame))
flags |= FAULT_FLAG_USER;
down_read(&mm->mmap_sem);
vma = find_vma(mm, ear0);
@ -129,7 +132,6 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
*/
good_area:
info.si_code = SEGV_ACCERR;
write = 0;
switch (esr0 & ESR0_ATXC) {
default:
/* handle write to write protected page */
@ -140,7 +142,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
#endif
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
write = 1;
flags |= FAULT_FLAG_WRITE;
break;
/* handle read from protected page */
@ -162,7 +164,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
fault = handle_mm_fault(mm, vma, ear0, write ? FAULT_FLAG_WRITE : 0);
fault = handle_mm_fault(mm, vma, ear0, flags);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;

View File

@ -53,8 +53,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
int si_code = SEGV_MAPERR;
int fault;
const struct exception_table_entry *fixup;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(cause > 0 ? FAULT_FLAG_WRITE : 0);
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
/*
* If we're in an interrupt or have no user context,
@ -65,6 +64,8 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
local_irq_enable();
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
retry:
down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
@ -96,6 +97,7 @@ good_area:
case FLT_STORE:
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
flags |= FAULT_FLAG_WRITE;
break;
}

View File

@ -90,8 +90,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
| (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT));
flags |= ((mask & VM_WRITE) ? FAULT_FLAG_WRITE : 0);
/* mmap_sem is performance critical.... */
prefetchw(&mm->mmap_sem);
@ -119,6 +117,10 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
if (notify_page_fault(regs, TRAP_BRKPT))
return;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
if (mask & VM_WRITE)
flags |= FAULT_FLAG_WRITE;
retry:
down_read(&mm->mmap_sem);

View File

@ -78,7 +78,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
struct mm_struct *mm;
struct vm_area_struct * vma;
unsigned long page, addr;
int write;
unsigned long flags = 0;
int fault;
siginfo_t info;
@ -117,6 +117,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (in_atomic() || !mm)
goto bad_area_nosemaphore;
if (error_code & ACE_USERMODE)
flags |= FAULT_FLAG_USER;
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
@ -166,14 +169,13 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
*/
good_area:
info.si_code = SEGV_ACCERR;
write = 0;
switch (error_code & (ACE_WRITE|ACE_PROTECTION)) {
default: /* 3: write, present */
/* fall through */
case ACE_WRITE: /* write, not present */
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
write++;
flags |= FAULT_FLAG_WRITE;
break;
case ACE_PROTECTION: /* read, present */
case 0: /* read, not present */
@ -194,7 +196,7 @@ good_area:
*/
addr = (address & PAGE_MASK);
set_thread_fault_code(error_code);
fault = handle_mm_fault(mm, vma, addr, write ? FAULT_FLAG_WRITE : 0);
fault = handle_mm_fault(mm, vma, addr, flags);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;

View File

@ -88,6 +88,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
if (in_atomic() || !mm)
goto no_context;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
retry:
down_read(&mm->mmap_sem);

View File

@ -53,8 +53,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
struct vm_area_struct *vma, *prev_vma;
siginfo_t info;
int fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write_access ? FAULT_FLAG_WRITE : 0);
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
@ -109,6 +108,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
if (in_atomic() || !mm)
goto no_context;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
retry:
down_read(&mm->mmap_sem);
@ -121,6 +122,7 @@ good_area:
if (write_access) {
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
flags |= FAULT_FLAG_WRITE;
} else {
if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
goto bad_area;

View File

@ -92,8 +92,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
int code = SEGV_MAPERR;
int is_write = error_code & ESR_S;
int fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(is_write ? FAULT_FLAG_WRITE : 0);
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
regs->ear = address;
regs->esr = error_code;
@ -121,6 +120,9 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
die("Weird page fault", regs, SIGSEGV);
}
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
@ -199,6 +201,7 @@ good_area:
if (unlikely(is_write)) {
if (unlikely(!(vma->vm_flags & VM_WRITE)))
goto bad_area;
flags |= FAULT_FLAG_WRITE;
/* a read */
} else {
/* protection fault */

View File

@ -42,8 +42,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
const int field = sizeof(unsigned long) * 2;
siginfo_t info;
int fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write ? FAULT_FLAG_WRITE : 0);
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
#if 0
printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(),
@ -93,6 +92,8 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
if (in_atomic() || !mm)
goto bad_area_nosemaphore;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
retry:
down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
@ -114,6 +115,7 @@ good_area:
if (write) {
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
flags |= FAULT_FLAG_WRITE;
} else {
if (cpu_has_rixi) {
if (address == regs->cp0_epc && !(vma->vm_flags & VM_EXEC)) {
@ -241,6 +243,8 @@ out_of_memory:
* (which will retry the fault, or kill us if we got oom-killed).
*/
up_read(&mm->mmap_sem);
if (!user_mode(regs))
goto no_context;
pagefault_out_of_memory();
return;

View File

@ -171,6 +171,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code,
if (in_atomic() || !mm)
goto no_context;
if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
flags |= FAULT_FLAG_USER;
retry:
down_read(&mm->mmap_sem);

View File

@ -86,6 +86,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
if (user_mode(regs)) {
/* Exception was in userspace: reenable interrupts */
local_irq_enable();
flags |= FAULT_FLAG_USER;
} else {
/* If exception was in a syscall, then IRQ's may have
* been enabled or disabled. If they were enabled,

View File

@ -180,6 +180,10 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
if (in_atomic() || !mm)
goto no_context;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
if (acc_type & VM_WRITE)
flags |= FAULT_FLAG_WRITE;
retry:
down_read(&mm->mmap_sem);
vma = find_vma_prev(mm, address, &prev_vma);
@ -203,8 +207,7 @@ good_area:
* fault.
*/
fault = handle_mm_fault(mm, vma, address,
flags | ((acc_type & VM_WRITE) ? FAULT_FLAG_WRITE : 0));
fault = handle_mm_fault(mm, vma, address, flags);
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
return;

View File

@ -223,9 +223,6 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
is_write = error_code & ESR_DST;
#endif /* CONFIG_4xx || CONFIG_BOOKE */
if (is_write)
flags |= FAULT_FLAG_WRITE;
#ifdef CONFIG_PPC_ICSWX
/*
* we need to do this early because this "data storage
@ -288,6 +285,9 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
if (user_mode(regs))
store_update_sp = store_updates_sp(regs);
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
@ -415,6 +415,7 @@ good_area:
} else if (is_write) {
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
flags |= FAULT_FLAG_WRITE;
/* a read */
} else {
/* protection fault */

View File

@ -302,6 +302,8 @@ static inline int do_exception(struct pt_regs *regs, int access)
address = trans_exc_code & __FAIL_ADDR_MASK;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
flags |= FAULT_FLAG_WRITE;
down_read(&mm->mmap_sem);

View File

@ -47,6 +47,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
const int field = sizeof(unsigned long) * 2;
unsigned long flags = 0;
siginfo_t info;
int fault;
@ -75,6 +76,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
if (in_atomic() || !mm)
goto bad_area_nosemaphore;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
if (!vma)
@ -95,18 +99,18 @@ good_area:
if (write) {
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
flags |= FAULT_FLAG_WRITE;
} else {
if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
goto bad_area;
}
survive:
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
fault = handle_mm_fault(mm, vma, address, write);
fault = handle_mm_fault(mm, vma, address, flags);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
@ -167,11 +171,6 @@ no_context:
*/
out_of_memory:
up_read(&mm->mmap_sem);
if (is_global_init(tsk)) {
yield();
down_read(&mm->mmap_sem);
goto survive;
}
if (!user_mode(regs))
goto no_context;
pagefault_out_of_memory();

View File

@ -400,9 +400,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
struct mm_struct *mm;
struct vm_area_struct * vma;
int fault;
int write = error_code & FAULT_CODE_WRITE;
unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write ? FAULT_FLAG_WRITE : 0));
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
mm = tsk->mm;
@ -476,6 +474,11 @@ good_area:
set_thread_fault_code(error_code);
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
if (error_code & FAULT_CODE_WRITE)
flags |= FAULT_FLAG_WRITE;
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo

View File

@ -177,8 +177,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
unsigned long g2;
int from_user = !(regs->psr & PSR_PS);
int fault, code;
unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write ? FAULT_FLAG_WRITE : 0));
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
if (text_fault)
address = regs->pc;
@ -235,6 +234,11 @@ good_area:
goto bad_area;
}
if (from_user)
flags |= FAULT_FLAG_USER;
if (write)
flags |= FAULT_FLAG_WRITE;
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
@ -383,6 +387,7 @@ static void force_user_fault(unsigned long address, int write)
struct vm_area_struct *vma;
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
unsigned int flags = FAULT_FLAG_USER;
int code;
code = SEGV_MAPERR;
@ -402,11 +407,12 @@ good_area:
if (write) {
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
flags |= FAULT_FLAG_WRITE;
} else {
if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
goto bad_area;
}
switch (handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0)) {
switch (handle_mm_fault(mm, vma, address, flags)) {
case VM_FAULT_SIGBUS:
case VM_FAULT_OOM:
goto do_sigbus;

View File

@ -315,7 +315,8 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
bad_kernel_pc(regs, address);
return;
}
}
} else
flags |= FAULT_FLAG_USER;
/*
* If we're in an interrupt or have no user
@ -418,13 +419,14 @@ good_area:
vma->vm_file != NULL)
set_thread_fault_code(fault_code |
FAULT_CODE_BLKCOMMIT);
flags |= FAULT_FLAG_WRITE;
} else {
/* Allow reads even for write-only mappings */
if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
goto bad_area;
}
flags |= ((fault_code & FAULT_CODE_WRITE) ? FAULT_FLAG_WRITE : 0);
fault = handle_mm_fault(mm, vma, address, flags);
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))

View File

@ -280,8 +280,7 @@ static int handle_page_fault(struct pt_regs *regs,
if (!is_page_fault)
write = 1;
flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write ? FAULT_FLAG_WRITE : 0));
flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
is_kernel_mode = !user_mode(regs);
@ -365,6 +364,9 @@ static int handle_page_fault(struct pt_regs *regs,
goto bad_area_nosemaphore;
}
if (!is_kernel_mode)
flags |= FAULT_FLAG_USER;
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
@ -425,12 +427,12 @@ good_area:
#endif
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
flags |= FAULT_FLAG_WRITE;
} else {
if (!is_page_fault || !(vma->vm_flags & VM_READ))
goto bad_area;
}
survive:
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
@ -555,11 +557,6 @@ no_context:
*/
out_of_memory:
up_read(&mm->mmap_sem);
if (is_global_init(tsk)) {
yield();
down_read(&mm->mmap_sem);
goto survive;
}
if (is_kernel_mode)
goto no_context;
pagefault_out_of_memory();

View File

@ -30,8 +30,7 @@ int handle_page_fault(unsigned long address, unsigned long ip,
pmd_t *pmd;
pte_t *pte;
int err = -EFAULT;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(is_write ? FAULT_FLAG_WRITE : 0);
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
*code_out = SEGV_MAPERR;
@ -42,6 +41,8 @@ int handle_page_fault(unsigned long address, unsigned long ip,
if (in_atomic())
goto out_nosemaphore;
if (is_user)
flags |= FAULT_FLAG_USER;
retry:
down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
@ -58,12 +59,15 @@ retry:
good_area:
*code_out = SEGV_ACCERR;
if (is_write && !(vma->vm_flags & VM_WRITE))
goto out;
/* Don't require VM_READ|VM_EXEC for write faults! */
if (!is_write && !(vma->vm_flags & (VM_READ | VM_EXEC)))
goto out;
if (is_write) {
if (!(vma->vm_flags & VM_WRITE))
goto out;
flags |= FAULT_FLAG_WRITE;
} else {
/* Don't require VM_READ|VM_EXEC for write faults! */
if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
goto out;
}
do {
int fault;
@ -124,6 +128,8 @@ out_of_memory:
* (which will retry the fault, or kill us if we got oom-killed).
*/
up_read(&mm->mmap_sem);
if (!is_user)
goto out_nosemaphore;
pagefault_out_of_memory();
return 0;
}

View File

@ -209,8 +209,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
struct task_struct *tsk;
struct mm_struct *mm;
int fault, sig, code;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
((!(fsr ^ 0x12)) ? FAULT_FLAG_WRITE : 0);
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
mm = tsk->mm;
@ -222,6 +221,11 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
if (in_atomic() || !mm)
goto no_context;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
if (!(fsr ^ 0x12))
flags |= FAULT_FLAG_WRITE;
/*
* As per x86, we may deadlock here. However, since the kernel only
* validly references user space from well defined areas of the code,
@ -278,6 +282,13 @@ retry:
(VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
return 0;
/*
* If we are in kernel mode at this point, we
* have no context to handle this fault with.
*/
if (!user_mode(regs))
goto no_context;
if (fault & VM_FAULT_OOM) {
/*
* We ran out of memory, call the OOM killer, and return to
@ -288,13 +299,6 @@ retry:
return 0;
}
/*
* If we are in kernel mode at this point, we
* have no context to handle this fault with.
*/
if (!user_mode(regs))
goto no_context;
if (fault & VM_FAULT_SIGBUS) {
/*
* We had some memory, but were unable to

View File

@ -842,23 +842,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
force_sig_info_fault(SIGBUS, code, address, tsk, fault);
}
static noinline int
static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, unsigned int fault)
{
/*
* Pagefault was interrupted by SIGKILL. We have no reason to
* continue pagefault.
*/
if (fatal_signal_pending(current)) {
if (!(fault & VM_FAULT_RETRY))
up_read(&current->mm->mmap_sem);
if (!(error_code & PF_USER))
no_context(regs, error_code, address, 0, 0);
return 1;
if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
up_read(&current->mm->mmap_sem);
no_context(regs, error_code, address, 0, 0);
return;
}
if (!(fault & VM_FAULT_ERROR))
return 0;
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
@ -866,7 +858,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
up_read(&current->mm->mmap_sem);
no_context(regs, error_code, address,
SIGSEGV, SEGV_MAPERR);
return 1;
return;
}
up_read(&current->mm->mmap_sem);
@ -884,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
else
BUG();
}
return 1;
}
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@ -1011,9 +1002,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
unsigned long address;
struct mm_struct *mm;
int fault;
int write = error_code & PF_WRITE;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write ? FAULT_FLAG_WRITE : 0);
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
mm = tsk->mm;
@ -1083,6 +1072,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (user_mode_vm(regs)) {
local_irq_enable();
error_code |= PF_USER;
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
@ -1109,6 +1099,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
return;
}
if (error_code & PF_WRITE)
flags |= FAULT_FLAG_WRITE;
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in
@ -1187,9 +1180,17 @@ good_area:
*/
fault = handle_mm_fault(mm, vma, address, flags);
if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
if (mm_fault_error(regs, error_code, address, fault))
return;
/*
* If we need to retry but a fatal signal is pending, handle the
* signal first. We do not need to release the mmap_sem because it
* would already be released in __lock_page_or_retry in mm/filemap.c.
*/
if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)))
return;
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
return;
}
/*

View File

@ -72,6 +72,8 @@ void do_page_fault(struct pt_regs *regs)
address, exccause, regs->pc, is_write? "w":"", is_exec? "x":"");
#endif
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
retry:
down_read(&mm->mmap_sem);
vma = find_vma(mm, address);

View File

@ -125,13 +125,7 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(node_page_state(nid, NR_WRITEBACK)),
nid, K(node_page_state(nid, NR_FILE_PAGES)),
nid, K(node_page_state(nid, NR_FILE_MAPPED)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
nid, K(node_page_state(nid, NR_ANON_PAGES)
+ node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) *
HPAGE_PMD_NR),
#else
nid, K(node_page_state(nid, NR_ANON_PAGES)),
#endif
nid, K(node_page_state(nid, NR_SHMEM)),
nid, node_page_state(nid, NR_KERNEL_STACK) *
THREAD_SIZE / 1024,

View File

@ -50,7 +50,7 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
if (to > inode->i_size)
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
}
static int adfs_write_begin(struct file *file, struct address_space *mapping,

View File

@ -406,7 +406,7 @@ static void affs_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
if (to > inode->i_size) {
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
affs_truncate(inode);
}
}

View File

@ -166,7 +166,7 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
if (to > inode->i_size)
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
}
static int bfs_write_begin(struct file *file, struct address_space *mapping,

View File

@ -221,12 +221,10 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct btrfs_path *path,
struct inode *inode)
{
loff_t oldsize;
int ret = 0;
oldsize = i_size_read(inode);
btrfs_i_size_write(inode, 0);
truncate_pagecache(inode, oldsize, 0);
truncate_pagecache(inode, 0);
/*
* We don't need an orphan item because truncating the free space cache

View File

@ -4349,7 +4349,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
if (newsize > oldsize) {
truncate_pagecache(inode, oldsize, newsize);
truncate_pagecache(inode, newsize);
ret = btrfs_cont_expand(inode, oldsize, newsize);
if (ret)
return ret;

View File

@ -1856,14 +1856,11 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
static void cifs_setsize(struct inode *inode, loff_t offset)
{
loff_t oldsize;
spin_lock(&inode->i_lock);
oldsize = inode->i_size;
i_size_write(inode, offset);
spin_unlock(&inode->i_lock);
truncate_pagecache(inode, oldsize, offset);
truncate_pagecache(inode, offset);
}
static int

View File

@ -861,7 +861,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
static void _write_failed(struct inode *inode, loff_t to)
{
if (to > inode->i_size)
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
}
int exofs_write_begin(struct file *file, struct address_space *mapping,

View File

@ -58,7 +58,7 @@ static void ext2_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
if (to > inode->i_size) {
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
ext2_truncate_blocks(inode, inode->i_size);
}
}

View File

@ -4587,7 +4587,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
handle_t *handle;
loff_t oldsize = inode->i_size;
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@ -4650,7 +4649,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
* Truncate pagecache after we've waited for commit
* in data=journal mode to make pages freeable.
*/
truncate_pagecache(inode, oldsize, inode->i_size);
truncate_pagecache(inode, inode->i_size);
}
/*
* We want to call ext4_truncate() even if attr->ia_size ==

View File

@ -147,7 +147,7 @@ static void fat_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
if (to > inode->i_size) {
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
fat_truncate_blocks(inode, inode->i_size);
}
}

View File

@ -1678,7 +1678,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
* FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
*/
if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
truncate_pagecache(inode, oldsize, outarg.attr.size);
truncate_pagecache(inode, outarg.attr.size);
invalidate_inode_pages2(inode->i_mapping);
}

View File

@ -218,7 +218,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
bool inval = false;
if (oldsize != attr->size) {
truncate_pagecache(inode, oldsize, attr->size);
truncate_pagecache(inode, attr->size);
inval = true;
} else if (fc->auto_inval_data) {
struct timespec new_mtime = {

View File

@ -1016,7 +1016,7 @@ static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize
chunk = oldsize - newsize;
if (chunk > max_chunk)
chunk = max_chunk;
truncate_pagecache(inode, oldsize, oldsize - chunk);
truncate_pagecache(inode, oldsize - chunk);
oldsize -= chunk;
gfs2_trans_end(sdp);
error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
@ -1067,7 +1067,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
if (journaled)
error = gfs2_journaled_truncate(inode, oldsize, newsize);
else
truncate_pagecache(inode, oldsize, newsize);
truncate_pagecache(inode, newsize);
if (error) {
brelse(dibh);

View File

@ -41,7 +41,7 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
if (to > inode->i_size) {
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
hfs_file_truncate(inode);
}
}

View File

@ -36,7 +36,7 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
if (to > inode->i_size) {
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
hfsplus_file_truncate(inode);
}
}

View File

@ -138,7 +138,7 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to)
hpfs_lock(inode->i_sb);
if (to > inode->i_size) {
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
hpfs_truncate(inode);
}

View File

@ -306,7 +306,7 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
if (to > inode->i_size) {
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
jfs_truncate(inode);
}
}

View File

@ -400,7 +400,7 @@ static void minix_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
if (to > inode->i_size) {
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
minix_truncate(inode);
}
}

View File

@ -541,7 +541,6 @@ EXPORT_SYMBOL_GPL(nfs_setattr);
*/
static int nfs_vmtruncate(struct inode * inode, loff_t offset)
{
loff_t oldsize;
int err;
err = inode_newsize_ok(inode, offset);
@ -549,11 +548,10 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
goto out;
spin_lock(&inode->i_lock);
oldsize = inode->i_size;
i_size_write(inode, offset);
spin_unlock(&inode->i_lock);
truncate_pagecache(inode, oldsize, offset);
truncate_pagecache(inode, offset);
out:
return err;
}

View File

@ -254,7 +254,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
if (to > inode->i_size) {
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
nilfs_truncate(inode);
}
}

View File

@ -1768,7 +1768,7 @@ static void ntfs_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
if (to > inode->i_size) {
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
ntfs_truncate_vfs(inode);
}
}

View File

@ -311,7 +311,7 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
if (to > inode->i_size) {
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
omfs_truncate(inode);
}
}

View File

@ -132,13 +132,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(i.freeswap),
K(global_page_state(NR_FILE_DIRTY)),
K(global_page_state(NR_WRITEBACK)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
K(global_page_state(NR_ANON_PAGES)
+ global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
HPAGE_PMD_NR),
#else
K(global_page_state(NR_ANON_PAGES)),
#endif
K(global_page_state(NR_FILE_MAPPED)),
K(global_page_state(NR_SHMEM)),
K(global_page_state(NR_SLAB_RECLAIMABLE) +

View File

@ -469,7 +469,7 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
if (to > inode->i_size) {
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
sysv_truncate(inode);
}
}

View File

@ -172,7 +172,7 @@ static void udf_write_failed(struct address_space *mapping, loff_t to)
loff_t isize = inode->i_size;
if (to > isize) {
truncate_pagecache(inode, to, isize);
truncate_pagecache(inode, isize);
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
down_write(&iinfo->i_data_sem);
udf_clear_extent_cache(inode);

View File

@ -531,7 +531,7 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
if (to > inode->i_size)
truncate_pagecache(inode, to, inode->i_size);
truncate_pagecache(inode, inode->i_size);
}
static int ufs_write_begin(struct file *file, struct address_space *mapping,

View File

@ -1582,7 +1582,7 @@ xfs_vm_write_begin(
unlock_page(page);
if (pos + len > i_size_read(inode))
truncate_pagecache(inode, pos + len, i_size_read(inode));
truncate_pagecache(inode, i_size_read(inode));
page_cache_release(page);
page = NULL;
@ -1618,7 +1618,7 @@ xfs_vm_write_end(
loff_t to = pos + len;
if (to > isize) {
truncate_pagecache(inode, to, isize);
truncate_pagecache(inode, isize);
xfs_vm_kill_delalloc_range(inode, isize, to);
}
}

View File

@ -96,9 +96,6 @@ extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end);
extern int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags);
extern int split_huge_page_to_list(struct page *page, struct list_head *list);
static inline int split_huge_page(struct page *page)
{

View File

@ -30,9 +30,21 @@ struct page;
struct mm_struct;
struct kmem_cache;
/* Stats that can be updated by kernel. */
enum mem_cgroup_page_stat_item {
MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
/*
* The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c,
* These two lists should keep in accord with each other.
*/
enum mem_cgroup_stat_index {
/*
* For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
*/
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */
MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
MEM_CGROUP_STAT_NSTATS,
};
struct mem_cgroup_reclaim_cookie {
@ -41,6 +53,23 @@ struct mem_cgroup_reclaim_cookie {
unsigned int generation;
};
enum mem_cgroup_filter_t {
VISIT, /* visit current node */
SKIP, /* skip the current node and continue traversal */
SKIP_TREE, /* skip the whole subtree and continue traversal */
};
/*
* mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to
* iterate through the hierarchy tree. Each tree element is checked by the
* predicate before it is returned by the iterator. If a filter returns
* SKIP or SKIP_TREE then the iterator code continues traversal (with the
* next node down the hierarchy or the next node that doesn't belong under the
* memcg's subtree).
*/
typedef enum mem_cgroup_filter_t
(*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root);
#ifdef CONFIG_MEMCG
/*
* All "charge" functions with gfp_mask should use GFP_KERNEL or
@ -108,9 +137,18 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
struct page *oldpage, struct page *newpage, bool migration_ok);
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
struct mem_cgroup *,
struct mem_cgroup_reclaim_cookie *);
struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim,
mem_cgroup_iter_filter cond);
static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
{
return mem_cgroup_iter_cond(root, prev, reclaim, NULL);
}
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
/*
@ -125,6 +163,48 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
extern void mem_cgroup_replace_page_cache(struct page *oldpage,
struct page *newpage);
/**
* mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task
* @new: true to enable, false to disable
*
* Toggle whether a failed memcg charge should invoke the OOM killer
* or just return -ENOMEM. Returns the previous toggle state.
*
* NOTE: Any path that enables the OOM killer before charging must
* call mem_cgroup_oom_synchronize() afterward to finalize the
* OOM handling and clean up.
*/
static inline bool mem_cgroup_toggle_oom(bool new)
{
bool old;
old = current->memcg_oom.may_oom;
current->memcg_oom.may_oom = new;
return old;
}
static inline void mem_cgroup_enable_oom(void)
{
bool old = mem_cgroup_toggle_oom(true);
WARN_ON(old == true);
}
static inline void mem_cgroup_disable_oom(void)
{
bool old = mem_cgroup_toggle_oom(false);
WARN_ON(old == false);
}
static inline bool task_in_memcg_oom(struct task_struct *p)
{
return p->memcg_oom.in_memcg_oom;
}
bool mem_cgroup_oom_synchronize(void);
#ifdef CONFIG_MEMCG_SWAP
extern int do_swap_account;
#endif
@ -165,24 +245,24 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page,
}
void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx,
enum mem_cgroup_stat_index idx,
int val);
static inline void mem_cgroup_inc_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx)
enum mem_cgroup_stat_index idx)
{
mem_cgroup_update_page_stat(page, idx, 1);
}
static inline void mem_cgroup_dec_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx)
enum mem_cgroup_stat_index idx)
{
mem_cgroup_update_page_stat(page, idx, -1);
}
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
enum mem_cgroup_filter_t
mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
struct mem_cgroup *root);
void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
@ -296,6 +376,15 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
struct page *oldpage, struct page *newpage, bool migration_ok)
{
}
static inline struct mem_cgroup *
mem_cgroup_iter_cond(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim,
mem_cgroup_iter_filter cond)
{
/* first call must return non-NULL, second return NULL */
return (struct mem_cgroup *)(unsigned long)!prev;
}
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
@ -348,22 +437,45 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page,
{
}
static inline bool mem_cgroup_toggle_oom(bool new)
{
return false;
}
static inline void mem_cgroup_enable_oom(void)
{
}
static inline void mem_cgroup_disable_oom(void)
{
}
static inline bool task_in_memcg_oom(struct task_struct *p)
{
return false;
}
static inline bool mem_cgroup_oom_synchronize(void)
{
return false;
}
static inline void mem_cgroup_inc_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx)
enum mem_cgroup_stat_index idx)
{
}
static inline void mem_cgroup_dec_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx)
enum mem_cgroup_stat_index idx)
{
}
static inline
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask,
unsigned long *total_scanned)
enum mem_cgroup_filter_t
mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
struct mem_cgroup *root)
{
return 0;
return VISIT;
}
static inline void mem_cgroup_split_huge_fixup(struct page *head)

View File

@ -176,6 +176,7 @@ extern pgprot_t protection_map[16];
#define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */
#define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */
#define FAULT_FLAG_TRIED 0x40 /* second try */
#define FAULT_FLAG_USER 0x80 /* The fault originated in userspace */
/*
* vm_fault is filled by the the pagefault handler and passed to the vma's
@ -876,11 +877,12 @@ static inline int page_mapped(struct page *page)
#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */
#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
VM_FAULT_HWPOISON_LARGE)
VM_FAULT_FALLBACK | VM_FAULT_HWPOISON_LARGE)
/* Encode hstate index for a hwpoisoned large page */
#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
@ -984,7 +986,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
unmap_mapping_range(mapping, holebegin, holelen, 0);
}
extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new);
extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
int truncate_inode_page(struct address_space *mapping, struct page *page);

View File

@ -54,7 +54,7 @@ struct res_counter {
struct res_counter *parent;
};
#define RESOURCE_MAX (unsigned long long)LLONG_MAX
#define RES_COUNTER_MAX ULLONG_MAX
/**
* Helpers to interact with userspace

View File

@ -1393,6 +1393,13 @@ struct task_struct {
unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
} memcg_batch;
unsigned int memcg_kmem_skip_account;
struct memcg_oom_info {
unsigned int may_oom:1;
unsigned int in_memcg_oom:1;
unsigned int oom_locked:1;
int wakeups;
struct mem_cgroup *wait_on_memcg;
} memcg_oom;
#endif
#ifdef CONFIG_UPROBES
struct uprobe_task *utask;

View File

@ -280,7 +280,7 @@ extern void activate_page(struct page *);
extern void mark_page_accessed(struct page *);
extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern int lru_add_drain_all(void);
extern void lru_add_drain_all(void);
extern void rotate_reclaimable_page(struct page *page);
extern void deactivate_page(struct page *page);
extern void swap_setup(void);

View File

@ -74,7 +74,7 @@ static int __init gcov_persist_setup(char *str)
{
unsigned long val;
if (strict_strtoul(str, 0, &val)) {
if (kstrtoul(str, 0, &val)) {
pr_warning("invalid gcov_persist parameter '%s'\n", str);
return 0;
}

View File

@ -113,7 +113,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
unsigned long cnt;
int ret;
if (strict_strtoul(buf, 0, &cnt))
if (kstrtoul(buf, 0, &cnt))
return -EINVAL;
ret = crash_shrink_memory(cnt);

View File

@ -253,13 +253,13 @@ int parse_args(const char *doing,
EXPORT_SYMBOL(param_ops_##name)
STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, strict_strtoul);
STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul);
STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol);
STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul);
STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtoul);
STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul);
STANDARD_PARAM_DEF(int, int, "%i", long, kstrtoul);
STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul);
STANDARD_PARAM_DEF(long, long, "%li", long, kstrtoul);
STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul);
int param_set_charp(const char *val, const struct kernel_param *kp)
{

View File

@ -17,8 +17,8 @@
void res_counter_init(struct res_counter *counter, struct res_counter *parent)
{
spin_lock_init(&counter->lock);
counter->limit = RESOURCE_MAX;
counter->soft_limit = RESOURCE_MAX;
counter->limit = RES_COUNTER_MAX;
counter->soft_limit = RES_COUNTER_MAX;
counter->parent = parent;
}
@ -178,23 +178,30 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
#endif
int res_counter_memparse_write_strategy(const char *buf,
unsigned long long *res)
unsigned long long *resp)
{
char *end;
unsigned long long res;
/* return RESOURCE_MAX(unlimited) if "-1" is specified */
/* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
if (*buf == '-') {
*res = simple_strtoull(buf + 1, &end, 10);
if (*res != 1 || *end != '\0')
res = simple_strtoull(buf + 1, &end, 10);
if (res != 1 || *end != '\0')
return -EINVAL;
*res = RESOURCE_MAX;
*resp = RES_COUNTER_MAX;
return 0;
}
*res = memparse(buf, &end);
res = memparse(buf, &end);
if (*end != '\0')
return -EINVAL;
*res = PAGE_ALIGN(*res);
if (PAGE_ALIGN(res) >= res)
res = PAGE_ALIGN(res);
else
res = RES_COUNTER_MAX;
*resp = res;
return 0;
}

View File

@ -245,7 +245,7 @@ config COMPACTION
config MIGRATION
bool "Page migration"
def_bool y
depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA
depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU
help
Allows the migration of the physical location of pages of processes
while the virtual addresses are not changed. This is useful in
@ -480,7 +480,7 @@ config FRONTSWAP
config CMA
bool "Contiguous Memory Allocator"
depends on HAVE_MEMBLOCK
depends on HAVE_MEMBLOCK && MMU
select MIGRATION
select MEMORY_ISOLATION
help

View File

@ -467,32 +467,34 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
error = mem_cgroup_cache_charge(page, current->mm,
gfp_mask & GFP_RECLAIM_MASK);
if (error)
goto out;
return error;
error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error == 0) {
page_cache_get(page);
page->mapping = mapping;
page->index = offset;
spin_lock_irq(&mapping->tree_lock);
error = radix_tree_insert(&mapping->page_tree, offset, page);
if (likely(!error)) {
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
trace_mm_filemap_add_to_page_cache(page);
} else {
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
page_cache_release(page);
}
radix_tree_preload_end();
} else
if (error) {
mem_cgroup_uncharge_cache_page(page);
out:
return error;
}
page_cache_get(page);
page->mapping = mapping;
page->index = offset;
spin_lock_irq(&mapping->tree_lock);
error = radix_tree_insert(&mapping->page_tree, offset, page);
radix_tree_preload_end();
if (unlikely(error))
goto err_insert;
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
trace_mm_filemap_add_to_page_cache(page);
return 0;
err_insert:
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
page_cache_release(page);
return error;
}
EXPORT_SYMBOL(add_to_page_cache_locked);
@ -1614,6 +1616,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
struct page *page;
bool memcg_oom;
pgoff_t size;
int ret = 0;
@ -1622,7 +1625,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
/*
* Do we have something in the page cache already?
* Do we have something in the page cache already? Either
* way, try readahead, but disable the memcg OOM killer for it
* as readahead is optional and no errors are propagated up
* the fault stack. The OOM killer is enabled while trying to
* instantiate the faulting page individually below.
*/
page = find_get_page(mapping, offset);
if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
@ -1630,10 +1637,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
* We found the page, so try async readahead before
* waiting for the lock.
*/
memcg_oom = mem_cgroup_toggle_oom(false);
do_async_mmap_readahead(vma, ra, file, page, offset);
mem_cgroup_toggle_oom(memcg_oom);
} else if (!page) {
/* No page in the page cache at all */
memcg_oom = mem_cgroup_toggle_oom(false);
do_sync_mmap_readahead(vma, ra, file, offset);
mem_cgroup_toggle_oom(memcg_oom);
count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;

View File

@ -695,11 +695,10 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
return pmd;
}
static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
{
pmd_t entry;
entry = mk_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
entry = mk_pmd(page, prot);
entry = pmd_mkhuge(entry);
return entry;
}
@ -732,7 +731,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pte_free(mm, pgtable);
} else {
pmd_t entry;
entry = mk_huge_pmd(page, vma);
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
@ -788,77 +788,57 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *page;
unsigned long haddr = address & HPAGE_PMD_MASK;
pte_t *pte;
if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
if (unlikely(anon_vma_prepare(vma)))
if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
return VM_FAULT_FALLBACK;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma)))
return VM_FAULT_OOM;
if (!(flags & FAULT_FLAG_WRITE) &&
transparent_hugepage_use_zero_page()) {
pgtable_t pgtable;
struct page *zero_page;
bool set;
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma)))
return VM_FAULT_OOM;
if (!(flags & FAULT_FLAG_WRITE) &&
transparent_hugepage_use_zero_page()) {
pgtable_t pgtable;
struct page *zero_page;
bool set;
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
zero_page = get_huge_zero_page();
if (unlikely(!zero_page)) {
pte_free(mm, pgtable);
count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
spin_lock(&mm->page_table_lock);
set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
zero_page);
spin_unlock(&mm->page_table_lock);
if (!set) {
pte_free(mm, pgtable);
put_huge_zero_page();
}
return 0;
}
page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
vma, haddr, numa_node_id(), 0);
if (unlikely(!page)) {
zero_page = get_huge_zero_page();
if (unlikely(!zero_page)) {
pte_free(mm, pgtable);
count_vm_event(THP_FAULT_FALLBACK);
goto out;
return VM_FAULT_FALLBACK;
}
count_vm_event(THP_FAULT_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
put_page(page);
goto out;
spin_lock(&mm->page_table_lock);
set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
zero_page);
spin_unlock(&mm->page_table_lock);
if (!set) {
pte_free(mm, pgtable);
put_huge_zero_page();
}
if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
page))) {
mem_cgroup_uncharge_page(page);
put_page(page);
goto out;
}
return 0;
}
out:
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
if (unlikely(pmd_none(*pmd)) &&
unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
return 0;
/*
* A regular pmd is established and it can't morph into a huge pmd
* from under us anymore at this point because we hold the mmap_sem
* read mode and khugepaged takes it in write mode. So now it's
* safe to run pte_offset_map().
*/
pte = pte_offset_map(pmd, address);
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
vma, haddr, numa_node_id(), 0);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
mem_cgroup_uncharge_page(page);
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
count_vm_event(THP_FAULT_ALLOC);
return 0;
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@ -1170,7 +1150,6 @@ alloc:
new_page = NULL;
if (unlikely(!new_page)) {
count_vm_event(THP_FAULT_FALLBACK);
if (is_huge_zero_pmd(orig_pmd)) {
ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
address, pmd, orig_pmd, haddr);
@ -1181,9 +1160,9 @@ alloc:
split_huge_page(page);
put_page(page);
}
count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
count_vm_event(THP_FAULT_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
@ -1191,10 +1170,13 @@ alloc:
split_huge_page(page);
put_page(page);
}
count_vm_event(THP_FAULT_FALLBACK);
ret |= VM_FAULT_OOM;
goto out;
}
count_vm_event(THP_FAULT_ALLOC);
if (is_huge_zero_pmd(orig_pmd))
clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
else
@ -1215,7 +1197,8 @@ alloc:
goto out_mn;
} else {
pmd_t entry;
entry = mk_huge_pmd(new_page, vma);
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
pmdp_clear_flush(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
@ -1666,7 +1649,6 @@ static void __split_huge_page_refcount(struct page *page,
BUG_ON(atomic_read(&page->_count) <= 0);
__mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
ClearPageCompound(page);
compound_unlock(page);
@ -2364,7 +2346,8 @@ static void collapse_huge_page(struct mm_struct *mm,
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
_pmd = mk_huge_pmd(new_page, vma);
_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
/*
* spin_lock() below is not the equivalent of smp_wmb(), so

File diff suppressed because it is too large Load Diff

View File

@ -3695,7 +3695,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
int handle_pte_fault(struct mm_struct *mm,
static int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
{
@ -3754,22 +3754,14 @@ unlock:
/*
* By the time we get here, we already hold the mm semaphore
*/
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
mem_cgroup_count_vm_event(mm, PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
@ -3782,9 +3774,12 @@ retry:
if (!pmd)
return VM_FAULT_OOM;
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
int ret = VM_FAULT_FALLBACK;
if (!vma->vm_ops)
return do_huge_pmd_anonymous_page(mm, vma, address,
pmd, flags);
ret = do_huge_pmd_anonymous_page(mm, vma, address,
pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
pmd_t orig_pmd = *pmd;
int ret;
@ -3850,6 +3845,37 @@ retry:
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
int ret;
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
mem_cgroup_count_vm_event(mm, PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
/*
* Enable the memcg OOM handling for faults triggered in user
* space. Kernel faults are handled more gracefully.
*/
if (flags & FAULT_FLAG_USER)
mem_cgroup_enable_oom();
ret = __handle_mm_fault(mm, vma, address, flags);
if (flags & FAULT_FLAG_USER)
mem_cgroup_disable_oom();
if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
mem_cgroup_oom_synchronize();
return ret;
}
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.

View File

@ -678,9 +678,12 @@ out:
*/
void pagefault_out_of_memory(void)
{
struct zonelist *zonelist = node_zonelist(first_online_node,
GFP_KERNEL);
struct zonelist *zonelist;
if (mem_cgroup_oom_synchronize())
return;
zonelist = node_zonelist(first_online_node, GFP_KERNEL);
if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
out_of_memory(NULL, 0, 0, NULL, false);
clear_zonelist_oom(zonelist, GFP_KERNEL);

View File

@ -2143,11 +2143,17 @@ EXPORT_SYMBOL(account_page_dirtied);
/*
* Helper function for set_page_writeback family.
*
* The caller must hold mem_cgroup_begin/end_update_page_stat() lock
* while calling this function.
* See test_set_page_writeback for example.
*
* NOTE: Unlike account_page_dirtied this does not rely on being atomic
* wrt interrupts.
*/
void account_page_writeback(struct page *page)
{
mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
inc_zone_page_state(page, NR_WRITEBACK);
}
EXPORT_SYMBOL(account_page_writeback);
@ -2364,7 +2370,10 @@ int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
int ret;
bool locked;
unsigned long memcg_flags;
mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
if (mapping) {
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
@ -2385,9 +2394,11 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
return ret;
}
@ -2395,7 +2406,10 @@ int test_set_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
int ret;
bool locked;
unsigned long memcg_flags;
mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
if (mapping) {
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
@ -2422,6 +2436,7 @@ int test_set_page_writeback(struct page *page)
}
if (!ret)
account_page_writeback(page);
mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
return ret;
}

View File

@ -1052,11 +1052,11 @@ void do_page_add_anon_rmap(struct page *page,
{
int first = atomic_inc_and_test(&page->_mapcount);
if (first) {
if (!PageTransHuge(page))
__inc_zone_page_state(page, NR_ANON_PAGES);
else
if (PageTransHuge(page))
__inc_zone_page_state(page,
NR_ANON_TRANSPARENT_HUGEPAGES);
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
hpage_nr_pages(page));
}
if (unlikely(PageKsm(page)))
return;
@ -1085,10 +1085,10 @@ void page_add_new_anon_rmap(struct page *page,
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
SetPageSwapBacked(page);
atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
if (!PageTransHuge(page))
__inc_zone_page_state(page, NR_ANON_PAGES);
else
if (PageTransHuge(page))
__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
hpage_nr_pages(page));
__page_set_anon_rmap(page, vma, address, 1);
if (!mlocked_vma_newpage(vma, page)) {
SetPageActive(page);
@ -1111,7 +1111,7 @@ void page_add_file_rmap(struct page *page)
mem_cgroup_begin_update_page_stat(page, &locked, &flags);
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
}
mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
@ -1148,14 +1148,14 @@ void page_remove_rmap(struct page *page)
goto out;
if (anon) {
mem_cgroup_uncharge_page(page);
if (!PageTransHuge(page))
__dec_zone_page_state(page, NR_ANON_PAGES);
else
if (PageTransHuge(page))
__dec_zone_page_state(page,
NR_ANON_TRANSPARENT_HUGEPAGES);
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
-hpage_nr_pages(page));
} else {
__dec_zone_page_state(page, NR_FILE_MAPPED);
mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
if (unlikely(PageMlocked(page)))

View File

@ -432,6 +432,11 @@ static void activate_page_drain(int cpu)
pagevec_lru_move_fn(pvec, __activate_page, NULL);
}
static bool need_activate_page_drain(int cpu)
{
return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
}
void activate_page(struct page *page)
{
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@ -449,6 +454,11 @@ static inline void activate_page_drain(int cpu)
{
}
static bool need_activate_page_drain(int cpu)
{
return false;
}
void activate_page(struct page *page)
{
struct zone *zone = page_zone(page);
@ -701,12 +711,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
lru_add_drain();
}
/*
* Returns 0 for success
*/
int lru_add_drain_all(void)
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
void lru_add_drain_all(void)
{
return schedule_on_each_cpu(lru_add_drain_per_cpu);
static DEFINE_MUTEX(lock);
static struct cpumask has_work;
int cpu;
mutex_lock(&lock);
get_online_cpus();
cpumask_clear(&has_work);
for_each_online_cpu(cpu) {
struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
schedule_work_on(cpu, work);
cpumask_set_cpu(cpu, &has_work);
}
}
for_each_cpu(cpu, &has_work)
flush_work(&per_cpu(lru_add_drain_work, cpu));
put_online_cpus();
mutex_unlock(&lock);
}
/*

View File

@ -567,7 +567,6 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
/**
* truncate_pagecache - unmap and remove pagecache that has been truncated
* @inode: inode
* @oldsize: old file size
* @newsize: new file size
*
* inode's new i_size must already be written before truncate_pagecache
@ -580,7 +579,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
* situations such as writepage being called for a page that has already
* had its underlying blocks deallocated.
*/
void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize)
void truncate_pagecache(struct inode *inode, loff_t newsize)
{
struct address_space *mapping = inode->i_mapping;
loff_t holebegin = round_up(newsize, PAGE_SIZE);
@ -614,12 +613,8 @@ EXPORT_SYMBOL(truncate_pagecache);
*/
void truncate_setsize(struct inode *inode, loff_t newsize)
{
loff_t oldsize;
oldsize = inode->i_size;
i_size_write(inode, newsize);
truncate_pagecache(inode, oldsize, newsize);
truncate_pagecache(inode, newsize);
}
EXPORT_SYMBOL(truncate_setsize);

View File

@ -139,11 +139,23 @@ static bool global_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup;
}
static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
{
struct mem_cgroup *root = sc->target_mem_cgroup;
return !mem_cgroup_disabled() &&
mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE;
}
#else
static bool global_reclaim(struct scan_control *sc)
{
return true;
}
static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
{
return false;
}
#endif
unsigned long zone_reclaimable_pages(struct zone *zone)
@ -2164,9 +2176,11 @@ static inline bool should_continue_reclaim(struct zone *zone,
}
}
static void shrink_zone(struct zone *zone, struct scan_control *sc)
static int
__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
{
unsigned long nr_reclaimed, nr_scanned;
int groups_scanned = 0;
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
@ -2174,15 +2188,17 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
.zone = zone,
.priority = sc->priority,
};
struct mem_cgroup *memcg;
struct mem_cgroup *memcg = NULL;
mem_cgroup_iter_filter filter = (soft_reclaim) ?
mem_cgroup_soft_reclaim_eligible : NULL;
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
struct lruvec *lruvec;
groups_scanned++;
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
shrink_lruvec(lruvec, sc);
@ -2202,8 +2218,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
mem_cgroup_iter_break(root, memcg);
break;
}
memcg = mem_cgroup_iter(root, memcg, &reclaim);
} while (memcg);
}
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
sc->nr_scanned - nr_scanned,
@ -2211,6 +2226,37 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
return groups_scanned;
}
static void shrink_zone(struct zone *zone, struct scan_control *sc)
{
bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
unsigned long nr_scanned = sc->nr_scanned;
int scanned_groups;
scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim);
/*
* memcg iterator might race with other reclaimer or start from
* a incomplete tree walk so the tree walk in __shrink_zone
* might have missed groups that are above the soft limit. Try
* another loop to catch up with others. Do it just once to
* prevent from reclaim latencies when other reclaimers always
* preempt this one.
*/
if (do_soft_reclaim && !scanned_groups)
__shrink_zone(zone, sc, do_soft_reclaim);
/*
* No group is over the soft limit or those that are do not have
* pages in the zone we are reclaiming so we have to reclaim everybody
*/
if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) {
__shrink_zone(zone, sc, false);
return;
}
}
/* Returns true if compaction should go ahead for a high-order request */
@ -2274,8 +2320,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
bool aborted_reclaim = false;
/*
@ -2315,18 +2359,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
continue;
}
}
/*
* This steals pages from memory cgroups over softlimit
* and returns the number of reclaimed pages and
* scanned pages. This works for global memory pressure
* and balancing, not for a memcg's limit.
*/
nr_soft_scanned = 0;
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
sc->order, sc->gfp_mask,
&nr_soft_scanned);
sc->nr_reclaimed += nr_soft_reclaimed;
sc->nr_scanned += nr_soft_scanned;
/* need some check for avoid more shrink_zone() */
}
@ -2920,8 +2952,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
{
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.priority = DEF_PRIORITY,
@ -3036,15 +3066,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
sc.nr_scanned = 0;
nr_soft_scanned = 0;
/*
* Call soft limit reclaim before calling shrink_zone.
*/
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
order, sc.gfp_mask,
&nr_soft_scanned);
sc.nr_reclaimed += nr_soft_reclaimed;
/*
* There should be no need to raise the scanning
* priority if enough pages are already being scanned

View File

@ -87,8 +87,8 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
if (!cg_proto)
return -EINVAL;
if (val > RESOURCE_MAX)
val = RESOURCE_MAX;
if (val > RES_COUNTER_MAX)
val = RES_COUNTER_MAX;
tcp = tcp_from_cgproto(cg_proto);
@ -101,9 +101,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
net->ipv4.sysctl_tcp_mem[i]);
if (val == RESOURCE_MAX)
if (val == RES_COUNTER_MAX)
clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
else if (val != RESOURCE_MAX) {
else if (val != RES_COUNTER_MAX) {
/*
* The active bit needs to be written after the static_key
* update. This is what guarantees that the socket activation
@ -187,7 +187,7 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
switch (cft->private) {
case RES_LIMIT:
val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX);
val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX);
break;
case RES_USAGE:
val = tcp_read_usage(memcg);