From a2ef990ab5a6705a356d146dd773a3b359787497 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Thu, 12 Jan 2012 17:17:08 -0800 Subject: [PATCH 01/96] proc: fix null pointer deref in proc_pid_permission() get_proc_task() can fail to search the task and return NULL, put_task_struct() will then bomb the kernel with following oops: BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 IP: [] proc_pid_permission+0x64/0xe0 PGD 112075067 PUD 112814067 PMD 0 Oops: 0002 [#1] PREEMPT SMP This is a regression introduced by commit 0499680a ("procfs: add hidepid= and gid= mount options"). The kernel should return -ESRCH if get_proc_task() failed. Signed-off-by: Xiaotian Feng Cc: Al Viro Cc: Vasiliy Kulikov Cc: Stephen Wilson Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index 8173dfd89cb2..5485a5388ecb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -654,6 +654,8 @@ static int proc_pid_permission(struct inode *inode, int mask) bool has_perms; task = get_proc_task(inode); + if (!task) + return -ESRCH; has_perms = has_pid_permissions(pid, task, 1); put_task_struct(task); From efeb156e7275c5b6c6e0f96aceb3c6abf98fc392 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 12 Jan 2012 17:17:11 -0800 Subject: [PATCH 02/96] kprobes: silence DEBUG_STRICT_USER_COPY_CHECKS=y warning Enabling DEBUG_STRICT_USER_COPY_CHECKS causes the following warning: In file included from arch/x86/include/asm/uaccess.h:573, from kernel/kprobes.c:55: In function 'copy_from_user', inlined from 'write_enabled_file_bool' at kernel/kprobes.c:2191: arch/x86/include/asm/uaccess_64.h:65: warning: call to 'copy_from_user_overflow' declared with attribute warning: copy_from_user() buffer size is not provably correct presumably due to buf_size being signed causing GCC to fail to see that buf_size can't become negative. Signed-off-by: Stephen Boyd Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Acked-by: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e5d84644823b..95dd7212e610 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buf[32]; - int buf_size; + size_t buf_size; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) From 4da47859956cebdc4c58c38a931e21847458d744 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 12 Jan 2012 17:17:13 -0800 Subject: [PATCH 03/96] kernel.h: neaten panic prototype Use __printf macro. Convert NORET_AND to ATTRIB_NORET. Use the normal kernel style for pointer arguments. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d0a7a0c71661..60934395e36c 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -185,8 +185,9 @@ static inline void might_fault(void) extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); -NORET_TYPE void panic(const char * fmt, ...) 
- __attribute__ ((NORET_AND format (printf, 1, 2))) __cold; +NORET_TYPE __printf(1, 2) +void panic(const char *fmt, ...) + ATTRIB_NORET __cold; extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); From 80bf007f20b16272f210e0803f739f5606cff59d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 12 Jan 2012 17:17:14 -0800 Subject: [PATCH 04/96] include/linux/linkage.h: remove unused NORET_AND macro The only use in kernel.h is gone so remove the macro. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/linkage.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 3f46aedea42f..c75074cb8ad4 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -90,6 +90,5 @@ #define NORET_TYPE /**/ #define ATTRIB_NORET __attribute__((noreturn)) -#define NORET_AND noreturn, #endif From 9402c95f34a66e81eba473a2f7267bbae5a1dee2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 12 Jan 2012 17:17:17 -0800 Subject: [PATCH 05/96] treewide: remove useless NORET_TYPE macro and uses It's a very old and now unused prototype marking so just delete it. Neaten panic pointer argument style to keep checkpatch quiet. Signed-off-by: Joe Perches Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Tony Luck Cc: Fenghua Yu Acked-by: Geert Uytterhoeven Acked-by: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/avr32/include/asm/system.h | 2 +- arch/avr32/kernel/traps.c | 2 +- arch/ia64/kernel/machine_kexec.c | 2 +- arch/m68k/amiga/config.c | 2 +- arch/mips/include/asm/ptrace.h | 2 +- arch/mips/kernel/traps.c | 2 +- arch/powerpc/kernel/machine_kexec_32.c | 2 +- arch/powerpc/kernel/machine_kexec_64.c | 6 +++--- arch/s390/kernel/nmi.c | 2 +- arch/tile/kernel/machine_kexec.c | 4 ++-- include/linux/kernel.h | 6 +++--- include/linux/linkage.h | 1 - include/linux/sched.h | 2 +- kernel/exit.c | 6 +++--- kernel/panic.c | 2 +- 15 files changed, 21 insertions(+), 22 deletions(-) diff --git a/arch/avr32/include/asm/system.h b/arch/avr32/include/asm/system.h index 9702c2213e1e..62d9ded01635 100644 --- a/arch/avr32/include/asm/system.h +++ b/arch/avr32/include/asm/system.h @@ -169,7 +169,7 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr, #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n)) struct pt_regs; -void NORET_TYPE die(const char *str, struct pt_regs *regs, long err); +void die(const char *str, struct pt_regs *regs, long err); void _exception(long signr, struct pt_regs *regs, int code, unsigned long addr); diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c index 7aa25756412f..3d760c06f024 100644 --- a/arch/avr32/kernel/traps.c +++ b/arch/avr32/kernel/traps.c @@ -24,7 +24,7 @@ static DEFINE_SPINLOCK(die_lock); -void NORET_TYPE die(const char *str, struct pt_regs *regs, long err) +void die(const char *str, struct pt_regs *regs, long err) { static int die_counter; diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c index 3d3aeef46947..581a16d5e85b 100644 --- a/arch/ia64/kernel/machine_kexec.c +++ b/arch/ia64/kernel/machine_kexec.c @@ -27,7 +27,7 @@ #include #include -typedef NORET_TYPE void (*relocate_new_kernel_t)( +typedef void (*relocate_new_kernel_t)( unsigned long indirection_page, unsigned long start_address, 
struct ia64_boot_param *boot_param, diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c index 82a4bb51d5d8..a3b0558328b6 100644 --- a/arch/m68k/amiga/config.c +++ b/arch/m68k/amiga/config.c @@ -511,7 +511,7 @@ static unsigned long amiga_gettimeoffset(void) return ticks + offset; } -static NORET_TYPE void amiga_reset(void) +static void amiga_reset(void) ATTRIB_NORET; static void amiga_reset(void) diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index de39b1f343ea..3d913259e507 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -144,7 +144,7 @@ extern int ptrace_set_watch_regs(struct task_struct *child, extern asmlinkage void syscall_trace_enter(struct pt_regs *regs); extern asmlinkage void syscall_trace_leave(struct pt_regs *regs); -extern NORET_TYPE void die(const char *, struct pt_regs *) ATTRIB_NORET; +extern void die(const char *, struct pt_regs *) ATTRIB_NORET; static inline void die_if_kernel(const char *str, struct pt_regs *regs) { diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 5c8a49d55054..725e9a5ca966 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -1340,7 +1340,7 @@ void ejtag_exception_handler(struct pt_regs *regs) /* * NMI exception handler. */ -NORET_TYPE void ATTRIB_NORET nmi_exception_handler(struct pt_regs *regs) +void ATTRIB_NORET nmi_exception_handler(struct pt_regs *regs) { bust_spinlocks(1); printk("NMI taken!!!!\n"); diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c index e63f2e7d2efb..026e7f153949 100644 --- a/arch/powerpc/kernel/machine_kexec_32.c +++ b/arch/powerpc/kernel/machine_kexec_32.c @@ -16,7 +16,7 @@ #include #include -typedef NORET_TYPE void (*relocate_new_kernel_t)( +typedef void (*relocate_new_kernel_t)( unsigned long indirection_page, unsigned long reboot_code_buffer, unsigned long start_address) ATTRIB_NORET; diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 26ccbf77dd41..5fbbf814923a 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -307,9 +307,9 @@ static union thread_union kexec_stack __init_task_data = struct paca_struct kexec_paca; /* Our assembly helper, in kexec_stub.S */ -extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start, - void *image, void *control, - void (*clear_all)(void)) ATTRIB_NORET; +extern void kexec_sequence(void *newstack, unsigned long start, + void *image, void *control, + void (*clear_all)(void)) ATTRIB_NORET; /* too late to fail here */ void default_machine_kexec(struct kimage *image) diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index fab88431a06f..0fd2e863e114 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -30,7 +30,7 @@ struct mcck_struct { static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck); -static NORET_TYPE void s390_handle_damage(char *msg) +static void s390_handle_damage(char *msg) { smp_send_stop(); disabled_wait((unsigned long) __builtin_return_address(0)); diff --git a/arch/tile/kernel/machine_kexec.c b/arch/tile/kernel/machine_kexec.c index e00d7179989e..b0c907059067 100644 --- a/arch/tile/kernel/machine_kexec.c +++ b/arch/tile/kernel/machine_kexec.c @@ -248,10 +248,10 @@ static void setup_quasi_va_is_pa(void) } -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { void *reboot_code_buffer; - NORET_TYPE void (*rnk)(unsigned long, void *, unsigned long) + 
void (*rnk)(unsigned long, void *, unsigned long) ATTRIB_NORET; /* Mask all interrupts before starting to reboot. */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 60934395e36c..aaf1753dd2b3 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -185,16 +185,16 @@ static inline void might_fault(void) extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); -NORET_TYPE __printf(1, 2) +__printf(1, 2) void panic(const char *fmt, ...) ATTRIB_NORET __cold; extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); extern int oops_may_print(void); -NORET_TYPE void do_exit(long error_code) +void do_exit(long error_code) ATTRIB_NORET; -NORET_TYPE void complete_and_exit(struct completion *, long) +void complete_and_exit(struct completion *, long) ATTRIB_NORET; /* Internal, do not use. */ diff --git a/include/linux/linkage.h b/include/linux/linkage.h index c75074cb8ad4..6a8f252e49ee 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -88,7 +88,6 @@ #endif -#define NORET_TYPE /**/ #define ATTRIB_NORET __attribute__((noreturn)) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 21cd0303af51..4032ec1cf836 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2275,7 +2275,7 @@ extern void __cleanup_sighand(struct sighand_struct *); extern void exit_itimers(struct signal_struct *); extern void flush_itimer_signals(void); -extern NORET_TYPE void do_group_exit(int); +extern void do_group_exit(int); extern void daemonize(const char *, ...); extern int allow_signal(int); diff --git a/kernel/exit.c b/kernel/exit.c index 94ed6e20bb53..c44738267be7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -887,7 +887,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -NORET_TYPE void do_exit(long code) +void do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -1051,7 +1051,7 @@ NORET_TYPE void do_exit(long code) EXPORT_SYMBOL_GPL(do_exit); -NORET_TYPE void complete_and_exit(struct completion *comp, long code) +void complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); @@ -1070,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code) * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ -NORET_TYPE void +void do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; diff --git a/kernel/panic.c b/kernel/panic.c index 3458469eb7c3..6fd09ed6fd90 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -57,7 +57,7 @@ EXPORT_SYMBOL(panic_blink); * * This function never returns. */ -NORET_TYPE void panic(const char * fmt, ...) +void panic(const char *fmt, ...) { static char buf[1024]; va_list args; From ff2d8b19a3a62559afba1c53360c8577a7697714 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 12 Jan 2012 17:17:21 -0800 Subject: [PATCH 06/96] treewide: convert uses of ATTRIB_NORETURN to __noreturn Use the more commonly used __noreturn instead of ATTRIB_NORETURN. 
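For illustration, every conversion in this patch follows the same mechanical pattern; a representative hunk (from arch/mips/include/asm/ptrace.h) is reproduced here only as an example, the full diff below is authoritative:

    -extern void die(const char *, struct pt_regs *) ATTRIB_NORET;
    +extern void die(const char *, struct pt_regs *) __noreturn;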
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Joe Perches Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Tony Luck Cc: Fenghua Yu Acked-by: Geert Uytterhoeven Acked-by: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/machine_kexec.c | 2 +- arch/m68k/amiga/config.c | 3 +-- arch/mips/include/asm/ptrace.h | 2 +- arch/mips/kernel/traps.c | 2 +- arch/mn10300/include/asm/exceptions.h | 2 +- arch/powerpc/kernel/machine_kexec_32.c | 2 +- arch/powerpc/kernel/machine_kexec_64.c | 2 +- arch/s390/include/asm/processor.h | 2 +- arch/sh/kernel/process_32.c | 2 +- arch/sh/kernel/process_64.c | 2 +- arch/tile/kernel/machine_kexec.c | 2 +- include/linux/kernel.h | 6 +++--- 12 files changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c index 581a16d5e85b..4eed35814994 100644 --- a/arch/ia64/kernel/machine_kexec.c +++ b/arch/ia64/kernel/machine_kexec.c @@ -31,7 +31,7 @@ typedef void (*relocate_new_kernel_t)( unsigned long indirection_page, unsigned long start_address, struct ia64_boot_param *boot_param, - unsigned long pal_addr) ATTRIB_NORET; + unsigned long pal_addr) __noreturn; struct kimage *ia64_kimage; diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c index a3b0558328b6..b95a451b1c3a 100644 --- a/arch/m68k/amiga/config.c +++ b/arch/m68k/amiga/config.c @@ -511,8 +511,7 @@ static unsigned long amiga_gettimeoffset(void) return ticks + offset; } -static void amiga_reset(void) - ATTRIB_NORET; +static void amiga_reset(void) __noreturn; static void amiga_reset(void) { diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index 3d913259e507..7b99c670e478 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -144,7 +144,7 @@ extern int ptrace_set_watch_regs(struct task_struct *child, extern asmlinkage void syscall_trace_enter(struct pt_regs *regs); extern asmlinkage void syscall_trace_leave(struct pt_regs *regs); -extern void die(const char *, struct pt_regs *) ATTRIB_NORET; +extern void die(const char *, struct pt_regs *) __noreturn; static inline void die_if_kernel(const char *str, struct pt_regs *regs) { diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 725e9a5ca966..bbddb86c1fa1 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -1340,7 +1340,7 @@ void ejtag_exception_handler(struct pt_regs *regs) /* * NMI exception handler. 
*/ -void ATTRIB_NORET nmi_exception_handler(struct pt_regs *regs) +void __noreturn nmi_exception_handler(struct pt_regs *regs) { bust_spinlocks(1); printk("NMI taken!!!!\n"); diff --git a/arch/mn10300/include/asm/exceptions.h b/arch/mn10300/include/asm/exceptions.h index ca3e20508c77..95a4d42c3a06 100644 --- a/arch/mn10300/include/asm/exceptions.h +++ b/arch/mn10300/include/asm/exceptions.h @@ -110,7 +110,7 @@ extern asmlinkage void nmi_handler(void); extern asmlinkage void misalignment(struct pt_regs *, enum exception_code); extern void die(const char *, struct pt_regs *, enum exception_code) - ATTRIB_NORET; + __noreturn; extern int die_if_no_fixup(const char *, struct pt_regs *, enum exception_code); diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c index 026e7f153949..affe5dcce7f4 100644 --- a/arch/powerpc/kernel/machine_kexec_32.c +++ b/arch/powerpc/kernel/machine_kexec_32.c @@ -19,7 +19,7 @@ typedef void (*relocate_new_kernel_t)( unsigned long indirection_page, unsigned long reboot_code_buffer, - unsigned long start_address) ATTRIB_NORET; + unsigned long start_address) __noreturn; /* * This is a generic machine_kexec function suitable at least for diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 5fbbf814923a..d7f609086a99 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -309,7 +309,7 @@ struct paca_struct kexec_paca; /* Our assembly helper, in kexec_stub.S */ extern void kexec_sequence(void *newstack, unsigned long start, void *image, void *control, - void (*clear_all)(void)) ATTRIB_NORET; + void (*clear_all)(void)) __noreturn; /* too late to fail here */ void default_machine_kexec(struct kimage *image) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 27272f6a14c2..d25843a6a915 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -236,7 +236,7 @@ static inline unsigned long __rewind_psw(psw_t psw, unsigned long ilc) /* * Function to drop a processor into disabled wait state */ -static inline void ATTRIB_NORET disabled_wait(unsigned long code) +static inline void __noreturn disabled_wait(unsigned long code) { unsigned long ctl_buf; psw_t dw_psw; diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index aaf6d59c2012..7ec665178125 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -70,7 +70,7 @@ void show_regs(struct pt_regs * regs) /* * Create a kernel thread */ -ATTRIB_NORET void kernel_thread_helper(void *arg, int (*fn)(void *)) +__noreturn void kernel_thread_helper(void *arg, int (*fn)(void *)) { do_exit(fn(arg)); } diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index 210c1cabcb7f..cbd4e4bb9fc5 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -285,7 +285,7 @@ void show_regs(struct pt_regs *regs) /* * Create a kernel thread */ -ATTRIB_NORET void kernel_thread_helper(void *arg, int (*fn)(void *)) +__noreturn void kernel_thread_helper(void *arg, int (*fn)(void *)) { do_exit(fn(arg)); } diff --git a/arch/tile/kernel/machine_kexec.c b/arch/tile/kernel/machine_kexec.c index b0c907059067..6255f2eab112 100644 --- a/arch/tile/kernel/machine_kexec.c +++ b/arch/tile/kernel/machine_kexec.c @@ -252,7 +252,7 @@ void machine_kexec(struct kimage *image) { void *reboot_code_buffer; void (*rnk)(unsigned long, void *, unsigned long) - ATTRIB_NORET; + __noreturn; /* Mask all interrupts 
before starting to reboot. */ interrupt_mask_set_mask(~0ULL); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index aaf1753dd2b3..e8343422240a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -187,15 +187,15 @@ extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); __printf(1, 2) void panic(const char *fmt, ...) - ATTRIB_NORET __cold; + __noreturn __cold; extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); extern int oops_may_print(void); void do_exit(long error_code) - ATTRIB_NORET; + __noreturn; void complete_and_exit(struct completion *, long) - ATTRIB_NORET; + __noreturn; /* Internal, do not use. */ int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); From 0d259cf8190b9c446eefd5225ffcc3941e76a432 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 12 Jan 2012 17:17:25 -0800 Subject: [PATCH 07/96] include/linux/linkage.h: remove unused ATTRIB_NORET macro The uses have been renamed so delete the unused macro. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/linkage.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 6a8f252e49ee..807f1e533226 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -88,6 +88,4 @@ #endif -#define ATTRIB_NORET __attribute__((noreturn)) - #endif From 43570fd2f47ba518145e9289f54cde3dba4c8b25 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 12 Jan 2012 17:17:27 -0800 Subject: [PATCH 08/96] mm,slub,x86: decouple size of struct page from CONFIG_CMPXCHG_LOCAL While implementing cmpxchg_double() on s390 I realized that we don't set CONFIG_CMPXCHG_LOCAL despite the fact that we have support for it. However setting that option will increase the size of struct page by eight bytes on 64 bit, which we certainly do not want. Also, it doesn't make sense that a present cpu feature should increase the size of struct page. Besides that it looks like the dependency to CMPXCHG_LOCAL is wrong and that it should depend on CMPXCHG_DOUBLE instead. This patch: If an architecture supports CMPXCHG_LOCAL this shouldn't result automatically in larger struct pages if the SLUB allocator is used. Instead introduce a new config option "HAVE_ALIGNED_STRUCT_PAGE" which can be selected if a double word aligned struct page is required. Also update x86 Kconfig so that it should work as before. Signed-off-by: Heiko Carstens Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 8 ++++++++ arch/x86/Kconfig | 1 + include/linux/mm_types.h | 9 ++++----- mm/slub.c | 6 +++--- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 2505740b81d2..a2c5c077c32d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -185,4 +185,12 @@ config HAVE_RCU_TABLE_FREE config ARCH_HAVE_NMI_SAFE_CMPXCHG bool +config HAVE_ALIGNED_STRUCT_PAGE + bool + help + This makes sure that struct pages are double word aligned and that + e.g. the SLUB allocator can perform double word atomic operations + on a struct page for better performance. However selecting this + might increase the size of a struct page by a word. 
+ source "kernel/gcov/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a150f4c35e94..5201a2c27239 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,6 +60,7 @@ config X86 select PERF_EVENTS select HAVE_PERF_EVENTS_NMI select ANON_INODES + select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5b42f1b34eb7..3cc3062b3767 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -151,12 +151,11 @@ struct page { #endif } /* - * If another subsystem starts using the double word pairing for atomic - * operations on struct page then it must change the #if to ensure - * proper alignment of the page struct. + * The struct page can be forced to be double word aligned so that atomic ops + * on double words work. The SLUB allocator can make use of such a feature. */ -#if defined(CONFIG_SLUB) && defined(CONFIG_CMPXCHG_LOCAL) - __attribute__((__aligned__(2*sizeof(unsigned long)))) +#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE + __aligned(2 * sizeof(unsigned long)) #endif ; diff --git a/mm/slub.c b/mm/slub.c index 5d37b5e44140..72aa84134609 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -366,7 +366,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page const char *n) { VM_BUG_ON(!irqs_disabled()); -#ifdef CONFIG_CMPXCHG_DOUBLE +#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, @@ -400,7 +400,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_new, unsigned long counters_new, const char *n) { -#ifdef CONFIG_CMPXCHG_DOUBLE +#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, @@ -3014,7 +3014,7 @@ static int kmem_cache_open(struct kmem_cache *s, } } -#ifdef CONFIG_CMPXCHG_DOUBLE +#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) /* Enable fast mode */ s->flags |= __CMPXCHG_DOUBLE; From 4156153c4daddf12dd386016f96a947a01e93bf4 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 12 Jan 2012 17:17:30 -0800 Subject: [PATCH 09/96] mm,x86,um: move CMPXCHG_LOCAL config option Move CMPXCHG_LOCAL and rename it to HAVE_CMPXCHG_LOCAL so architectures can simply select the option if it is supported. Signed-off-by: Heiko Carstens Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 1 + arch/x86/Kconfig.cpu | 3 --- arch/x86/um/Kconfig | 4 ---- mm/vmstat.c | 2 +- 5 files changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index a2c5c077c32d..22182a8cc62c 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -193,4 +193,7 @@ config HAVE_ALIGNED_STRUCT_PAGE on a struct page for better performance. However selecting this might increase the size of a struct page by a word. 
+config HAVE_CMPXCHG_LOCAL + bool + source "kernel/gcov/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5201a2c27239..59717fd17bc7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -61,6 +61,7 @@ config X86 select HAVE_PERF_EVENTS_NMI select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 + select HAVE_CMPXCHG_LOCAL if !M386 select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index e3ca7e0d858c..99d2ab8b7795 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -309,9 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT config X86_CMPXCHG def_bool X86_64 || (X86_32 && !M386) -config CMPXCHG_LOCAL - def_bool X86_64 || (X86_32 && !M386) - config CMPXCHG_DOUBLE def_bool y diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 1d97bd84b6fb..a62bfc66239e 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -6,10 +6,6 @@ menu "UML-specific options" menu "Host processor type and features" -config CMPXCHG_LOCAL - bool - default n - config CMPXCHG_DOUBLE bool default n diff --git a/mm/vmstat.c b/mm/vmstat.c index 8fd603b1665e..f600557a7659 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -295,7 +295,7 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) } EXPORT_SYMBOL(__dec_zone_page_state); -#ifdef CONFIG_CMPXCHG_LOCAL +#ifdef CONFIG_HAVE_CMPXCHG_LOCAL /* * If we have cmpxchg_local support then we do not need to incur the overhead * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. From 2565409fc0303f3ab8d66b8326702a687962a29b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 12 Jan 2012 17:17:33 -0800 Subject: [PATCH 10/96] mm,x86,um: move CMPXCHG_DOUBLE config option Move CMPXCHG_DOUBLE and rename it to HAVE_CMPXCHG_DOUBLE so architectures can simply select the option if it is supported. Signed-off-by: Heiko Carstens Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 1 + arch/x86/Kconfig.cpu | 3 --- arch/x86/um/Kconfig | 4 ---- mm/slub.c | 9 ++++++--- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 22182a8cc62c..4f55c736be11 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -196,4 +196,7 @@ config HAVE_ALIGNED_STRUCT_PAGE config HAVE_CMPXCHG_LOCAL bool +config HAVE_CMPXCHG_DOUBLE + bool + source "kernel/gcov/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 59717fd17bc7..6c14ecd851d0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -62,6 +62,7 @@ config X86 select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_CMPXCHG_LOCAL if !M386 + select HAVE_CMPXCHG_DOUBLE select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 99d2ab8b7795..3c57033e2211 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -309,9 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT config X86_CMPXCHG def_bool X86_64 || (X86_32 && !M386) -config CMPXCHG_DOUBLE - def_bool y - config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index a62bfc66239e..b2b54d2edf53 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -6,10 +6,6 @@ menu "UML-specific options" menu "Host processor type and features" -config CMPXCHG_DOUBLE - bool - default n - source "arch/x86/Kconfig.cpu" endmenu diff --git a/mm/slub.c b/mm/slub.c index 72aa84134609..4907563ef7ff 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -366,7 +366,8 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page const char *n) { VM_BUG_ON(!irqs_disabled()); -#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, @@ -400,7 +401,8 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_new, unsigned long counters_new, const char *n) { -#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, @@ -3014,7 +3016,8 @@ static int kmem_cache_open(struct kmem_cache *s, } } -#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) /* Enable fast mode */ s->flags |= __CMPXCHG_DOUBLE; From 08346bf8051c62f92f132eff267e524a7165ee0c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 12 Jan 2012 17:17:34 -0800 Subject: [PATCH 11/96] drivers/video/nvidia/nvidia.c: fix warning Fix the int/bool confusion in there. 
drivers/video/nvidia/nvidia.c:1602: warning: return from incompatible pointer type Cc: Florian Tobias Schandinat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/nvidia/nvidia.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c index 081dc4745274..fe13ac567d54 100644 --- a/drivers/video/nvidia/nvidia.c +++ b/drivers/video/nvidia/nvidia.c @@ -81,7 +81,7 @@ static int vram __devinitdata = 0; static int bpp __devinitdata = 8; static int reverse_i2c __devinitdata; #ifdef CONFIG_MTRR -static int nomtrr __devinitdata = 0; +static bool nomtrr __devinitdata = false; #endif #ifdef CONFIG_PMAC_BACKLIGHT static int backlight __devinitdata = 1; @@ -1509,7 +1509,7 @@ static int __devinit nvidiafb_setup(char *options) backlight = simple_strtoul(this_opt+10, NULL, 0); #ifdef CONFIG_MTRR } else if (!strncmp(this_opt, "nomtrr", 6)) { - nomtrr = 1; + nomtrr = true; #endif } else if (!strncmp(this_opt, "fpdither:", 9)) { fpdither = simple_strtol(this_opt+9, NULL, 0); @@ -1599,7 +1599,7 @@ MODULE_PARM_DESC(bpp, "pixel width in bits" module_param(reverse_i2c, int, 0); MODULE_PARM_DESC(reverse_i2c, "reverse port assignment of the i2c bus"); #ifdef CONFIG_MTRR -module_param(nomtrr, bool, 0); +module_param(nomtrr, bool, false); MODULE_PARM_DESC(nomtrr, "Disables MTRR support (0 or 1=disabled) " "(default=0)"); #endif From 01fa310cd9158ebdd1ee95383540ecaa3f4ef3f6 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 12 Jan 2012 17:17:36 -0800 Subject: [PATCH 12/96] ia64, exec: remove redundant set_fs(USER_DS) The address limit is already set in flush_old_exec() so this set_fs(USER_DS) is redundant. Signed-off-by: Mathias Krause Cc: Fenghua Yu Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/processor.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index d9f397fae03e..691be0b95c1e 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -309,7 +309,6 @@ struct thread_struct { } #define start_thread(regs,new_ip,new_sp) do { \ - set_fs(USER_DS); \ regs->cr_ipsr = ((regs->cr_ipsr | (IA64_PSR_BITS_TO_SET | IA64_PSR_CPL)) \ & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_RI | IA64_PSR_IS)); \ regs->cr_iip = new_ip; \ From 15ee2d000dd5813fcf1204b078fa276e57046b64 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 12 Jan 2012 17:17:38 -0800 Subject: [PATCH 13/96] parisc, exec: remove redundant set_fs(USER_DS) The address limit is already set in flush_old_exec() so those calls to set_fs(USER_DS) are redundant. Signed-off-by: Mathias Krause Cc: Kyle McMartin Cc: Helge Deller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/parisc/include/asm/processor.h | 2 -- arch/parisc/kernel/process.c | 1 - 2 files changed, 3 deletions(-) diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index 9ce66e9d1c2b..7213ec9e594c 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -196,7 +196,6 @@ typedef unsigned int elf_caddr_t; /* offset pc for priv. 
level */ \ pc |= 3; \ \ - set_fs(USER_DS); \ regs->iasq[0] = spaceid; \ regs->iasq[1] = spaceid; \ regs->iaoq[0] = pc; \ @@ -299,7 +298,6 @@ on downward growing arches, it looks like this: elf_addr_t pc = (elf_addr_t)new_pc | 3; \ elf_caddr_t *argv = (elf_caddr_t *)bprm->exec + 1; \ \ - set_fs(USER_DS); \ regs->iasq[0] = spaceid; \ regs->iasq[1] = spaceid; \ regs->iaoq[0] = pc; \ diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 4b4b9181a1a0..62c60b87d039 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -192,7 +192,6 @@ void flush_thread(void) /* Only needs to handle fpu stuff or perf monitors. ** REVISIT: several arches implement a "lazy fpu state". */ - set_fs(USER_DS); } void release_thread(struct task_struct *dead_task) From 888a214dc4c65e1486ef7d3ac6b0cf93ae3cc2c7 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 12 Jan 2012 17:17:39 -0800 Subject: [PATCH 14/96] slub: document setting min order with debug_guardpage_minorder > 0 Acked-by: David Rientjes Cc: Pekka Enberg Cc: "Rafael J. Wysocki" Signed-off-by: Stanislaw Gruszka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-kernel-slab | 4 ++++ Documentation/vm/slub.txt | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab index 8b093f8222d3..91bd6ca5440f 100644 --- a/Documentation/ABI/testing/sysfs-kernel-slab +++ b/Documentation/ABI/testing/sysfs-kernel-slab @@ -346,6 +346,10 @@ Description: number of objects per slab. If a slab cannot be allocated because of fragmentation, SLUB will retry with the minimum order possible depending on its characteristics. + When debug_guardpage_minorder=N (N > 0) parameter is specified + (see Documentation/kernel-parameters.txt), the minimum possible + order is used and this sysfs entry can not be used to change + the order at run time. What: /sys/kernel/slab/cache/order_fallback Date: April 2008 diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt index 2acdda9601b0..6752870c4970 100644 --- a/Documentation/vm/slub.txt +++ b/Documentation/vm/slub.txt @@ -131,7 +131,10 @@ slub_min_objects. slub_max_order specified the order at which slub_min_objects should no longer be checked. This is useful to avoid SLUB trying to generate super large order pages to fit slub_min_objects of a slab cache with -large object sizes into one high order page. +large object sizes into one high order page. Setting command line +parameter debug_guardpage_minorder=N (N > 0), forces setting +slub_max_order to 0, what cause minimum possible order of slabs +allocation. 
SLUB Debug output ----------------- From 2ccd4f4d4737b37e21dd92c8c584c23cd87740a2 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 12 Jan 2012 17:17:40 -0800 Subject: [PATCH 15/96] pipe: fail cleanly when root tries F_SETPIPE_SZ with big size When a user with the CAP_SYS_RESOURCE cap tries to F_SETPIPE_SZ a pipe with size bigger than kmalloc() can alloc it spits out an ugly warning: ------------[ cut here ]------------ WARNING: at mm/page_alloc.c:2095 __alloc_pages_nodemask+0x5d3/0x7a0() Pid: 733, comm: a.out Not tainted 3.2.0-rc1+ #4 Call Trace: warn_slowpath_common+0x75/0xb0 warn_slowpath_null+0x15/0x20 __alloc_pages_nodemask+0x5d3/0x7a0 __get_free_pages+0x12/0x50 __kmalloc+0x12b/0x150 pipe_set_size+0x75/0x120 pipe_fcntl+0xf8/0x140 do_fcntl+0x2d4/0x410 sys_fcntl+0x66/0xa0 system_call_fastpath+0x16/0x1b ---[ end trace 432f702e6db7b5ee ]--- Instead, make kcalloc() handle the overflow case and fail quietly. [akpm@linux-foundation.org: switch to sizeof(*bufs) for 80-column niceness] Signed-off-by: Sasha Levin Cc: Alexander Viro Acked-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/pipe.c b/fs/pipe.c index f0e485d54e64..a932ced92a16 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1137,7 +1137,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) if (nr_pages < pipe->nrbufs) return -EBUSY; - bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL); + bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; From 28d82dc1c4edbc352129f97f4ca22624d1fe61de Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 12 Jan 2012 17:17:43 -0800 Subject: [PATCH 16/96] epoll: limit paths The current epoll code can be tickled to run basically indefinitely in both loop detection path check (on ep_insert()), and in the wakeup paths. The programs that tickle this behavior set up deeply linked networks of epoll file descriptors that cause the epoll algorithms to traverse them indefinitely. A couple of these sample programs have been previously posted in this thread: https://lkml.org/lkml/2011/2/25/297. To fix the loop detection path check algorithms, I simply keep track of the epoll nodes that have been already visited. Thus, the loop detection becomes proportional to the number of epoll file descriptor and links. This dramatically decreases the run-time of the loop check algorithm. In one diabolical case I tried it reduced the run-time from 15 mintues (all in kernel time) to .3 seconds. Fixing the wakeup paths could be done at wakeup time in a similar manner by keeping track of nodes that have already been visited, but the complexity is harder, since there can be multiple wakeups on different cpus...Thus, I've opted to limit the number of possible wakeup paths when the paths are created. This is accomplished, by noting that the end file descriptor points that are found during the loop detection pass (from the newly added link), are actually the sources for wakeup events. I keep a list of these file descriptors and limit the number and length of these paths that emanate from these 'source file descriptors'. In the current implemetation I allow 1000 paths of length 1, 500 of length 2, 100 of length 3, 50 of length 4 and 10 of length 5. Note that it is sufficient to check the 'source file descriptors' reachable from the newly added link, since no other 'source file descriptors' will have newly added links. 
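For reference, those limits correspond to the static table introduced later in this patch (shown here only as an aside, not as an additional hunk):

    #define PATH_ARR_SIZE 5
    /* allowed number of wakeup paths of length 1..5 emanating from one source file */
    static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };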
This allows us to check only the wakeup paths that may have gotten too long, and not re-check all possible wakeup paths on the system. In terms of the path limit selection, I think its first worth noting that the most common case for epoll, is probably the model where you have 1 epoll file descriptor that is monitoring n number of 'source file descriptors'. In this case, each 'source file descriptor' has a 1 path of length 1. Thus, I believe that the limits I'm proposing are quite reasonable and in fact may be too generous. Thus, I'm hoping that the proposed limits will not prevent any workloads that currently work to fail. In terms of locking, I have extended the use of the 'epmutex' to all epoll_ctl add and remove operations. Currently its only used in a subset of the add paths. I need to hold the epmutex, so that we can correctly traverse a coherent graph, to check the number of paths. I believe that this additional locking is probably ok, since its in the setup/teardown paths, and doesn't affect the running paths, but it certainly is going to add some extra overhead. Also, worth noting is that the epmuex was recently added to the ep_ctl add operations in the initial path loop detection code using the argument that it was not on a critical path. Another thing to note here, is the length of epoll chains that is allowed. Currently, eventpoll.c defines: /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 This basically means that I am limited to a graph depth of 5 (EP_MAX_NESTS + 1). However, this limit is currently only enforced during the loop check detection code, and only when the epoll file descriptors are added in a certain order. Thus, this limit is currently easily bypassed. The newly added check for wakeup paths, stricly limits the wakeup paths to a length of 5, regardless of the order in which ep's are linked together. Thus, a side-effect of the new code is a more consistent enforcement of the graph depth. Thus far, I've tested this, using the sample programs previously mentioned, which now either return quickly or return -EINVAL. I've also testing using the piptest.c epoll tester, which showed no difference in performance. I've also created a number of different epoll networks and tested that they behave as expectded. I believe this solves the original diabolical test cases, while still preserving the sane epoll nesting. Signed-off-by: Jason Baron Cc: Nelson Elhage Cc: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 232 ++++++++++++++++++++++++++++++++++---- include/linux/eventpoll.h | 1 + include/linux/fs.h | 1 + 3 files changed, 210 insertions(+), 24 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 828e750af23a..aabdfc38cf24 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -197,6 +197,12 @@ struct eventpoll { /* The user that created the eventpoll descriptor */ struct user_struct *user; + + struct file *file; + + /* used to optimize loop detection check */ + int visited; + struct list_head visited_list_link; }; /* Wait structure used by the poll hooks */ @@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; +/* Visited nodes during ep_loop_check(), so we can unset them when we finish */ +static LIST_HEAD(visited_list); + +/* + * List of files with newly added links, where we may need to limit the number + * of emanating paths. Protected by the epmutex. 
+ */ +static LIST_HEAD(tfile_check_list); + #ifdef CONFIG_SYSCTL #include @@ -276,6 +291,12 @@ ctl_table epoll_table[] = { }; #endif /* CONFIG_SYSCTL */ +static const struct file_operations eventpoll_fops; + +static inline int is_file_epoll(struct file *f) +{ + return f->f_op == &eventpoll_fops; +} /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, @@ -711,12 +732,6 @@ static const struct file_operations eventpoll_fops = { .llseek = noop_llseek, }; -/* Fast test to see if the file is an eventpoll file */ -static inline int is_file_epoll(struct file *f) -{ - return f->f_op == &eventpoll_fops; -} - /* * This is called from eventpoll_release() to unlink files from the eventpoll * interface. We need to have this facility to cleanup correctly files that are @@ -926,6 +941,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } + + +#define PATH_ARR_SIZE 5 +/* + * These are the number paths of length 1 to 5, that we are allowing to emanate + * from a single file of interest. For example, we allow 1000 paths of length + * 1, to emanate from each file of interest. This essentially represents the + * potential wakeup paths, which need to be limited in order to avoid massive + * uncontrolled wakeup storms. The common use case should be a single ep which + * is connected to n file sources. In this case each file source has 1 path + * of length 1. Thus, the numbers below should be more than sufficient. These + * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify + * and delete can't add additional paths. Protected by the epmutex. + */ +static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; +static int path_count[PATH_ARR_SIZE]; + +static int path_count_inc(int nests) +{ + if (++path_count[nests] > path_limits[nests]) + return -1; + return 0; +} + +static void path_count_init(void) +{ + int i; + + for (i = 0; i < PATH_ARR_SIZE; i++) + path_count[i] = 0; +} + +static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct file *child_file; + struct epitem *epi; + + list_for_each_entry(epi, &file->f_ep_links, fllink) { + child_file = epi->ep->file; + if (is_file_epoll(child_file)) { + if (list_empty(&child_file->f_ep_links)) { + if (path_count_inc(call_nests)) { + error = -1; + break; + } + } else { + error = ep_call_nested(&poll_loop_ncalls, + EP_MAX_NESTS, + reverse_path_check_proc, + child_file, child_file, + current); + } + if (error != 0) + break; + } else { + printk(KERN_ERR "reverse_path_check_proc: " + "file is not an ep!\n"); + } + } + return error; +} + +/** + * reverse_path_check - The tfile_check_list is list of file *, which have + * links that are proposed to be newly added. We need to + * make sure that those added links don't add too many + * paths such that we will spend all our time waking up + * eventpoll objects. + * + * Returns: Returns zero if the proposed links don't create too many paths, + * -1 otherwise. 
+ */ +static int reverse_path_check(void) +{ + int length = 0; + int error = 0; + struct file *current_file; + + /* let's call this for all tfiles */ + list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { + length++; + path_count_init(); + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + reverse_path_check_proc, current_file, + current_file, current); + if (error) + break; + } + return error; +} + /* * Must be called with "mtx" held. */ @@ -987,6 +1095,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, */ ep_rbtree_insert(ep, epi); + /* now check if we've created too many backpaths */ + error = -EINVAL; + if (reverse_path_check()) + goto error_remove_epi; + /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irqsave(&ep->lock, flags); @@ -1011,6 +1124,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, return 0; +error_remove_epi: + spin_lock(&tfile->f_lock); + if (ep_is_linked(&epi->fllink)) + list_del_init(&epi->fllink); + spin_unlock(&tfile->f_lock); + + rb_erase(&epi->rbn, &ep->rbr); + error_unregister: ep_unregister_pollwait(ep, epi); @@ -1275,18 +1396,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) int error = 0; struct file *file = priv; struct eventpoll *ep = file->private_data; + struct eventpoll *ep_tovisit; struct rb_node *rbp; struct epitem *epi; mutex_lock_nested(&ep->mtx, call_nests + 1); + ep->visited = 1; + list_add(&ep->visited_list_link, &visited_list); for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { + ep_tovisit = epi->ffd.file->private_data; + if (ep_tovisit->visited) + continue; error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, - ep_loop_check_proc, epi->ffd.file, - epi->ffd.file->private_data, current); + ep_loop_check_proc, epi->ffd.file, + ep_tovisit, current); if (error != 0) break; + } else { + /* + * If we've reached a file that is not associated with + * an ep, then we need to check if the newly added + * links are going to add too many wakeup paths. We do + * this by adding it to the tfile_check_list, if it's + * not already there, and calling reverse_path_check() + * during ep_insert(). 
+ */ + if (list_empty(&epi->ffd.file->f_tfile_llink)) + list_add(&epi->ffd.file->f_tfile_llink, + &tfile_check_list); } } mutex_unlock(&ep->mtx); @@ -1307,8 +1446,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + int ret; + struct eventpoll *ep_cur, *ep_next; + + ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, ep_loop_check_proc, file, ep, current); + /* clear visited list */ + list_for_each_entry_safe(ep_cur, ep_next, &visited_list, + visited_list_link) { + ep_cur->visited = 0; + list_del(&ep_cur->visited_list_link); + } + return ret; +} + +static void clear_tfile_check_list(void) +{ + struct file *file; + + /* first clear the tfile_check_list */ + while (!list_empty(&tfile_check_list)) { + file = list_first_entry(&tfile_check_list, struct file, + f_tfile_llink); + list_del_init(&file->f_tfile_llink); + } + INIT_LIST_HEAD(&tfile_check_list); } /* @@ -1316,8 +1478,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file) */ SYSCALL_DEFINE1(epoll_create1, int, flags) { - int error; + int error, fd; struct eventpoll *ep = NULL; + struct file *file; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); @@ -1334,11 +1497,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ - error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, + fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); + if (fd < 0) { + error = fd; + goto out_free_ep; + } + file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); - if (error < 0) - ep_free(ep); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto out_free_fd; + } + fd_install(fd, file); + ep->file = file; + return fd; +out_free_fd: + put_unused_fd(fd); +out_free_ep: + ep_free(ep); return error; } @@ -1404,21 +1581,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* * When we insert an epoll file descriptor, inside another epoll file * descriptor, there is the change of creating closed loops, which are - * better be handled here, than in more critical paths. + * better be handled here, than in more critical paths. While we are + * checking for loops we also determine the list of files reachable + * and hang them on the tfile_check_list, so we can check that we + * haven't created too many possible wakeup paths. * - * We hold epmutex across the loop check and the insert in this case, in - * order to prevent two separate inserts from racing and each doing the - * insert "at the same time" such that ep_loop_check passes on both - * before either one does the insert, thereby creating a cycle. + * We need to hold the epmutex across both ep_insert and ep_remove + * b/c we want to make sure we are looking at a coherent view of + * epoll network. 
*/ - if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { + if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { mutex_lock(&epmutex); did_lock_epmutex = 1; - error = -ELOOP; - if (ep_loop_check(ep, tfile) != 0) - goto error_tgt_fput; } - + if (op == EPOLL_CTL_ADD) { + if (is_file_epoll(tfile)) { + error = -ELOOP; + if (ep_loop_check(ep, tfile) != 0) + goto error_tgt_fput; + } else + list_add(&tfile->f_tfile_llink, &tfile_check_list); + } mutex_lock_nested(&ep->mtx, 0); @@ -1437,6 +1620,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, error = ep_insert(ep, &epds, tfile, fd); } else error = -EEXIST; + clear_tfile_check_list(); break; case EPOLL_CTL_DEL: if (epi) @@ -1455,7 +1639,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, mutex_unlock(&ep->mtx); error_tgt_fput: - if (unlikely(did_lock_epmutex)) + if (did_lock_epmutex) mutex_unlock(&epmutex); fput(tfile); diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index f362733186a5..657ab55beda0 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -61,6 +61,7 @@ struct file; static inline void eventpoll_init_file(struct file *file) { INIT_LIST_HEAD(&file->f_ep_links); + INIT_LIST_HEAD(&file->f_tfile_llink); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 7aacf31418fe..a7409bc157e0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1001,6 +1001,7 @@ struct file { #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ struct list_head f_ep_links; + struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; #ifdef CONFIG_DEBUG_WRITECOUNT From ab936cbcd02072a34b60d268f94440fd5cf1970b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 12 Jan 2012 17:17:44 -0800 Subject: [PATCH 17/96] memcg: add mem_cgroup_replace_page_cache() to fix LRU issue Commit ef6a3c6311 ("mm: add replace_page_cache_page() function") added a function replace_page_cache_page(). This function replaces a page in the radix-tree with a new page. WHen doing this, memory cgroup needs to fix up the accounting information. memcg need to check PCG_USED bit etc. In some(many?) cases, 'newpage' is on LRU before calling replace_page_cache(). So, memcg's LRU accounting information should be fixed, too. This patch adds mem_cgroup_replace_page_cache() and removes the old hooks. In that function, old pages will be unaccounted without touching res_counter and new page will be accounted to the memcg (of old page). WHen overwriting pc->mem_cgroup of newpage, take zone->lru_lock and avoid races with LRU handling. Background: replace_page_cache_page() is called by FUSE code in its splice() handling. Here, 'newpage' is replacing oldpage but this newpage is not a newly allocated page and may be on LRU. LRU mis-accounting will be critical for memory cgroup because rmdir() checks the whole LRU is empty and there is no account leak. If a page is on the other LRU than it should be, rmdir() will fail. This bug was added in March 2011, but no bug report yet. I guess there are not many people who use memcg and FUSE at the same time with upstream kernels. The result of this bug is that admin cannot destroy a memcg because of account leak. So, no panic, no deadlock. And, even if an active cgroup exist, umount can succseed. So no problem at shutdown. 
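For context, the caller side in replace_page_cache_page() ends up invoking the new hook only after tree_lock has been dropped, roughly as follows (excerpted from the mm/filemap.c hunk below for readability; the diff is authoritative):

    spin_unlock_irq(&mapping->tree_lock);
    /* mem_cgroup codes must not be called under tree_lock */
    mem_cgroup_replace_page_cache(old, new);
    radix_tree_preload_end();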
Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Miklos Szeredi Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ mm/filemap.c | 18 ++-------------- mm/memcontrol.c | 44 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 16 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f944591765eb..3558a5e268cf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -122,6 +122,8 @@ struct zone_reclaim_stat* mem_cgroup_get_reclaim_stat_from_page(struct page *page); extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); +extern void mem_cgroup_replace_page_cache(struct page *oldpage, + struct page *newpage); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; @@ -369,6 +371,10 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { } +static inline void mem_cgroup_replace_page_cache(struct page *oldpage, + struct page *newpage) +{ +} #endif /* CONFIG_CGROUP_MEM_CONT */ #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) diff --git a/mm/filemap.c b/mm/filemap.c index c4ee2e918bea..97f49ed35bd2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -393,24 +393,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { int error; - struct mem_cgroup *memcg = NULL; VM_BUG_ON(!PageLocked(old)); VM_BUG_ON(!PageLocked(new)); VM_BUG_ON(new->mapping); - /* - * This is not page migration, but prepare_migration and - * end_migration does enough work for charge replacement. - * - * In the longer term we probably want a specialized function - * for moving the charge from old to new in a more efficient - * manner. - */ - error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); - if (error) - return error; - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (!error) { struct address_space *mapping = old->mapping; @@ -432,13 +419,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); + /* mem_cgroup codes must not be called under tree_lock */ + mem_cgroup_replace_page_cache(old, new); radix_tree_preload_end(); if (freepage) freepage(old); page_cache_release(old); - mem_cgroup_end_migration(memcg, old, new, true); - } else { - mem_cgroup_end_migration(memcg, old, new, false); } return error; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d87aa3510c5e..0b2d4036f1cd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3432,6 +3432,50 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, cgroup_release_and_wakeup_rmdir(&memcg->css); } +/* + * At replace page cache, newpage is not under any memcg but it's on + * LRU. So, this function doesn't touch res_counter but handles LRU + * in correct way. Both pages are locked so we cannot race with uncharge. 
+ */ +void mem_cgroup_replace_page_cache(struct page *oldpage, + struct page *newpage) +{ + struct mem_cgroup *memcg; + struct page_cgroup *pc; + struct zone *zone; + enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; + unsigned long flags; + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(oldpage); + /* fix accounting on old pages */ + lock_page_cgroup(pc); + memcg = pc->mem_cgroup; + mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); + ClearPageCgroupUsed(pc); + unlock_page_cgroup(pc); + + if (PageSwapBacked(oldpage)) + type = MEM_CGROUP_CHARGE_TYPE_SHMEM; + + zone = page_zone(newpage); + pc = lookup_page_cgroup(newpage); + /* + * Even if newpage->mapping was NULL before starting replacement, + * the newpage may be on LRU(or pagevec for LRU) already. We lock + * LRU while we overwrite pc->mem_cgroup. + */ + spin_lock_irqsave(&zone->lru_lock, flags); + if (PageLRU(newpage)) + del_page_from_lru_list(zone, newpage, page_lru(newpage)); + __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type); + if (PageLRU(newpage)) + add_page_to_lru_list(zone, newpage, page_lru(newpage)); + spin_unlock_irqrestore(&zone->lru_lock, flags); +} + #ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { From 9f3a0d0933de079665ec1b498947ffbf805b0018 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:17:48 -0800 Subject: [PATCH 18/96] mm: memcg: consolidate hierarchy iteration primitives The memcg naturalization series: Memory control groups are currently bolted onto the side of traditional memory management in places where better integration would be preferable. To reclaim memory, for example, memory control groups maintain their own LRU list and reclaim strategy aside from the global per-zone LRU list reclaim. But an extra list head for each existing page frame is expensive and maintaining it requires additional code. This patchset disables the global per-zone LRU lists on memory cgroup configurations and converts all their users to operate on the per-memory cgroup lists instead.
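As an illustration of the interface this series converges on, here is a minimal, hypothetical walker written against the consolidated iterator that this particular patch introduces in the hunks further below. It is a sketch inside mm/memcontrol.c, not code from the series, and assumes only the for_each_mem_cgroup_tree() macro, mem_cgroup_iter_break(), and the oom_lock field that appear in those hunks; a walk that exits early must call mem_cgroup_iter_break() so the css reference held on the last returned group is dropped.

/*
 * Illustrative sketch only: pre-order walk over a memcg hierarchy with
 * the consolidated iterator. example_hierarchy_locked() is hypothetical.
 */
static bool example_hierarchy_locked(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, root) {
		if (iter->oom_lock) {
			/* leaving the loop early: drop the reference on iter */
			mem_cgroup_iter_break(root, iter);
			return true;
		}
	}
	return false;
}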
As LRU pages are then exclusively on one list, this saves two list pointers for each page frame in the system: page_cgroup array size with 4G physical memory vanilla: allocated 31457280 bytes of page_cgroup patched: allocated 15728640 bytes of page_cgroup At the same time, system performance for various workloads is unaffected: 100G sparse file cat, 4G physical memory, 10 runs, to test for code bloat in the traditional LRU handling and kswapd & direct reclaim paths, without/with the memory controller configured in vanilla: 71.603(0.207) seconds patched: 71.640(0.156) seconds vanilla: 79.558(0.288) seconds patched: 77.233(0.147) seconds 100G sparse file cat in 1G memory cgroup, 10 runs, to test for code bloat in the traditional memory cgroup LRU handling and reclaim path vanilla: 96.844(0.281) seconds patched: 94.454(0.311) seconds 4 unlimited memcgs running kbuild -j32 each, 4G physical memory, 500M swap on SSD, 10 runs, to test for regressions in kswapd & direct reclaim using per-memcg LRU lists with multiple memcgs and multiple allocators within each memcg vanilla: 717.722(1.440) seconds [ 69720.100(11600.835) majfaults ] patched: 714.106(2.313) seconds [ 71109.300(14886.186) majfaults ] 16 unlimited memcgs running kbuild, 1900M hierarchical limit, 500M swap on SSD, 10 runs, to test for regressions in hierarchical memcg setups vanilla: 2742.058(1.992) seconds [ 26479.600(1736.737) majfaults ] patched: 2743.267(1.214) seconds [ 27240.700(1076.063) majfaults ] This patch: There are currently two different implementations of iterating over a memory cgroup hierarchy tree. Consolidate them into one worker function and base the convenience looping-macros on top of it. Signed-off-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Reviewed-by: Kirill A. Shutemov Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Ying Han Cc: Greg Thelen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 203 ++++++++++++++++++------------------------------ 1 file changed, 77 insertions(+), 126 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0b2d4036f1cd..6edef95fecf4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -853,83 +853,76 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } -/* The caller has to guarantee "mem" exists before calling this */ -static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg) +static struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, + struct mem_cgroup *prev, + bool reclaim) { - struct cgroup_subsys_state *css; - int found; - - if (!memcg) /* ROOT cgroup has the smallest ID */ - return root_mem_cgroup; /*css_put/get against root is ignored*/ - if (!memcg->use_hierarchy) { - if (css_tryget(&memcg->css)) - return memcg; - return NULL; - } - rcu_read_lock(); - /* - * searching a memory cgroup which has the smallest ID under given - * ROOT cgroup. 
(ID >= 1) - */ - css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found); - if (css && css_tryget(css)) - memcg = container_of(css, struct mem_cgroup, css); - else - memcg = NULL; - rcu_read_unlock(); - return memcg; -} - -static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, - struct mem_cgroup *root, - bool cond) -{ - int nextid = css_id(&iter->css) + 1; - int found; - int hierarchy_used; - struct cgroup_subsys_state *css; - - hierarchy_used = iter->use_hierarchy; - - css_put(&iter->css); - /* If no ROOT, walk all, ignore hierarchy */ - if (!cond || (root && !hierarchy_used)) - return NULL; + struct mem_cgroup *memcg = NULL; + int id = 0; if (!root) root = root_mem_cgroup; - do { - iter = NULL; + if (prev && !reclaim) + id = css_id(&prev->css); + + if (prev && prev != root) + css_put(&prev->css); + + if (!root->use_hierarchy && root != root_mem_cgroup) { + if (prev) + return NULL; + return root; + } + + while (!memcg) { + struct cgroup_subsys_state *css; + + if (reclaim) + id = root->last_scanned_child; + rcu_read_lock(); - - css = css_get_next(&mem_cgroup_subsys, nextid, - &root->css, &found); - if (css && css_tryget(css)) - iter = container_of(css, struct mem_cgroup, css); + css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); + if (css) { + if (css == &root->css || css_tryget(css)) + memcg = container_of(css, + struct mem_cgroup, css); + } else + id = 0; rcu_read_unlock(); - /* If css is NULL, no more cgroups will be found */ - nextid = found + 1; - } while (css && !iter); - return iter; + if (reclaim) + root->last_scanned_child = id; + + if (prev && !css) + return NULL; + } + return memcg; } + +static void mem_cgroup_iter_break(struct mem_cgroup *root, + struct mem_cgroup *prev) +{ + if (!root) + root = root_mem_cgroup; + if (prev && prev != root) + css_put(&prev->css); +} + /* - * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please - * be careful that "break" loop is not allowed. We have reference count. - * Instead of that modify "cond" to be false and "continue" to exit the loop. + * Iteration constructs for visiting all cgroups (under a tree). If + * loops are exited prematurely (break), mem_cgroup_iter_break() must + * be used for reference counting. */ -#define for_each_mem_cgroup_tree_cond(iter, root, cond) \ - for (iter = mem_cgroup_start_loop(root);\ - iter != NULL;\ - iter = mem_cgroup_get_next(iter, root, cond)) - -#define for_each_mem_cgroup_tree(iter, root) \ - for_each_mem_cgroup_tree_cond(iter, root, true) - -#define for_each_mem_cgroup_all(iter) \ - for_each_mem_cgroup_tree_cond(iter, NULL, true) +#define for_each_mem_cgroup_tree(iter, root) \ + for (iter = mem_cgroup_iter(root, NULL, false); \ + iter != NULL; \ + iter = mem_cgroup_iter(root, iter, false)) +#define for_each_mem_cgroup(iter) \ + for (iter = mem_cgroup_iter(NULL, NULL, false); \ + iter != NULL; \ + iter = mem_cgroup_iter(NULL, iter, false)) static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { @@ -1536,43 +1529,6 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) return min(limit, memsw); } -/* - * Visit the first child (need not be the first child as per the ordering - * of the cgroup list, since we track last_scanned_child) of @mem and use - * that to reclaim free pages from. 
- */ -static struct mem_cgroup * -mem_cgroup_select_victim(struct mem_cgroup *root_memcg) -{ - struct mem_cgroup *ret = NULL; - struct cgroup_subsys_state *css; - int nextid, found; - - if (!root_memcg->use_hierarchy) { - css_get(&root_memcg->css); - ret = root_memcg; - } - - while (!ret) { - rcu_read_lock(); - nextid = root_memcg->last_scanned_child + 1; - css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css, - &found); - if (css && css_tryget(css)) - ret = container_of(css, struct mem_cgroup, css); - - rcu_read_unlock(); - /* Updates scanning parameter */ - if (!css) { - /* this means start scan from ID:1 */ - root_memcg->last_scanned_child = 0; - } else - root_memcg->last_scanned_child = found; - } - - return ret; -} - /** * test_mem_cgroup_node_reclaimable * @mem: the target memcg @@ -1728,7 +1684,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, unsigned long reclaim_options, unsigned long *total_scanned) { - struct mem_cgroup *victim; + struct mem_cgroup *victim = NULL; int ret, total = 0; int loop = 0; bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; @@ -1744,8 +1700,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, noswap = true; while (1) { - victim = mem_cgroup_select_victim(root_memcg); - if (victim == root_memcg) { + victim = mem_cgroup_iter(root_memcg, victim, true); + if (!victim) { loop++; /* * We are not draining per cpu cached charges during @@ -1761,10 +1717,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, * anything, it might because there are * no reclaimable pages under this hierarchy */ - if (!check_soft || !total) { - css_put(&victim->css); + if (!check_soft || !total) break; - } /* * We want to do more targeted reclaim. * excess >> 2 is not to excessive so as to @@ -1772,15 +1726,13 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, * coming back to reclaim from this cgroup */ if (total >= (excess >> 2) || - (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { - css_put(&victim->css); + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) break; - } } + continue; } if (!mem_cgroup_reclaimable(victim, noswap)) { /* this cgroup's local usage == 0 */ - css_put(&victim->css); continue; } /* we use swappiness of local cgroup */ @@ -1791,21 +1743,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, } else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap); - css_put(&victim->css); + total += ret; /* * At shrinking usage, we can't check we should stop here or * reclaim more. It's depends on callers. last_scanned_child * will work enough for keeping fairness under tree. */ if (shrink) - return ret; - total += ret; + break; if (check_soft) { if (!res_counter_soft_limit_excess(&root_memcg->res)) - return total; + break; } else if (mem_cgroup_margin(root_memcg)) - return total; + break; } + mem_cgroup_iter_break(root_memcg, victim); return total; } @@ -1817,16 +1769,16 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) { struct mem_cgroup *iter, *failed = NULL; - bool cond = true; - for_each_mem_cgroup_tree_cond(iter, memcg, cond) { + for_each_mem_cgroup_tree(iter, memcg) { if (iter->oom_lock) { /* * this subtree of our hierarchy is already locked * so we cannot give a lock. 
*/ failed = iter; - cond = false; + mem_cgroup_iter_break(memcg, iter); + break; } else iter->oom_lock = true; } @@ -1838,11 +1790,10 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) * OK, we failed to lock the whole subtree so we have to clean up * what we set up to the failing subtree */ - cond = true; - for_each_mem_cgroup_tree_cond(iter, memcg, cond) { + for_each_mem_cgroup_tree(iter, memcg) { if (iter == failed) { - cond = false; - continue; + mem_cgroup_iter_break(memcg, iter); + break; } iter->oom_lock = false; } @@ -2238,7 +2189,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, struct mem_cgroup *iter; if ((action == CPU_ONLINE)) { - for_each_mem_cgroup_all(iter) + for_each_mem_cgroup(iter) synchronize_mem_cgroup_on_move(iter, cpu); return NOTIFY_OK; } @@ -2246,7 +2197,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) return NOTIFY_OK; - for_each_mem_cgroup_all(iter) + for_each_mem_cgroup(iter) mem_cgroup_drain_pcp_counter(iter, cpu); stock = &per_cpu(memcg_stock, cpu); From 89b5fae5368f6aec62fb09c8e19b6c61f1154603 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:17:50 -0800 Subject: [PATCH 19/96] mm: vmscan: distinguish global reclaim from global LRU scanning The traditional zone reclaim code is scanning the per-zone LRU lists during direct reclaim and kswapd, and the per-zone per-memory cgroup LRU lists when reclaiming on behalf of a memory cgroup limit. Subsequent patches will convert the traditional reclaim code to reclaim exclusively from the per-memory cgroup LRU lists. As a result, using the predicate for which LRU list is scanned will no longer be appropriate to tell global reclaim from limit reclaim. This patch adds a global_reclaim() predicate to tell direct/kswapd reclaim from memory cgroup limit reclaim and substitutes it in all places where currently scanning_global_lru() is used for that. Signed-off-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Reviewed-by: Kirill A. 
Shutemov Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Ying Han Cc: Greg Thelen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 62 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 26f4a8a4e0c7..ee4a46b8ae33 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -153,9 +153,25 @@ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_CGROUP_MEM_RES_CTLR -#define scanning_global_lru(sc) (!(sc)->mem_cgroup) +static bool global_reclaim(struct scan_control *sc) +{ + return !sc->mem_cgroup; +} + +static bool scanning_global_lru(struct scan_control *sc) +{ + return !sc->mem_cgroup; +} #else -#define scanning_global_lru(sc) (1) +static bool global_reclaim(struct scan_control *sc) +{ + return true; +} + +static bool scanning_global_lru(struct scan_control *sc) +{ + return true; +} #endif static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, @@ -994,7 +1010,7 @@ keep_lumpy: * back off and wait for congestion to clear because further reclaim * will encounter the same problem */ - if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) + if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) zone_set_flag(zone, ZONE_CONGESTED); free_hot_cold_page_list(&free_pages, 1); @@ -1313,7 +1329,7 @@ static int too_many_isolated(struct zone *zone, int file, if (current_is_kswapd()) return 0; - if (!scanning_global_lru(sc)) + if (!global_reclaim(sc)) return 0; if (file) { @@ -1491,6 +1507,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, reclaim_mode, zone, 0, file); + } else { + nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, + &nr_scanned, sc->order, reclaim_mode, zone, + sc->mem_cgroup, 0, file); + } + if (global_reclaim(sc)) { zone->pages_scanned += nr_scanned; if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, @@ -1498,14 +1520,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, else __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); - } else { - nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, - &nr_scanned, sc->order, reclaim_mode, zone, - sc->mem_cgroup, 0, file); - /* - * mem_cgroup_isolate_pages() keeps track of - * scanned pages on its own. - */ } if (nr_taken == 0) { @@ -1646,18 +1660,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, &pgscanned, sc->order, reclaim_mode, zone, 1, file); - zone->pages_scanned += pgscanned; } else { nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, reclaim_mode, zone, sc->mem_cgroup, 1, file); - /* - * mem_cgroup_isolate_pages() keeps track of - * scanned pages on its own. 
- */ } + if (global_reclaim(sc)) + zone->pages_scanned += pgscanned; + reclaim_stat->recent_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, pgscanned); @@ -1827,7 +1839,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, static int vmscan_swappiness(struct scan_control *sc) { - if (scanning_global_lru(sc)) + if (global_reclaim(sc)) return vm_swappiness; return mem_cgroup_swappiness(sc->mem_cgroup); } @@ -1862,9 +1874,9 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * latencies, so it's better to scan a minimum amount there as * well. */ - if (scanning_global_lru(sc) && current_is_kswapd()) + if (current_is_kswapd()) force_scan = true; - if (!scanning_global_lru(sc)) + if (!global_reclaim(sc)) force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ @@ -1881,7 +1893,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); - if (scanning_global_lru(sc)) { + if (global_reclaim(sc)) { free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, force-scan anon pages. */ @@ -2114,7 +2126,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, * Take care memory controller reclaiming has small influence * to global LRU. */ - if (scanning_global_lru(sc)) { + if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; if (zone->all_unreclaimable && priority != DEF_PRIORITY) @@ -2212,7 +2224,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, get_mems_allowed(); delayacct_freepages_start(); - if (scanning_global_lru(sc)) + if (global_reclaim(sc)) count_vm_event(ALLOCSTALL); for (priority = DEF_PRIORITY; priority >= 0; priority--) { @@ -2226,7 +2238,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * Don't shrink slabs when reclaiming memory from * over limit cgroups */ - if (scanning_global_lru(sc)) { + if (global_reclaim(sc)) { unsigned long lru_pages = 0; for_each_zone_zonelist(zone, z, zonelist, gfp_zone(sc->gfp_mask)) { @@ -2288,7 +2300,7 @@ out: return 0; /* top priority shrink_zones still had more to do? don't OOM, then */ - if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) + if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) return 1; return 0; From f16015fbf2f6ac45505d6ad21455ff9f6c14473d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:17:52 -0800 Subject: [PATCH 20/96] mm: vmscan: distinguish between memcg triggering reclaim and memcg being scanned Memory cgroup hierarchies are currently handled completely outside of the traditional reclaim code, which is invoked with a single memory cgroup as an argument for the whole call stack. Subsequent patches will switch this code to do hierarchical reclaim, so there needs to be a distinction between a) the memory cgroup that is triggering reclaim due to hitting its limit and b) the memory cgroup that is being scanned as a child of a). This patch introduces a struct mem_cgroup_zone that contains the combination of the memory cgroup and the zone being scanned, which is then passed down the stack instead of the zone argument. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Reviewed-by: Kirill A. 
Shutemov Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Ying Han Cc: Greg Thelen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 249 +++++++++++++++++++++++++++++----------------------- 1 file changed, 141 insertions(+), 108 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index ee4a46b8ae33..e0627d07c3ac 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -103,8 +103,11 @@ struct scan_control { */ reclaim_mode_t reclaim_mode; - /* Which cgroup do we reclaim from */ - struct mem_cgroup *mem_cgroup; + /* + * The memory cgroup that hit its limit and as a result is the + * primary target of this reclaim invocation. + */ + struct mem_cgroup *target_mem_cgroup; /* * Nodemask of nodes allowed by the caller. If NULL, all nodes @@ -113,6 +116,11 @@ struct scan_control { nodemask_t *nodemask; }; +struct mem_cgroup_zone { + struct mem_cgroup *mem_cgroup; + struct zone *zone; +}; + #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) #ifdef ARCH_HAS_PREFETCH @@ -155,12 +163,12 @@ static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_CGROUP_MEM_RES_CTLR static bool global_reclaim(struct scan_control *sc) { - return !sc->mem_cgroup; + return !sc->target_mem_cgroup; } -static bool scanning_global_lru(struct scan_control *sc) +static bool scanning_global_lru(struct mem_cgroup_zone *mz) { - return !sc->mem_cgroup; + return !mz->mem_cgroup; } #else static bool global_reclaim(struct scan_control *sc) @@ -168,29 +176,30 @@ static bool global_reclaim(struct scan_control *sc) return true; } -static bool scanning_global_lru(struct scan_control *sc) +static bool scanning_global_lru(struct mem_cgroup_zone *mz) { return true; } #endif -static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, - struct scan_control *sc) +static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) { - if (!scanning_global_lru(sc)) - return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); + if (!scanning_global_lru(mz)) + return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); - return &zone->reclaim_stat; + return &mz->zone->reclaim_stat; } -static unsigned long zone_nr_lru_pages(struct zone *zone, - struct scan_control *sc, enum lru_list lru) +static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz, + enum lru_list lru) { - if (!scanning_global_lru(sc)) - return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, - zone_to_nid(zone), zone_idx(zone), BIT(lru)); + if (!scanning_global_lru(mz)) + return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup, + zone_to_nid(mz->zone), + zone_idx(mz->zone), + BIT(lru)); - return zone_page_state(zone, NR_LRU_BASE + lru); + return zone_page_state(mz->zone, NR_LRU_BASE + lru); } @@ -693,12 +702,13 @@ enum page_references { }; static enum page_references page_check_references(struct page *page, + struct mem_cgroup_zone *mz, struct scan_control *sc) { int referenced_ptes, referenced_page; unsigned long vm_flags; - referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); + referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ @@ -754,7 +764,7 @@ static enum page_references page_check_references(struct page *page, * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct zone *zone, + struct mem_cgroup_zone *mz, struct scan_control *sc, 
int priority, unsigned long *ret_nr_dirty, @@ -785,7 +795,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON(PageActive(page)); - VM_BUG_ON(page_zone(page) != zone); + VM_BUG_ON(page_zone(page) != mz->zone); sc->nr_scanned++; @@ -819,7 +829,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } - references = page_check_references(page, sc); + references = page_check_references(page, mz, sc); switch (references) { case PAGEREF_ACTIVATE: goto activate_locked; @@ -1011,7 +1021,7 @@ keep_lumpy: * will encounter the same problem */ if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) - zone_set_flag(zone, ZONE_CONGESTED); + zone_set_flag(mz->zone, ZONE_CONGESTED); free_hot_cold_page_list(&free_pages, 1); @@ -1347,13 +1357,14 @@ static int too_many_isolated(struct zone *zone, int file, * TODO: Try merging with migrations version of putback_lru_pages */ static noinline_for_stack void -putback_lru_pages(struct zone *zone, struct scan_control *sc, - unsigned long nr_anon, unsigned long nr_file, - struct list_head *page_list) +putback_lru_pages(struct mem_cgroup_zone *mz, struct scan_control *sc, + unsigned long nr_anon, unsigned long nr_file, + struct list_head *page_list) { struct page *page; struct pagevec pvec; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + struct zone *zone = mz->zone; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); pagevec_init(&pvec, 1); @@ -1393,15 +1404,17 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, pagevec_release(&pvec); } -static noinline_for_stack void update_isolated_counts(struct zone *zone, - struct scan_control *sc, - unsigned long *nr_anon, - unsigned long *nr_file, - struct list_head *isolated_list) +static noinline_for_stack void +update_isolated_counts(struct mem_cgroup_zone *mz, + struct scan_control *sc, + unsigned long *nr_anon, + unsigned long *nr_file, + struct list_head *isolated_list) { unsigned long nr_active; + struct zone *zone = mz->zone; unsigned int count[NR_LRU_LISTS] = { 0, }; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); nr_active = clear_active_flags(isolated_list, count); __count_vm_events(PGDEACTIVATE, nr_active); @@ -1470,8 +1483,8 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, * of reclaimed pages */ static noinline_for_stack unsigned long -shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, - struct scan_control *sc, int priority, int file) +shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, + struct scan_control *sc, int priority, int file) { LIST_HEAD(page_list); unsigned long nr_scanned; @@ -1482,6 +1495,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, unsigned long nr_dirty = 0; unsigned long nr_writeback = 0; isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; + struct zone *zone = mz->zone; while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1504,13 +1518,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_lock_irq(&zone->lru_lock); - if (scanning_global_lru(sc)) { + if (scanning_global_lru(mz)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, reclaim_mode, zone, 0, file); } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, reclaim_mode, zone, - sc->mem_cgroup, 0, file); + mz->mem_cgroup, 0, 
file); } if (global_reclaim(sc)) { zone->pages_scanned += nr_scanned; @@ -1527,17 +1541,17 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return 0; } - update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list); + update_isolated_counts(mz, sc, &nr_anon, &nr_file, &page_list); spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, + nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, &nr_dirty, &nr_writeback); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { set_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, zone, sc, + nr_reclaimed += shrink_page_list(&page_list, mz, sc, priority, &nr_dirty, &nr_writeback); } @@ -1546,7 +1560,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, __count_vm_events(KSWAPD_STEAL, nr_reclaimed); __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); - putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + putback_lru_pages(mz, sc, nr_anon, nr_file, &page_list); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -1633,8 +1647,10 @@ static void move_active_pages_to_lru(struct zone *zone, __count_vm_events(PGDEACTIVATE, pgmoved); } -static void shrink_active_list(unsigned long nr_pages, struct zone *zone, - struct scan_control *sc, int priority, int file) +static void shrink_active_list(unsigned long nr_pages, + struct mem_cgroup_zone *mz, + struct scan_control *sc, + int priority, int file) { unsigned long nr_taken; unsigned long pgscanned; @@ -1643,9 +1659,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); unsigned long nr_rotated = 0; isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; + struct zone *zone = mz->zone; lru_add_drain(); @@ -1655,7 +1672,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, reclaim_mode |= ISOLATE_CLEAN; spin_lock_irq(&zone->lru_lock); - if (scanning_global_lru(sc)) { + if (scanning_global_lru(mz)) { nr_taken = isolate_pages_global(nr_pages, &l_hold, &pgscanned, sc->order, reclaim_mode, zone, @@ -1664,7 +1681,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, reclaim_mode, zone, - sc->mem_cgroup, 1, file); + mz->mem_cgroup, 1, file); } if (global_reclaim(sc)) @@ -1690,7 +1707,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, continue; } - if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { + if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and @@ -1753,10 +1770,8 @@ static int inactive_anon_is_low_global(struct zone *zone) * Returns true if the zone does not have enough inactive anon pages, * meaning some active anon pages need to be deactivated. */ -static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) +static int inactive_anon_is_low(struct mem_cgroup_zone *mz) { - int low; - /* * If we don't have swap space, anonymous page deactivation * is pointless. 
@@ -1764,15 +1779,14 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) if (!total_swap_pages) return 0; - if (scanning_global_lru(sc)) - low = inactive_anon_is_low_global(zone); - else - low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone); - return low; + if (!scanning_global_lru(mz)) + return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, + mz->zone); + + return inactive_anon_is_low_global(mz->zone); } #else -static inline int inactive_anon_is_low(struct zone *zone, - struct scan_control *sc) +static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz) { return 0; } @@ -1790,8 +1804,7 @@ static int inactive_file_is_low_global(struct zone *zone) /** * inactive_file_is_low - check if file pages need to be deactivated - * @zone: zone to check - * @sc: scan control of this context + * @mz: memory cgroup and zone to check * * When the system is doing streaming IO, memory pressure here * ensures that active file pages get deactivated, until more @@ -1803,45 +1816,44 @@ static int inactive_file_is_low_global(struct zone *zone) * This uses a different ratio than the anonymous pages, because * the page cache uses a use-once replacement algorithm. */ -static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) +static int inactive_file_is_low(struct mem_cgroup_zone *mz) { - int low; + if (!scanning_global_lru(mz)) + return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, + mz->zone); - if (scanning_global_lru(sc)) - low = inactive_file_is_low_global(zone); - else - low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone); - return low; + return inactive_file_is_low_global(mz->zone); } -static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, - int file) +static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file) { if (file) - return inactive_file_is_low(zone, sc); + return inactive_file_is_low(mz); else - return inactive_anon_is_low(zone, sc); + return inactive_anon_is_low(mz); } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct zone *zone, struct scan_control *sc, int priority) + struct mem_cgroup_zone *mz, + struct scan_control *sc, int priority) { int file = is_file_lru(lru); if (is_active_lru(lru)) { - if (inactive_list_is_low(zone, sc, file)) - shrink_active_list(nr_to_scan, zone, sc, priority, file); + if (inactive_list_is_low(mz, file)) + shrink_active_list(nr_to_scan, mz, sc, priority, file); return 0; } - return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); + return shrink_inactive_list(nr_to_scan, mz, sc, priority, file); } -static int vmscan_swappiness(struct scan_control *sc) +static int vmscan_swappiness(struct mem_cgroup_zone *mz, + struct scan_control *sc) { if (global_reclaim(sc)) return vm_swappiness; - return mem_cgroup_swappiness(sc->mem_cgroup); + return mem_cgroup_swappiness(mz->mem_cgroup); } /* @@ -1852,13 +1864,13 @@ static int vmscan_swappiness(struct scan_control *sc) * * nr[0] = anon pages to scan; nr[1] = file pages to scan */ -static void get_scan_count(struct zone *zone, struct scan_control *sc, - unsigned long *nr, int priority) +static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, + unsigned long *nr, int priority) { unsigned long anon, file, free; unsigned long anon_prio, file_prio; unsigned long ap, fp; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); u64 fraction[2], denominator; enum lru_list l; int noswap = 
0; @@ -1888,16 +1900,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, goto out; } - anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); if (global_reclaim(sc)) { - free = zone_page_state(zone, NR_FREE_PAGES); + free = zone_page_state(mz->zone, NR_FREE_PAGES); /* If we have very few page cache pages, force-scan anon pages. */ - if (unlikely(file + free <= high_wmark_pages(zone))) { + if (unlikely(file + free <= high_wmark_pages(mz->zone))) { fraction[0] = 1; fraction[1] = 0; denominator = 1; @@ -1909,8 +1921,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = vmscan_swappiness(sc); - file_prio = 200 - vmscan_swappiness(sc); + anon_prio = vmscan_swappiness(mz, sc); + file_prio = 200 - vmscan_swappiness(mz, sc); /* * OK, so we have swap space and a fair amount of page cache @@ -1923,7 +1935,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * * anon in [0], file in [1] */ - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&mz->zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; reclaim_stat->recent_rotated[0] /= 2; @@ -1944,7 +1956,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&mz->zone->lru_lock); fraction[0] = ap; fraction[1] = fp; @@ -1954,7 +1966,7 @@ out: int file = is_file_lru(l); unsigned long scan; - scan = zone_nr_lru_pages(zone, sc, l); + scan = zone_nr_lru_pages(mz, l); if (priority || noswap) { scan >>= priority; if (!scan && force_scan) @@ -1972,7 +1984,7 @@ out: * back to the allocator and call try_to_compact_zone(), we ensure that * there are enough free pages for it to be likely successful */ -static inline bool should_continue_reclaim(struct zone *zone, +static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, unsigned long nr_reclaimed, unsigned long nr_scanned, struct scan_control *sc) @@ -2012,15 +2024,15 @@ static inline bool should_continue_reclaim(struct zone *zone, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); if (nr_swap_pages > 0) - inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) return true; /* If compaction would go ahead or the allocation would succeed, stop */ - switch (compaction_suitable(zone, sc->order)) { + switch (compaction_suitable(mz->zone, sc->order)) { case COMPACT_PARTIAL: case COMPACT_CONTINUE: return false; @@ -2032,8 +2044,8 @@ static inline bool should_continue_reclaim(struct zone *zone, /* * This is a basic per-zone page freer. 
Used by both kswapd and direct reclaim. */ -static void shrink_zone(int priority, struct zone *zone, - struct scan_control *sc) +static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, + struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; @@ -2045,7 +2057,7 @@ static void shrink_zone(int priority, struct zone *zone, restart: nr_reclaimed = 0; nr_scanned = sc->nr_scanned; - get_scan_count(zone, sc, nr, priority); + get_scan_count(mz, sc, nr, priority); blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -2057,7 +2069,7 @@ restart: nr[l] -= nr_to_scan; nr_reclaimed += shrink_list(l, nr_to_scan, - zone, sc, priority); + mz, sc, priority); } } /* @@ -2078,17 +2090,28 @@ restart: * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(zone, sc)) - shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + if (inactive_anon_is_low(mz)) + shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0); /* reclaim/compaction might need reclaim to continue */ - if (should_continue_reclaim(zone, nr_reclaimed, + if (should_continue_reclaim(mz, nr_reclaimed, sc->nr_scanned - nr_scanned, sc)) goto restart; throttle_vm_writeout(sc->gfp_mask); } +static void shrink_zone(int priority, struct zone *zone, + struct scan_control *sc) +{ + struct mem_cgroup_zone mz = { + .mem_cgroup = sc->target_mem_cgroup, + .zone = zone, + }; + + shrink_mem_cgroup_zone(priority, &mz, sc); +} + /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation @@ -2230,7 +2253,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc->nr_scanned = 0; if (!priority) - disable_swap_token(sc->mem_cgroup); + disable_swap_token(sc->target_mem_cgroup); if (shrink_zones(priority, zonelist, sc)) break; @@ -2317,7 +2340,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_unmap = 1, .may_swap = 1, .order = order, - .mem_cgroup = NULL, + .target_mem_cgroup = NULL, .nodemask = nodemask, }; struct shrink_control shrink = { @@ -2349,7 +2372,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_unmap = 1, .may_swap = !noswap, .order = 0, - .mem_cgroup = mem, + .target_mem_cgroup = mem, }; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | @@ -2387,7 +2410,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, .order = 0, - .mem_cgroup = mem_cont, + .target_mem_cgroup = mem_cont, .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), @@ -2417,6 +2440,18 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +static void age_active_anon(struct zone *zone, struct scan_control *sc, + int priority) +{ + struct mem_cgroup_zone mz = { + .mem_cgroup = NULL, + .zone = zone, + }; + + if (inactive_anon_is_low(&mz)) + shrink_active_list(SWAP_CLUSTER_MAX, &mz, sc, priority, 0); +} + /* * pgdat_balanced is used when checking if a node is balanced for high-order * allocations. 
Only zones that meet watermarks and are in a zone allowed @@ -2537,7 +2572,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, */ .nr_to_reclaim = ULONG_MAX, .order = order, - .mem_cgroup = NULL, + .target_mem_cgroup = NULL, }; struct shrink_control shrink = { .gfp_mask = sc.gfp_mask, @@ -2576,9 +2611,7 @@ loop_again: * Do some background aging of the anon list, to give * pages a chance to be referenced before reclaiming. */ - if (inactive_anon_is_low(zone, &sc)) - shrink_active_list(SWAP_CLUSTER_MAX, zone, - &sc, priority, 0); + age_active_anon(zone, &sc, priority); if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { From 527a5ec9a53471d855291ba9f1fdf1dd4e12a184 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:17:55 -0800 Subject: [PATCH 21/96] mm: memcg: per-priority per-zone hierarchy scan generations Memory cgroup limit reclaim currently picks one memory cgroup out of the target hierarchy, remembers it as the last scanned child, and reclaims all zones in it with decreasing priority levels. The new hierarchy reclaim code will pick memory cgroups from the same hierarchy concurrently from different zones and priority levels, it becomes necessary that hierarchy roots not only remember the last scanned child, but do so for each zone and priority level. Until now, we reclaimed memcgs like this: mem = mem_cgroup_iter(root) for each priority level: for each zone in zonelist: reclaim(mem, zone) But subsequent patches will move the memcg iteration inside the loop over the zones: for each priority level: for each zone in zonelist: mem = mem_cgroup_iter(root) reclaim(mem, zone) And to keep with the original scan order - memcg -> priority -> zone - the last scanned memcg has to be remembered per zone and per priority level. Furthermore, global reclaim will be switched to the hierarchy walk as well. Different from limit reclaim, which can just recheck the limit after some reclaim progress, its target is to scan all memcgs for the desired zone pages, proportional to the memcg size, and so reliably detecting a full hierarchy round-trip will become crucial. Currently, the code relies on one reclaimer encountering the same memcg twice, but that is error-prone with concurrent reclaimers. Instead, use a generation counter that is increased every time the child with the highest ID has been visited, so that reclaimers can stop when the generation changes. Signed-off-by: Johannes Weiner Reviewed-by: Kirill A. Shutemov Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Ying Han Cc: Greg Thelen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 65 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6edef95fecf4..bec451da7def 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -123,6 +123,13 @@ struct mem_cgroup_stat_cpu { unsigned long targets[MEM_CGROUP_NTARGETS]; }; +struct mem_cgroup_reclaim_iter { + /* css_id of the last scanned hierarchy member */ + int position; + /* scan generation, increased every round-trip */ + unsigned int generation; +}; + /* * per-zone information in memory controller. 
*/ @@ -133,6 +140,8 @@ struct mem_cgroup_per_zone { struct list_head lists[NR_LRU_LISTS]; unsigned long count[NR_LRU_LISTS]; + struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; + struct zone_reclaim_stat reclaim_stat; struct rb_node tree_node; /* RB tree node */ unsigned long long usage_in_excess;/* Set to the value by which */ @@ -233,11 +242,6 @@ struct mem_cgroup { * per zone LRU lists. */ struct mem_cgroup_lru_info info; - /* - * While reclaiming in a hierarchy, we cache the last child we - * reclaimed from. - */ - int last_scanned_child; int last_scanned_node; #if MAX_NUMNODES > 1 nodemask_t scan_nodes; @@ -853,9 +857,16 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } -static struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, - struct mem_cgroup *prev, - bool reclaim) +struct mem_cgroup_reclaim_cookie { + struct zone *zone; + int priority; + unsigned int generation; +}; + +static struct mem_cgroup * +mem_cgroup_iter(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim) { struct mem_cgroup *memcg = NULL; int id = 0; @@ -876,10 +887,20 @@ static struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, } while (!memcg) { + struct mem_cgroup_reclaim_iter *uninitialized_var(iter); struct cgroup_subsys_state *css; - if (reclaim) - id = root->last_scanned_child; + if (reclaim) { + int nid = zone_to_nid(reclaim->zone); + int zid = zone_idx(reclaim->zone); + struct mem_cgroup_per_zone *mz; + + mz = mem_cgroup_zoneinfo(root, nid, zid); + iter = &mz->reclaim_iter[reclaim->priority]; + if (prev && reclaim->generation != iter->generation) + return NULL; + id = iter->position; + } rcu_read_lock(); css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); @@ -891,8 +912,13 @@ static struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, id = 0; rcu_read_unlock(); - if (reclaim) - root->last_scanned_child = id; + if (reclaim) { + iter->position = id; + if (!css) + iter->generation++; + else if (!prev && memcg) + reclaim->generation = iter->generation; + } if (prev && !css) return NULL; @@ -915,14 +941,14 @@ static void mem_cgroup_iter_break(struct mem_cgroup *root, * be used for reference counting. 
*/ #define for_each_mem_cgroup_tree(iter, root) \ - for (iter = mem_cgroup_iter(root, NULL, false); \ + for (iter = mem_cgroup_iter(root, NULL, NULL); \ iter != NULL; \ - iter = mem_cgroup_iter(root, iter, false)) + iter = mem_cgroup_iter(root, iter, NULL)) #define for_each_mem_cgroup(iter) \ - for (iter = mem_cgroup_iter(NULL, NULL, false); \ + for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ iter != NULL; \ - iter = mem_cgroup_iter(NULL, iter, false)) + iter = mem_cgroup_iter(NULL, iter, NULL)) static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { @@ -1692,6 +1718,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; unsigned long excess; unsigned long nr_scanned; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = 0, + }; excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; @@ -1700,7 +1730,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, noswap = true; while (1) { - victim = mem_cgroup_iter(root_memcg, victim, true); + victim = mem_cgroup_iter(root_memcg, victim, &reclaim); if (!victim) { loop++; /* @@ -5028,7 +5058,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); } - memcg->last_scanned_child = 0; memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); From 5660048ccac8735d9bc0a46325a02e6a6518b5b2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:17:59 -0800 Subject: [PATCH 22/96] mm: move memcg hierarchy reclaim to generic reclaim code Memory cgroup limit reclaim and traditional global pressure reclaim will soon share the same code to reclaim from a hierarchical tree of memory cgroups. In preparation of this, move the two right next to each other in shrink_zone(). The mem_cgroup_hierarchical_reclaim() polymath is split into a soft limit reclaim function, which still does hierarchy walking on its own, and a limit (shrinking) reclaim function, which relies on generic reclaim code to walk the hierarchy. Signed-off-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Reviewed-by: Kirill A. 
Shutemov Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Ying Han Cc: Greg Thelen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 24 ++++++ mm/memcontrol.c | 169 +++++++++++++++++++------------------ mm/vmscan.c | 43 +++++++++- 3 files changed, 148 insertions(+), 88 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3558a5e268cf..3b99dce85293 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -40,6 +40,12 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct mem_cgroup *mem_cont, int active, int file); +struct mem_cgroup_reclaim_cookie { + struct zone *zone; + int priority; + unsigned int generation; +}; + #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* * All "charge" functions with gfp_mask should use GFP_KERNEL or @@ -106,6 +112,11 @@ mem_cgroup_prepare_migration(struct page *page, extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok); +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, + struct mem_cgroup *, + struct mem_cgroup_reclaim_cookie *); +void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); + /* * For memory reclaim. */ @@ -281,6 +292,19 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, { } +static inline struct mem_cgroup * +mem_cgroup_iter(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim) +{ + return NULL; +} + +static inline void mem_cgroup_iter_break(struct mem_cgroup *root, + struct mem_cgroup *prev) +{ +} + static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg) { return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bec451da7def..750ed1449955 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -370,8 +370,6 @@ enum charge_type { #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) -#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 -#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) static void mem_cgroup_get(struct mem_cgroup *memcg); static void mem_cgroup_put(struct mem_cgroup *memcg); @@ -857,20 +855,33 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } -struct mem_cgroup_reclaim_cookie { - struct zone *zone; - int priority; - unsigned int generation; -}; - -static struct mem_cgroup * -mem_cgroup_iter(struct mem_cgroup *root, - struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim) +/** + * mem_cgroup_iter - iterate over memory cgroup hierarchy + * @root: hierarchy root + * @prev: previously returned memcg, NULL on first invocation + * @reclaim: cookie for shared reclaim walks, NULL for full walks + * + * Returns references to children of the hierarchy below @root, or + * @root itself, or %NULL after a full round-trip. + * + * Caller must pass the return value in @prev on subsequent + * invocations for reference counting, or use mem_cgroup_iter_break() + * to cancel a hierarchy walk before the round-trip is complete. + * + * Reclaimers can specify a zone and a priority level in @reclaim to + * divide up the memcgs in the hierarchy among all concurrent + * reclaimers operating on the same zone and priority. 
+ */ +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim) { struct mem_cgroup *memcg = NULL; int id = 0; + if (mem_cgroup_disabled()) + return NULL; + if (!root) root = root_mem_cgroup; @@ -926,8 +937,13 @@ mem_cgroup_iter(struct mem_cgroup *root, return memcg; } -static void mem_cgroup_iter_break(struct mem_cgroup *root, - struct mem_cgroup *prev) +/** + * mem_cgroup_iter_break - abort a hierarchy walk prematurely + * @root: hierarchy root + * @prev: last visited hierarchy member as returned by mem_cgroup_iter() + */ +void mem_cgroup_iter_break(struct mem_cgroup *root, + struct mem_cgroup *prev) { if (!root) root = root_mem_cgroup; @@ -1555,6 +1571,42 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) return min(limit, memsw); } +static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, + gfp_t gfp_mask, + unsigned long flags) +{ + unsigned long total = 0; + bool noswap = false; + int loop; + + if (flags & MEM_CGROUP_RECLAIM_NOSWAP) + noswap = true; + if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) + noswap = true; + + for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { + if (loop) + drain_all_stock_async(memcg); + total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); + /* + * Allow limit shrinkers, which are triggered directly + * by userspace, to catch signals and stop reclaim + * after minimal progress, regardless of the margin. + */ + if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) + break; + if (mem_cgroup_margin(memcg)) + break; + /* + * If nothing was reclaimed after two attempts, there + * may be no reclaimable pages in this hierarchy. + */ + if (loop && !total) + break; + } + return total; +} + /** * test_mem_cgroup_node_reclaimable * @mem: the target memcg @@ -1692,30 +1744,14 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) } #endif -/* - * Scan the hierarchy if needed to reclaim memory. We remember the last child - * we reclaimed from, so that we don't end up penalizing one child extensively - * based on its position in the children list. - * - * root_memcg is the original ancestor that we've been reclaim from. - * - * We give up and return to the caller when we visit root_memcg twice. - * (other groups can be removed while we're walking....) - * - * If shrink==true, for avoiding to free too much, this returns immedieately. - */ -static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, - struct zone *zone, - gfp_t gfp_mask, - unsigned long reclaim_options, - unsigned long *total_scanned) +static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, + struct zone *zone, + gfp_t gfp_mask, + unsigned long *total_scanned) { struct mem_cgroup *victim = NULL; - int ret, total = 0; + int total = 0; int loop = 0; - bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; - bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; - bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; unsigned long excess; unsigned long nr_scanned; struct mem_cgroup_reclaim_cookie reclaim = { @@ -1725,29 +1761,17 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; - /* If memsw_is_minimum==1, swap-out is of-no-use. 
*/ - if (!check_soft && !shrink && root_memcg->memsw_is_minimum) - noswap = true; - while (1) { victim = mem_cgroup_iter(root_memcg, victim, &reclaim); if (!victim) { loop++; - /* - * We are not draining per cpu cached charges during - * soft limit reclaim because global reclaim doesn't - * care about charges. It tries to free some memory and - * charges will not give any. - */ - if (!check_soft && loop >= 1) - drain_all_stock_async(root_memcg); if (loop >= 2) { /* * If we have not been able to reclaim * anything, it might because there are * no reclaimable pages under this hierarchy */ - if (!check_soft || !total) + if (!total) break; /* * We want to do more targeted reclaim. @@ -1761,30 +1785,12 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, } continue; } - if (!mem_cgroup_reclaimable(victim, noswap)) { - /* this cgroup's local usage == 0 */ + if (!mem_cgroup_reclaimable(victim, false)) continue; - } - /* we use swappiness of local cgroup */ - if (check_soft) { - ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, zone, &nr_scanned); - *total_scanned += nr_scanned; - } else - ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap); - total += ret; - /* - * At shrinking usage, we can't check we should stop here or - * reclaim more. It's depends on callers. last_scanned_child - * will work enough for keeping fairness under tree. - */ - if (shrink) - break; - if (check_soft) { - if (!res_counter_soft_limit_excess(&root_memcg->res)) - break; - } else if (mem_cgroup_margin(root_memcg)) + total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, + zone, &nr_scanned); + *total_scanned += nr_scanned; + if (!res_counter_soft_limit_excess(&root_memcg->res)) break; } mem_cgroup_iter_break(root_memcg, victim); @@ -2281,8 +2287,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; - ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags, NULL); + ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; /* @@ -3559,9 +3564,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, if (!ret) break; - mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, - MEM_CGROUP_RECLAIM_SHRINK, - NULL); + mem_cgroup_reclaim(memcg, GFP_KERNEL, + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -3619,10 +3623,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, if (!ret) break; - mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, - MEM_CGROUP_RECLAIM_NOSWAP | - MEM_CGROUP_RECLAIM_SHRINK, - NULL); + mem_cgroup_reclaim(memcg, GFP_KERNEL, + MEM_CGROUP_RECLAIM_NOSWAP | + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? 
*/ if (curusage >= oldusage) @@ -3665,10 +3668,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; nr_scanned = 0; - reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, - gfp_mask, - MEM_CGROUP_RECLAIM_SOFT, - &nr_scanned); + reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, + gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; spin_lock(&mctz->lock); diff --git a/mm/vmscan.c b/mm/vmscan.c index e0627d07c3ac..136c7eb0ad88 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2104,12 +2104,43 @@ restart: static void shrink_zone(int priority, struct zone *zone, struct scan_control *sc) { - struct mem_cgroup_zone mz = { - .mem_cgroup = sc->target_mem_cgroup, + struct mem_cgroup *root = sc->target_mem_cgroup; + struct mem_cgroup_reclaim_cookie reclaim = { .zone = zone, + .priority = priority, }; + struct mem_cgroup *memcg; - shrink_mem_cgroup_zone(priority, &mz, sc); + if (global_reclaim(sc)) { + struct mem_cgroup_zone mz = { + .mem_cgroup = NULL, + .zone = zone, + }; + + shrink_mem_cgroup_zone(priority, &mz, sc); + return; + } + + memcg = mem_cgroup_iter(root, NULL, &reclaim); + do { + struct mem_cgroup_zone mz = { + .mem_cgroup = memcg, + .zone = zone, + }; + + shrink_mem_cgroup_zone(priority, &mz, sc); + /* + * Limit reclaim has historically picked one memcg and + * scanned it with decreasing priority levels until + * nr_to_reclaim had been reclaimed. This priority + * cycle is thus over after a single memcg. + */ + if (!global_reclaim(sc)) { + mem_cgroup_iter_break(root, memcg); + break; + } + memcg = mem_cgroup_iter(root, memcg, &reclaim); + } while (memcg); } /* @@ -2374,6 +2405,10 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .order = 0, .target_mem_cgroup = mem, }; + struct mem_cgroup_zone mz = { + .mem_cgroup = mem, + .zone = zone, + }; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2389,7 +2424,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_zone(0, zone, &sc); + shrink_mem_cgroup_zone(0, &mz, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); From ad2b8e601099a23dffffb53f91c18d874fe98854 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:02 -0800 Subject: [PATCH 23/96] mm: memcg: remove optimization of keeping the root_mem_cgroup LRU lists empty root_mem_cgroup, lacking a configurable limit, was never subject to limit reclaim, so the pages charged to it could be kept off its LRU lists. They would be found on the global per-zone LRU lists upon physical memory pressure and it made sense to avoid uselessly linking them to both lists. The global per-zone LRU lists are about to go away on memcg-enabled kernels, with all pages being exclusively linked to their respective per-memcg LRU lists. As a result, pages of the root_mem_cgroup must also be linked to its LRU lists again. This is purely about the LRU list, root_mem_cgroup is still not charged. The overhead is temporary until the double-LRU scheme is going away completely. Signed-off-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Reviewed-by: Kirill A. 
Shutemov Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Ying Han Cc: Greg Thelen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 750ed1449955..ad7f36f676ff 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1031,8 +1031,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); /* huge page split is done under lru_lock. so, we have no races. */ MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; VM_BUG_ON(list_empty(&pc->lru)); list_del_init(&pc->lru); } @@ -1057,13 +1055,11 @@ void mem_cgroup_rotate_reclaimable_page(struct page *page) return; pc = lookup_page_cgroup(page); - /* unused or root page is not rotated. */ + /* unused page is not rotated. */ if (!PageCgroupUsed(pc)) return; /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ smp_rmb(); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); list_move_tail(&pc->lru, &mz->lists[lru]); } @@ -1077,13 +1073,11 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); - /* unused or root page is not rotated. */ + /* unused page is not rotated. */ if (!PageCgroupUsed(pc)) return; /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ smp_rmb(); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); list_move(&pc->lru, &mz->lists[lru]); } @@ -1115,8 +1109,6 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) /* huge page split is done under lru_lock. so, we have no races. */ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); SetPageCgroupAcctLRU(pc); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; list_add(&pc->lru, &mz->lists[lru]); } From b95a2f2d486d0d768a92879c023a03757b9c7e58 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:06 -0800 Subject: [PATCH 24/96] mm: vmscan: convert global reclaim to per-memcg LRU lists The global per-zone LRU lists are about to go away on memcg-enabled kernels, global reclaim must be able to find its pages on the per-memcg LRU lists. Since the LRU pages of a zone are distributed over all existing memory cgroups, a scan target for a zone is complete when all memory cgroups are scanned for their proportional share of a zone's memory. The forced scanning of small scan targets from kswapd is limited to zones marked unreclaimable, otherwise kswapd can quickly overreclaim by force-scanning the LRU lists of multiple memory cgroups. Signed-off-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Reviewed-by: Kirill A. 
Shutemov Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Ying Han Cc: Greg Thelen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 136c7eb0ad88..024168cfdcb0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1886,7 +1886,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, * latencies, so it's better to scan a minimum amount there as * well. */ - if (current_is_kswapd()) + if (current_is_kswapd() && mz->zone->all_unreclaimable) force_scan = true; if (!global_reclaim(sc)) force_scan = true; @@ -2111,16 +2111,6 @@ static void shrink_zone(int priority, struct zone *zone, }; struct mem_cgroup *memcg; - if (global_reclaim(sc)) { - struct mem_cgroup_zone mz = { - .mem_cgroup = NULL, - .zone = zone, - }; - - shrink_mem_cgroup_zone(priority, &mz, sc); - return; - } - memcg = mem_cgroup_iter(root, NULL, &reclaim); do { struct mem_cgroup_zone mz = { @@ -2134,6 +2124,10 @@ static void shrink_zone(int priority, struct zone *zone, * scanned it with decreasing priority levels until * nr_to_reclaim had been reclaimed. This priority * cycle is thus over after a single memcg. + * + * Direct reclaim and kswapd, on the other hand, have + * to scan all memory cgroups to fulfill the overall + * scan target for the zone. */ if (!global_reclaim(sc)) { mem_cgroup_iter_break(root, memcg); @@ -2478,13 +2472,24 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, static void age_active_anon(struct zone *zone, struct scan_control *sc, int priority) { - struct mem_cgroup_zone mz = { - .mem_cgroup = NULL, - .zone = zone, - }; + struct mem_cgroup *memcg; - if (inactive_anon_is_low(&mz)) - shrink_active_list(SWAP_CLUSTER_MAX, &mz, sc, priority, 0); + if (!total_swap_pages) + return; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct mem_cgroup_zone mz = { + .mem_cgroup = memcg, + .zone = zone, + }; + + if (inactive_anon_is_low(&mz)) + shrink_active_list(SWAP_CLUSTER_MAX, &mz, + sc, priority, 0); + + memcg = mem_cgroup_iter(NULL, memcg, NULL); + } while (memcg); } /* From 6290df545814990ca2663baf6e894669132d5f73 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:10 -0800 Subject: [PATCH 25/96] mm: collect LRU list heads into struct lruvec Having a unified structure with a LRU list set for both global zones and per-memcg zones allows to keep that code simple which deals with LRU lists and does not care about the container itself. Once the per-memcg LRU lists directly link struct pages, the isolation function and all other list manipulations are shared between the memcg case and the global LRU case. Signed-off-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Reviewed-by: Kirill A. 
Shutemov Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Ying Han Cc: Greg Thelen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 2 +- include/linux/mmzone.h | 10 ++++++---- mm/memcontrol.c | 17 +++++++---------- mm/page_alloc.c | 2 +- mm/swap.c | 11 +++++------ mm/vmscan.c | 10 +++++----- 6 files changed, 25 insertions(+), 27 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8f7d24712dc1..e6a7ffe16d31 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -33,7 +33,7 @@ __add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, static inline void add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) { - __add_page_to_lru_list(zone, page, l, &zone->lru[l].list); + __add_page_to_lru_list(zone, page, l, &zone->lruvec.lists[l]); } static inline void diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ca6ca92418a6..42e544cd4c9f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -159,6 +159,10 @@ static inline int is_unevictable_lru(enum lru_list l) return (l == LRU_UNEVICTABLE); } +struct lruvec { + struct list_head lists[NR_LRU_LISTS]; +}; + /* Mask used at gathering information at once (see memcontrol.c) */ #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) @@ -364,10 +368,8 @@ struct zone { ZONE_PADDING(_pad1_) /* Fields commonly accessed by the page reclaim scanner */ - spinlock_t lru_lock; - struct zone_lru { - struct list_head list; - } lru[NR_LRU_LISTS]; + spinlock_t lru_lock; + struct lruvec lruvec; struct zone_reclaim_stat reclaim_stat; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ad7f36f676ff..6e7f849a1a9e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -134,10 +134,7 @@ struct mem_cgroup_reclaim_iter { * per-zone information in memory controller. */ struct mem_cgroup_per_zone { - /* - * spin_lock to protect the per cgroup LRU - */ - struct list_head lists[NR_LRU_LISTS]; + struct lruvec lruvec; unsigned long count[NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; @@ -1061,7 +1058,7 @@ void mem_cgroup_rotate_reclaimable_page(struct page *page) /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ smp_rmb(); mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - list_move_tail(&pc->lru, &mz->lists[lru]); + list_move_tail(&pc->lru, &mz->lruvec.lists[lru]); } void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) @@ -1079,7 +1076,7 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ smp_rmb(); mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - list_move(&pc->lru, &mz->lists[lru]); + list_move(&pc->lru, &mz->lruvec.lists[lru]); } void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) @@ -1109,7 +1106,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) /* huge page split is done under lru_lock. so, we have no races. 
*/ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); SetPageCgroupAcctLRU(pc); - list_add(&pc->lru, &mz->lists[lru]); + list_add(&pc->lru, &mz->lruvec.lists[lru]); } /* @@ -1307,7 +1304,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, BUG_ON(!mem_cont); mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); - src = &mz->lists[lru]; + src = &mz->lruvec.lists[lru]; scan = 0; list_for_each_entry_safe_reverse(pc, tmp, src, lru) { @@ -3738,7 +3735,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, zone = &NODE_DATA(node)->node_zones[zid]; mz = mem_cgroup_zoneinfo(memcg, node, zid); - list = &mz->lists[lru]; + list = &mz->lruvec.lists[lru]; loop = MEM_CGROUP_ZSTAT(mz, lru); /* give some margin against EBUSY etc...*/ @@ -4864,7 +4861,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) - INIT_LIST_HEAD(&mz->lists[l]); + INIT_LIST_HEAD(&mz->lruvec.lists[l]); mz->usage_in_excess = 0; mz->on_tree = false; mz->mem = memcg; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 794e6715c226..25c248eb7d5f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4288,7 +4288,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(l) - INIT_LIST_HEAD(&zone->lru[l].list); + INIT_LIST_HEAD(&zone->lruvec.lists[l]); zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; zone->reclaim_stat.recent_scanned[0] = 0; diff --git a/mm/swap.c b/mm/swap.c index 67a09a633a09..76ef79d3857c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -236,7 +236,7 @@ static void pagevec_move_tail_fn(struct page *page, void *arg) if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); - list_move_tail(&page->lru, &zone->lru[lru].list); + list_move_tail(&page->lru, &zone->lruvec.lists[lru]); mem_cgroup_rotate_reclaimable_page(page); (*pgmoved)++; } @@ -480,7 +480,7 @@ static void lru_deactivate_fn(struct page *page, void *arg) * The page's writeback ends up during pagevec * We moves tha page into tail of inactive. 
*/ - list_move_tail(&page->lru, &zone->lru[lru].list); + list_move_tail(&page->lru, &zone->lruvec.lists[lru]); mem_cgroup_rotate_reclaimable_page(page); __count_vm_event(PGROTATED); } @@ -654,7 +654,6 @@ void lru_add_page_tail(struct zone* zone, int active; enum lru_list lru; const int file = 0; - struct list_head *head; VM_BUG_ON(!PageHead(page)); VM_BUG_ON(PageCompound(page_tail)); @@ -674,10 +673,10 @@ void lru_add_page_tail(struct zone* zone, } update_page_reclaim_stat(zone, page_tail, file, active); if (likely(PageLRU(page))) - head = page->lru.prev; + __add_page_to_lru_list(zone, page_tail, lru, + page->lru.prev); else - head = &zone->lru[lru].list; - __add_page_to_lru_list(zone, page_tail, lru, head); + add_page_to_lru_list(zone, page_tail, lru); } else { SetPageUnevictable(page_tail); add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); diff --git a/mm/vmscan.c b/mm/vmscan.c index 024168cfdcb0..93cdc44a1693 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1250,8 +1250,8 @@ static unsigned long isolate_pages_global(unsigned long nr, lru += LRU_ACTIVE; if (file) lru += LRU_FILE; - return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, - mode, file); + return isolate_lru_pages(nr, &z->lruvec.lists[lru], dst, + scanned, order, mode, file); } /* @@ -1630,7 +1630,7 @@ static void move_active_pages_to_lru(struct zone *zone, VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - list_move(&page->lru, &zone->lru[lru].list); + list_move(&page->lru, &zone->lruvec.lists[lru]); mem_cgroup_add_lru_list(page, lru); pgmoved += hpage_nr_pages(page); @@ -3448,7 +3448,7 @@ retry: enum lru_list l = page_lru_base_type(page); __dec_zone_state(zone, NR_UNEVICTABLE); - list_move(&page->lru, &zone->lru[l].list); + list_move(&page->lru, &zone->lruvec.lists[l]); mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); __inc_zone_state(zone, NR_INACTIVE_ANON + l); __count_vm_event(UNEVICTABLE_PGRESCUED); @@ -3457,7 +3457,7 @@ retry: * rotate unevictable list */ SetPageUnevictable(page); - list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); + list_move(&page->lru, &zone->lruvec.lists[LRU_UNEVICTABLE]); mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); if (page_evictable(page, NULL)) goto retry; From 925b7673cce39116ce61e7a06683a4a0dad1e72a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:15 -0800 Subject: [PATCH 26/96] mm: make per-memcg LRU lists exclusive Now that all code that operated on global per-zone LRU lists is converted to operate on per-memory cgroup LRU lists instead, there is no reason to keep the double-LRU scheme around any longer. The pc->lru member is removed and page->lru is linked directly to the per-memory cgroup LRU lists, which removes two pointers from a descriptor that exists for every page frame in the system. Signed-off-by: Johannes Weiner Signed-off-by: Hugh Dickins Signed-off-by: Ying Han Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Reviewed-by: Kirill A. 
Shutemov Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Greg Thelen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 63 ++++--- include/linux/mm_inline.h | 21 +-- include/linux/page_cgroup.h | 1 - mm/memcontrol.c | 319 ++++++++++++++++++------------------ mm/page_cgroup.c | 1 - mm/swap.c | 23 ++- mm/vmscan.c | 64 ++++---- 7 files changed, 235 insertions(+), 257 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3b99dce85293..e2f8e7caf04b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -32,14 +32,6 @@ enum mem_cgroup_page_stat_item { MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ }; -extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, - struct list_head *dst, - unsigned long *scanned, int order, - isolate_mode_t mode, - struct zone *z, - struct mem_cgroup *mem_cont, - int active, int file); - struct mem_cgroup_reclaim_cookie { struct zone *zone; int priority; @@ -69,13 +61,14 @@ extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr); extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); -extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru); -extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru); -extern void mem_cgroup_rotate_reclaimable_page(struct page *page); -extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru); -extern void mem_cgroup_del_lru(struct page *page); -extern void mem_cgroup_move_lists(struct page *page, - enum lru_list from, enum lru_list to); + +struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); +struct lruvec *mem_cgroup_lru_add_list(struct zone *, struct page *, + enum lru_list); +void mem_cgroup_lru_del_list(struct page *, enum lru_list); +void mem_cgroup_lru_del(struct page *); +struct lruvec *mem_cgroup_lru_move_lists(struct zone *, struct page *, + enum lru_list, enum lru_list); /* For coalescing uncharge for reducing memcg' overhead*/ extern void mem_cgroup_uncharge_start(void); @@ -223,33 +216,33 @@ static inline void mem_cgroup_uncharge_cache_page(struct page *page) { } -static inline void mem_cgroup_add_lru_list(struct page *page, int lru) +static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, + struct mem_cgroup *memcg) +{ + return &zone->lruvec; +} + +static inline struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, + struct page *page, + enum lru_list lru) +{ + return &zone->lruvec; +} + +static inline void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) { } -static inline void mem_cgroup_del_lru_list(struct page *page, int lru) +static inline void mem_cgroup_lru_del(struct page *page) { - return ; } -static inline void mem_cgroup_rotate_reclaimable_page(struct page *page) -{ - return ; -} - -static inline void mem_cgroup_rotate_lru_list(struct page *page, int lru) -{ - return ; -} - -static inline void mem_cgroup_del_lru(struct page *page) -{ - return ; -} - -static inline void -mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to) +static inline struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, + struct page *page, + enum lru_list from, + enum lru_list to) { + return &zone->lruvec; } static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 
e6a7ffe16d31..4e3478e71926 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -21,27 +21,22 @@ static inline int page_is_file_cache(struct page *page) return !PageSwapBacked(page); } -static inline void -__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, - struct list_head *head) -{ - list_add(&page->lru, head); - __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); - mem_cgroup_add_lru_list(page, l); -} - static inline void add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) { - __add_page_to_lru_list(zone, page, l, &zone->lruvec.lists[l]); + struct lruvec *lruvec; + + lruvec = mem_cgroup_lru_add_list(zone, page, l); + list_add(&page->lru, &lruvec->lists[l]); + __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); } static inline void del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { + mem_cgroup_lru_del_list(page, l); list_del(&page->lru); __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); - mem_cgroup_del_lru_list(page, l); } /** @@ -64,7 +59,6 @@ del_page_from_lru(struct zone *zone, struct page *page) { enum lru_list l; - list_del(&page->lru); if (PageUnevictable(page)) { __ClearPageUnevictable(page); l = LRU_UNEVICTABLE; @@ -75,8 +69,9 @@ del_page_from_lru(struct zone *zone, struct page *page) l += LRU_ACTIVE; } } + mem_cgroup_lru_del_list(page, l); + list_del(&page->lru); __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); - mem_cgroup_del_lru_list(page, l); } /** diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 961ecc7d30bc..5bae7535c202 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -31,7 +31,6 @@ enum { struct page_cgroup { unsigned long flags; struct mem_cgroup *mem_cgroup; - struct list_head lru; /* per cgroup LRU list */ }; void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6e7f849a1a9e..972878b648c2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -995,6 +995,27 @@ out: } EXPORT_SYMBOL(mem_cgroup_count_vm_event); +/** + * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg + * @zone: zone of the wanted lruvec + * @mem: memcg of the wanted lruvec + * + * Returns the lru list vector holding pages for the given @zone and + * @mem. This can be the global zone lruvec, if the memory controller + * is disabled. + */ +struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, + struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_zone *mz; + + if (mem_cgroup_disabled()) + return &zone->lruvec; + + mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); + return &mz->lruvec; +} + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -1009,84 +1030,29 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event); * When moving account, the page is not on LRU. It's isolated. */ -void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) -{ - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - - if (mem_cgroup_disabled()) - return; - pc = lookup_page_cgroup(page); - /* can happen while we handle swapcache. */ - if (!TestClearPageCgroupAcctLRU(pc)) - return; - VM_BUG_ON(!pc->mem_cgroup); - /* - * We don't check PCG_USED bit. It's cleared when the "page" is finally - * removed from global LRU. 
- */ - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - /* huge page split is done under lru_lock. so, we have no races. */ - MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); - VM_BUG_ON(list_empty(&pc->lru)); - list_del_init(&pc->lru); -} - -void mem_cgroup_del_lru(struct page *page) -{ - mem_cgroup_del_lru_list(page, page_lru(page)); -} - -/* - * Writeback is about to end against a page which has been marked for immediate - * reclaim. If it still appears to be reclaimable, move it to the tail of the - * inactive list. +/** + * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec + * @zone: zone of the page + * @page: the page + * @lru: current lru + * + * This function accounts for @page being added to @lru, and returns + * the lruvec for the given @zone and the memcg @page is charged to. + * + * The callsite is then responsible for physically linking the page to + * the returned lruvec->lists[@lru]. */ -void mem_cgroup_rotate_reclaimable_page(struct page *page) -{ - struct mem_cgroup_per_zone *mz; - struct page_cgroup *pc; - enum lru_list lru = page_lru(page); - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(page); - /* unused page is not rotated. */ - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - list_move_tail(&pc->lru, &mz->lruvec.lists[lru]); -} - -void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) +struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, + enum lru_list lru) { struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; struct page_cgroup *pc; if (mem_cgroup_disabled()) - return; + return &zone->lruvec; pc = lookup_page_cgroup(page); - /* unused page is not rotated. */ - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - list_move(&pc->lru, &mz->lruvec.lists[lru]); -} - -void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) -{ - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - - if (mem_cgroup_disabled()) - return; - pc = lookup_page_cgroup(page); VM_BUG_ON(PageCgroupAcctLRU(pc)); /* * putback: charge: @@ -1098,15 +1064,89 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) * LRU during a race. */ smp_mb(); - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - /* huge page split is done under lru_lock. so, we have no races. */ + /* + * If the page is uncharged, it may be freed soon, but it + * could also be swap cache (readahead, swapoff) that needs to + * be reclaimable in the future. root_mem_cgroup will babysit + * it for the time being. + */ + if (PageCgroupUsed(pc)) { + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); + memcg = pc->mem_cgroup; + SetPageCgroupAcctLRU(pc); + } else + memcg = root_mem_cgroup; + mz = page_cgroup_zoneinfo(memcg, page); + /* compound_order() is stabilized through lru_lock */ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); - SetPageCgroupAcctLRU(pc); - list_add(&pc->lru, &mz->lruvec.lists[lru]); + return &mz->lruvec; +} + +/** + * mem_cgroup_lru_del_list - account for removing an lru page + * @page: the page + * @lru: target lru + * + * This function accounts for @page being removed from @lru. 
+ * + * The callsite is then responsible for physically unlinking + * @page->lru. + */ +void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) +{ + struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; + struct page_cgroup *pc; + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(page); + /* + * root_mem_cgroup babysits uncharged LRU pages, but + * PageCgroupUsed is cleared when the page is about to get + * freed. PageCgroupAcctLRU remembers whether the + * LRU-accounting happened against pc->mem_cgroup or + * root_mem_cgroup. + */ + if (TestClearPageCgroupAcctLRU(pc)) { + VM_BUG_ON(!pc->mem_cgroup); + memcg = pc->mem_cgroup; + } else + memcg = root_mem_cgroup; + mz = page_cgroup_zoneinfo(memcg, page); + /* huge page split is done under lru_lock. so, we have no races. */ + MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); +} + +void mem_cgroup_lru_del(struct page *page) +{ + mem_cgroup_lru_del_list(page, page_lru(page)); +} + +/** + * mem_cgroup_lru_move_lists - account for moving a page between lrus + * @zone: zone of the page + * @page: the page + * @from: current lru + * @to: target lru + * + * This function accounts for @page being moved between the lrus @from + * and @to, and returns the lruvec for the given @zone and the memcg + * @page is charged to. + * + * The callsite is then responsible for physically relinking + * @page->lru to the returned lruvec->lists[@to]. + */ +struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, + struct page *page, + enum lru_list from, + enum lru_list to) +{ + /* XXX: Optimize this, especially for @from == @to */ + mem_cgroup_lru_del_list(page, from); + return mem_cgroup_lru_add_list(zone, page, to); } /* @@ -1117,6 +1157,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) */ static void mem_cgroup_lru_del_before_commit(struct page *page) { + enum lru_list lru; unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); @@ -1133,17 +1174,28 @@ static void mem_cgroup_lru_del_before_commit(struct page *page) return; spin_lock_irqsave(&zone->lru_lock, flags); + lru = page_lru(page); /* - * Forget old LRU when this page_cgroup is *not* used. This Used bit - * is guarded by lock_page() because the page is SwapCache. + * The uncharged page could still be registered to the LRU of + * the stale pc->mem_cgroup. + * + * As pc->mem_cgroup is about to get overwritten, the old LRU + * accounting needs to be taken care of. Let root_mem_cgroup + * babysit the page until the new memcg is responsible for it. + * + * The PCG_USED bit is guarded by lock_page() as the page is + * swapcache/pagecache. 
*/ - if (!PageCgroupUsed(pc)) - mem_cgroup_del_lru_list(page, page_lru(page)); + if (PageLRU(page) && PageCgroupAcctLRU(pc) && !PageCgroupUsed(pc)) { + del_page_from_lru_list(zone, page, lru); + add_page_to_lru_list(zone, page, lru); + } spin_unlock_irqrestore(&zone->lru_lock, flags); } static void mem_cgroup_lru_add_after_commit(struct page *page) { + enum lru_list lru; unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); @@ -1161,22 +1213,22 @@ static void mem_cgroup_lru_add_after_commit(struct page *page) if (likely(!PageLRU(page))) return; spin_lock_irqsave(&zone->lru_lock, flags); - /* link when the page is linked to LRU but page_cgroup isn't */ - if (PageLRU(page) && !PageCgroupAcctLRU(pc)) - mem_cgroup_add_lru_list(page, page_lru(page)); + lru = page_lru(page); + /* + * If the page is not on the LRU, someone will soon put it + * there. If it is, and also already accounted for on the + * memcg-side, it must be on the right lruvec as setting + * pc->mem_cgroup and PageCgroupUsed is properly ordered. + * Otherwise, root_mem_cgroup has been babysitting the page + * during the charge. Move it to the new memcg now. + */ + if (PageLRU(page) && !PageCgroupAcctLRU(pc)) { + del_page_from_lru_list(zone, page, lru); + add_page_to_lru_list(zone, page, lru); + } spin_unlock_irqrestore(&zone->lru_lock, flags); } - -void mem_cgroup_move_lists(struct page *page, - enum lru_list from, enum lru_list to) -{ - if (mem_cgroup_disabled()) - return; - mem_cgroup_del_lru_list(page, from); - mem_cgroup_add_lru_list(page, to); -} - /* * Checks whether given mem is same or in the root_mem_cgroup's * hierarchy subtree @@ -1282,68 +1334,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return &mz->reclaim_stat; } -unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, - struct list_head *dst, - unsigned long *scanned, int order, - isolate_mode_t mode, - struct zone *z, - struct mem_cgroup *mem_cont, - int active, int file) -{ - unsigned long nr_taken = 0; - struct page *page; - unsigned long scan; - LIST_HEAD(pc_list); - struct list_head *src; - struct page_cgroup *pc, *tmp; - int nid = zone_to_nid(z); - int zid = zone_idx(z); - struct mem_cgroup_per_zone *mz; - int lru = LRU_FILE * file + active; - int ret; - - BUG_ON(!mem_cont); - mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); - src = &mz->lruvec.lists[lru]; - - scan = 0; - list_for_each_entry_safe_reverse(pc, tmp, src, lru) { - if (scan >= nr_to_scan) - break; - - if (unlikely(!PageCgroupUsed(pc))) - continue; - - page = lookup_cgroup_page(pc); - - if (unlikely(!PageLRU(page))) - continue; - - scan++; - ret = __isolate_lru_page(page, mode, file); - switch (ret) { - case 0: - list_move(&page->lru, dst); - mem_cgroup_del_lru(page); - nr_taken += hpage_nr_pages(page); - break; - case -EBUSY: - /* we don't affect global LRU but rotate in our LRU */ - mem_cgroup_rotate_lru_list(page, page_lru(page)); - break; - default: - break; - } - } - - *scanned = scan; - - trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, - 0, 0, 0, mode); - - return nr_taken; -} - #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -3726,11 +3716,11 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, int node, int zid, enum lru_list lru) { - struct zone *zone; struct mem_cgroup_per_zone *mz; - struct page_cgroup *pc, *busy; unsigned long flags, loop; struct 
list_head *list; + struct page *busy; + struct zone *zone; int ret = 0; zone = &NODE_DATA(node)->node_zones[zid]; @@ -3742,6 +3732,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, loop += 256; busy = NULL; while (loop--) { + struct page_cgroup *pc; struct page *page; ret = 0; @@ -3750,16 +3741,16 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, spin_unlock_irqrestore(&zone->lru_lock, flags); break; } - pc = list_entry(list->prev, struct page_cgroup, lru); - if (busy == pc) { - list_move(&pc->lru, list); + page = list_entry(list->prev, struct page, lru); + if (busy == page) { + list_move(&page->lru, list); busy = NULL; spin_unlock_irqrestore(&zone->lru_lock, flags); continue; } spin_unlock_irqrestore(&zone->lru_lock, flags); - page = lookup_cgroup_page(pc); + pc = lookup_page_cgroup(page); ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); if (ret == -ENOMEM) @@ -3767,7 +3758,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, if (ret == -EBUSY || ret == -EINVAL) { /* found lock contention or "pc" is obsolete. */ - busy = pc; + busy = page; cond_resched(); } else busy = NULL; diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 2d123f94a8df..f59405a8d752 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -16,7 +16,6 @@ static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id) pc->flags = 0; set_page_cgroup_array_id(pc, id); pc->mem_cgroup = NULL; - INIT_LIST_HEAD(&pc->lru); } static unsigned long total_usage; diff --git a/mm/swap.c b/mm/swap.c index 76ef79d3857c..126da2919f60 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -232,12 +232,14 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, static void pagevec_move_tail_fn(struct page *page, void *arg) { int *pgmoved = arg; - struct zone *zone = page_zone(page); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); - list_move_tail(&page->lru, &zone->lruvec.lists[lru]); - mem_cgroup_rotate_reclaimable_page(page); + struct lruvec *lruvec; + + lruvec = mem_cgroup_lru_move_lists(page_zone(page), + page, lru, lru); + list_move_tail(&page->lru, &lruvec->lists[lru]); (*pgmoved)++; } } @@ -476,12 +478,13 @@ static void lru_deactivate_fn(struct page *page, void *arg) */ SetPageReclaim(page); } else { + struct lruvec *lruvec; /* * The page's writeback ends up during pagevec * We moves tha page into tail of inactive. 
*/ - list_move_tail(&page->lru, &zone->lruvec.lists[lru]); - mem_cgroup_rotate_reclaimable_page(page); + lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru); + list_move_tail(&page->lru, &lruvec->lists[lru]); __count_vm_event(PGROTATED); } @@ -663,6 +666,8 @@ void lru_add_page_tail(struct zone* zone, SetPageLRU(page_tail); if (page_evictable(page_tail, NULL)) { + struct lruvec *lruvec; + if (PageActive(page)) { SetPageActive(page_tail); active = 1; @@ -672,11 +677,13 @@ void lru_add_page_tail(struct zone* zone, lru = LRU_INACTIVE_ANON; } update_page_reclaim_stat(zone, page_tail, file, active); + lruvec = mem_cgroup_lru_add_list(zone, page_tail, lru); if (likely(PageLRU(page))) - __add_page_to_lru_list(zone, page_tail, lru, - page->lru.prev); + list_add(&page_tail->lru, page->lru.prev); else - add_page_to_lru_list(zone, page_tail, lru); + list_add(&page_tail->lru, &lruvec->lists[lru]); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, + hpage_nr_pages(page_tail)); } else { SetPageUnevictable(page_tail); add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); diff --git a/mm/vmscan.c b/mm/vmscan.c index 93cdc44a1693..813aae820a27 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1139,15 +1139,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode, file)) { case 0: + mem_cgroup_lru_del(page); list_move(&page->lru, dst); - mem_cgroup_del_lru(page); nr_taken += hpage_nr_pages(page); break; case -EBUSY: /* else it is being freed elsewhere */ list_move(&page->lru, src); - mem_cgroup_rotate_lru_list(page, page_lru(page)); continue; default: @@ -1197,8 +1196,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, break; if (__isolate_lru_page(cursor_page, mode, file) == 0) { + mem_cgroup_lru_del(cursor_page); list_move(&cursor_page->lru, dst); - mem_cgroup_del_lru(cursor_page); nr_taken += hpage_nr_pages(cursor_page); nr_lumpy_taken++; if (PageDirty(cursor_page)) @@ -1239,18 +1238,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, return nr_taken; } -static unsigned long isolate_pages_global(unsigned long nr, - struct list_head *dst, - unsigned long *scanned, int order, - isolate_mode_t mode, - struct zone *z, int active, int file) +static unsigned long isolate_pages(unsigned long nr, struct mem_cgroup_zone *mz, + struct list_head *dst, + unsigned long *scanned, int order, + isolate_mode_t mode, int active, int file) { + struct lruvec *lruvec; int lru = LRU_BASE; + + lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup); if (active) lru += LRU_ACTIVE; if (file) lru += LRU_FILE; - return isolate_lru_pages(nr, &z->lruvec.lists[lru], dst, + return isolate_lru_pages(nr, &lruvec->lists[lru], dst, scanned, order, mode, file); } @@ -1518,14 +1519,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, spin_lock_irq(&zone->lru_lock); - if (scanning_global_lru(mz)) { - nr_taken = isolate_pages_global(nr_to_scan, &page_list, - &nr_scanned, sc->order, reclaim_mode, zone, 0, file); - } else { - nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, - &nr_scanned, sc->order, reclaim_mode, zone, - mz->mem_cgroup, 0, file); - } + nr_taken = isolate_pages(nr_to_scan, mz, &page_list, + &nr_scanned, sc->order, + reclaim_mode, 0, file); if (global_reclaim(sc)) { zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1625,13 +1621,15 @@ static void move_active_pages_to_lru(struct zone *zone, pagevec_init(&pvec, 1); while (!list_empty(list)) { + struct lruvec *lruvec; + page = 
lru_to_page(list); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - list_move(&page->lru, &zone->lruvec.lists[lru]); - mem_cgroup_add_lru_list(page, lru); + lruvec = mem_cgroup_lru_add_list(zone, page, lru); + list_move(&page->lru, &lruvec->lists[lru]); pgmoved += hpage_nr_pages(page); if (!pagevec_add(&pvec, page) || list_empty(list)) { @@ -1672,17 +1670,10 @@ static void shrink_active_list(unsigned long nr_pages, reclaim_mode |= ISOLATE_CLEAN; spin_lock_irq(&zone->lru_lock); - if (scanning_global_lru(mz)) { - nr_taken = isolate_pages_global(nr_pages, &l_hold, - &pgscanned, sc->order, - reclaim_mode, zone, - 1, file); - } else { - nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, - &pgscanned, sc->order, - reclaim_mode, zone, - mz->mem_cgroup, 1, file); - } + + nr_taken = isolate_pages(nr_pages, mz, &l_hold, + &pgscanned, sc->order, + reclaim_mode, 1, file); if (global_reclaim(sc)) zone->pages_scanned += pgscanned; @@ -3440,16 +3431,18 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) */ static void check_move_unevictable_page(struct page *page, struct zone *zone) { - VM_BUG_ON(PageActive(page)); + struct lruvec *lruvec; + VM_BUG_ON(PageActive(page)); retry: ClearPageUnevictable(page); if (page_evictable(page, NULL)) { enum lru_list l = page_lru_base_type(page); __dec_zone_state(zone, NR_UNEVICTABLE); - list_move(&page->lru, &zone->lruvec.lists[l]); - mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); + lruvec = mem_cgroup_lru_move_lists(zone, page, + LRU_UNEVICTABLE, l); + list_move(&page->lru, &lruvec->lists[l]); __inc_zone_state(zone, NR_INACTIVE_ANON + l); __count_vm_event(UNEVICTABLE_PGRESCUED); } else { @@ -3457,8 +3450,9 @@ retry: * rotate unevictable list */ SetPageUnevictable(page); - list_move(&page->lru, &zone->lruvec.lists[LRU_UNEVICTABLE]); - mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); + lruvec = mem_cgroup_lru_move_lists(zone, page, LRU_UNEVICTABLE, + LRU_UNEVICTABLE); + list_move(&page->lru, &lruvec->lists[LRU_UNEVICTABLE]); if (page_evictable(page, NULL)) goto retry; } From 6b208e3f6e35aa76d254c395bdcd984b17c6b626 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:18 -0800 Subject: [PATCH 27/96] mm: memcg: remove unused node/section info from pc->flags To find the page corresponding to a certain page_cgroup, the pc->flags encoded the node or section ID with the base array to compare the pc pointer to. Now that the per-memory cgroup LRU lists link page descriptors directly, there is no longer any code that knows the struct page_cgroup of a PFN but not the struct page. [hughd@google.com: remove unused node/section info from pc->flags fix] Signed-off-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Reviewed-by: Kirill A. Shutemov Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: "Kirill A. 
Shutemov" Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Ying Han Cc: Greg Thelen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 33 --------------------- mm/page_cgroup.c | 59 +++++-------------------------------- 2 files changed, 7 insertions(+), 85 deletions(-) diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 5bae7535c202..aaa60da8783c 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -121,39 +121,6 @@ static inline void move_unlock_page_cgroup(struct page_cgroup *pc, local_irq_restore(*flags); } -#ifdef CONFIG_SPARSEMEM -#define PCG_ARRAYID_WIDTH SECTIONS_SHIFT -#else -#define PCG_ARRAYID_WIDTH NODES_SHIFT -#endif - -#if (PCG_ARRAYID_WIDTH > BITS_PER_LONG - NR_PCG_FLAGS) -#error Not enough space left in pc->flags to store page_cgroup array IDs -#endif - -/* pc->flags: ARRAY-ID | FLAGS */ - -#define PCG_ARRAYID_MASK ((1UL << PCG_ARRAYID_WIDTH) - 1) - -#define PCG_ARRAYID_OFFSET (BITS_PER_LONG - PCG_ARRAYID_WIDTH) -/* - * Zero the shift count for non-existent fields, to prevent compiler - * warnings and ensure references are optimized away. - */ -#define PCG_ARRAYID_SHIFT (PCG_ARRAYID_OFFSET * (PCG_ARRAYID_WIDTH != 0)) - -static inline void set_page_cgroup_array_id(struct page_cgroup *pc, - unsigned long id) -{ - pc->flags &= ~(PCG_ARRAYID_MASK << PCG_ARRAYID_SHIFT); - pc->flags |= (id & PCG_ARRAYID_MASK) << PCG_ARRAYID_SHIFT; -} - -static inline unsigned long page_cgroup_array_id(struct page_cgroup *pc) -{ - return (pc->flags >> PCG_ARRAYID_SHIFT) & PCG_ARRAYID_MASK; -} - #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index f59405a8d752..f0559e049e00 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -11,12 +11,6 @@ #include #include -static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id) -{ - pc->flags = 0; - set_page_cgroup_array_id(pc, id); - pc->mem_cgroup = NULL; -} static unsigned long total_usage; #if !defined(CONFIG_SPARSEMEM) @@ -41,28 +35,13 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) return base + offset; } -struct page *lookup_cgroup_page(struct page_cgroup *pc) -{ - unsigned long pfn; - struct page *page; - pg_data_t *pgdat; - - pgdat = NODE_DATA(page_cgroup_array_id(pc)); - pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn; - page = pfn_to_page(pfn); - VM_BUG_ON(pc != lookup_page_cgroup(page)); - return page; -} - static int __init alloc_node_page_cgroup(int nid) { - struct page_cgroup *base, *pc; + struct page_cgroup *base; unsigned long table_size; - unsigned long start_pfn, nr_pages, index; + unsigned long nr_pages; - start_pfn = NODE_DATA(nid)->node_start_pfn; nr_pages = NODE_DATA(nid)->node_spanned_pages; - if (!nr_pages) return 0; @@ -72,10 +51,6 @@ static int __init alloc_node_page_cgroup(int nid) table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!base) return -ENOMEM; - for (index = 0; index < nr_pages; index++) { - pc = base + index; - init_page_cgroup(pc, nid); - } NODE_DATA(nid)->node_page_cgroup = base; total_usage += table_size; return 0; @@ -116,23 +91,10 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) return section->page_cgroup + pfn; } -struct page *lookup_cgroup_page(struct page_cgroup *pc) -{ - struct mem_section *section; - struct page *page; - unsigned long nr; - - nr = page_cgroup_array_id(pc); - section = 
__nr_to_section(nr); - page = pfn_to_page(pc - section->page_cgroup); - VM_BUG_ON(pc != lookup_page_cgroup(page)); - return page; -} - static void *__meminit alloc_page_cgroup(size_t size, int nid) { + gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; void *addr = NULL; - gfp_t flags = GFP_KERNEL | __GFP_NOWARN; addr = alloc_pages_exact_nid(nid, size, flags); if (addr) { @@ -141,9 +103,9 @@ static void *__meminit alloc_page_cgroup(size_t size, int nid) } if (node_state(nid, N_HIGH_MEMORY)) - addr = vmalloc_node(size, nid); + addr = vzalloc_node(size, nid); else - addr = vmalloc(size); + addr = vzalloc(size); return addr; } @@ -166,14 +128,11 @@ static void free_page_cgroup(void *addr) static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) { - struct page_cgroup *base, *pc; struct mem_section *section; + struct page_cgroup *base; unsigned long table_size; - unsigned long nr; - int index; - nr = pfn_to_section_nr(pfn); - section = __nr_to_section(nr); + section = __pfn_to_section(pfn); if (section->page_cgroup) return 0; @@ -193,10 +152,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) return -ENOMEM; } - for (index = 0; index < PAGES_PER_SECTION; index++) { - pc = base + index; - init_page_cgroup(pc, nr); - } /* * The passed "pfn" may not be aligned to SECTION. For the calculation * we need to apply a mask. From e94c8a9cbce1aee4af9e1285802785481b7f93c5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 12 Jan 2012 17:18:20 -0800 Subject: [PATCH 28/96] memcg: make mem_cgroup_split_huge_fixup() more efficient In split_huge_page(), mem_cgroup_split_huge_fixup() is called to handle page_cgroup modifications. It takes move_lock_page_cgroup(), modifies page_cgroup and the LRU accounting, and is called HPAGE_PMD_SIZE - 1 times. But thinking again, - compound_lock() is held at move_account...then, it's not necessary to take move_lock_page_cgroup(). - LRU is locked and all tail pages will go into the same LRU as head is now on. - page_cgroup is contiguous in huge page range. This patch fixes mem_cgroup_split_huge_fixup() so that it is called once per hugepage, reducing the cost of splitting.
[akpm@linux-foundation.org: fix typo, per Michal] Signed-off-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Andrea Arcangeli Reviewed-by: Michal Hocko Cc: Balbir Singh Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++--- mm/huge_memory.c | 3 ++- mm/memcontrol.c | 34 +++++++++++++++++----------------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e2f8e7caf04b..cee3761666f0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -163,7 +163,7 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg); void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); #ifdef CONFIG_TRANSPARENT_HUGEPAGE -void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail); +void mem_cgroup_split_huge_fixup(struct page *head); #endif #ifdef CONFIG_DEBUG_VM @@ -379,8 +379,7 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) return 0; } -static inline void mem_cgroup_split_huge_fixup(struct page *head, - struct page *tail) +static inline void mem_cgroup_split_huge_fixup(struct page *head) { } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 36b3d988b4ef..db522e160cca 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1207,6 +1207,8 @@ static void __split_huge_page_refcount(struct page *page) /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); compound_lock(page); + /* complete memcg works before add pages to LRU */ + mem_cgroup_split_huge_fixup(page); for (i = 1; i < HPAGE_PMD_NR; i++) { struct page *page_tail = page + i; @@ -1278,7 +1280,6 @@ static void __split_huge_page_refcount(struct page *page) BUG_ON(!PageDirty(page_tail)); BUG_ON(!PageSwapBacked(page_tail)); - mem_cgroup_split_huge_fixup(page, page_tail); lru_add_page_tail(zone, page, page_tail); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 972878b648c2..42174612cc0b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2553,39 +2553,39 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) /* * Because tail pages are not marked as "used", set it. We're under - * zone->lru_lock, 'splitting on pmd' and compund_lock. + * zone->lru_lock, 'splitting on pmd' and compound_lock. + * charge/uncharge will be never happen and move_account() is done under + * compound_lock(), so we don't have to take care of races. */ -void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) +void mem_cgroup_split_huge_fixup(struct page *head) { struct page_cgroup *head_pc = lookup_page_cgroup(head); - struct page_cgroup *tail_pc = lookup_page_cgroup(tail); - unsigned long flags; + struct page_cgroup *pc; + int i; if (mem_cgroup_disabled()) return; - /* - * We have no races with charge/uncharge but will have races with - * page state accounting. - */ - move_lock_page_cgroup(head_pc, &flags); + for (i = 1; i < HPAGE_PMD_NR; i++) { + pc = head_pc + i; + pc->mem_cgroup = head_pc->mem_cgroup; + smp_wmb();/* see __commit_charge() */ + /* + * LRU flags cannot be copied because we need to add tail + * page to LRU by generic call and our hooks will be called. 
+ */ + pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + } - tail_pc->mem_cgroup = head_pc->mem_cgroup; - smp_wmb(); /* see __commit_charge() */ if (PageCgroupAcctLRU(head_pc)) { enum lru_list lru; struct mem_cgroup_per_zone *mz; - /* - * LRU flags cannot be copied because we need to add tail - *.page to LRU by generic call and our hook will be called. * We hold lru_lock, then, reduce counter directly. */ lru = page_lru(head); mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); - MEM_CGROUP_ZSTAT(mz, lru) -= 1; + MEM_CGROUP_ZSTAT(mz, lru) -= HPAGE_PMD_NR - 1; } - tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; - move_unlock_page_cgroup(head_pc, &flags); } #endif From f53d7ce32e13dbd09573b176e6521a04c2c77803 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:23 -0800 Subject: [PATCH 29/96] mm: memcg: shorten preempt-disabled section around event checks Only the ratelimit checks themselves have to run with preemption disabled, the resulting actions - checking for usage thresholds, updating the soft limit tree - can and should run with preemption enabled. Signed-off-by: Johannes Weiner Reported-by: Yong Zhang Tested-by: Yong Zhang Reported-by: Luis Henriques Tested-by: Luis Henriques Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Peter Zijlstra Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 79 ++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 41 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 42174612cc0b..abb66a2cba65 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -748,37 +748,32 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, return total; } -static bool __memcg_event_check(struct mem_cgroup *memcg, int target) +static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, + enum mem_cgroup_events_target target) { unsigned long val, next; val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); next = __this_cpu_read(memcg->stat->targets[target]); /* from time_after() in jiffies.h */ - return ((long)next - (long)val < 0); -} - -static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) -{ - unsigned long val, next; - - val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); - - switch (target) { - case MEM_CGROUP_TARGET_THRESH: - next = val + THRESHOLDS_EVENTS_TARGET; - break; - case MEM_CGROUP_TARGET_SOFTLIMIT: - next = val + SOFTLIMIT_EVENTS_TARGET; - break; - case MEM_CGROUP_TARGET_NUMAINFO: - next = val + NUMAINFO_EVENTS_TARGET; - break; - default: - return; + if ((long)next - (long)val < 0) { + switch (target) { + case MEM_CGROUP_TARGET_THRESH: + next = val + THRESHOLDS_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_NUMAINFO: + next = val + NUMAINFO_EVENTS_TARGET; + break; + default: + break; + } + __this_cpu_write(memcg->stat->targets[target], next); + return true; } - - __this_cpu_write(memcg->stat->targets[target], next); + return false; } /* @@ -789,25 +784,27 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { preempt_disable(); /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { - mem_cgroup_threshold(memcg); - __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); - if (unlikely(__memcg_event_check(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT))) { - 
mem_cgroup_update_tree(memcg, page); - __mem_cgroup_target_update(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT); - } + if (unlikely(mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_THRESH))) { + bool do_softlimit, do_numainfo; + + do_softlimit = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_SOFTLIMIT); #if MAX_NUMNODES > 1 - if (unlikely(__memcg_event_check(memcg, - MEM_CGROUP_TARGET_NUMAINFO))) { - atomic_inc(&memcg->numainfo_events); - __mem_cgroup_target_update(memcg, - MEM_CGROUP_TARGET_NUMAINFO); - } + do_numainfo = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_NUMAINFO); #endif - } - preempt_enable(); + preempt_enable(); + + mem_cgroup_threshold(memcg); + if (unlikely(do_softlimit)) + mem_cgroup_update_tree(memcg, page); +#if MAX_NUMNODES > 1 + if (unlikely(do_numainfo)) + atomic_inc(&memcg->numainfo_events); +#endif + } else + preempt_enable(); } struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) From d66c1ce7df6ceb94d1902ac865ad50df4ffdb95f Mon Sep 17 00:00:00 2001 From: Zhu Yanhai Date: Thu, 12 Jan 2012 17:18:24 -0800 Subject: [PATCH 30/96] Documentation/cgroups/memory.txt: fix typo It should be memsw.max_usage_in_bytes. This typo has been there for a really long time. Signed-off-by: Zhu Yanhai Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 4d8774f6f48a..c7de7dac43b5 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -61,7 +61,7 @@ Brief summary of control files. memory.failcnt # show the number of memory usage hits limits memory.memsw.failcnt # show the number of memory+Swap hits limits memory.max_usage_in_bytes # show max memory usage recorded - memory.memsw.usage_in_bytes # show max memory+Swap usage recorded + memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded memory.soft_limit_in_bytes # set/show soft limit of memory usage memory.stat # show various statistics memory.use_hierarchy # set/show hierarchical account enabled From 0527b6903b2838e214c895d527806aefd61b3c8f Mon Sep 17 00:00:00 2001 From: Ying Han Date: Thu, 12 Jan 2012 17:18:27 -0800 Subject: [PATCH 31/96] memcg: fix pgpgin/pgpgout documentation The two memcg stats pgpgin/pgpgout have different meaning than the ones in vmstat, which indicates that we picked a bad naming for them. It might be late to change the stat name, but better documentation is always helpful. Signed-off-by: Ying Han Acked-by: KAMEZAWA Hiroyuki Acked-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index c7de7dac43b5..4c95c0034a4b 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -410,8 +410,11 @@ memory.stat file includes following statistics cache - # of bytes of page cache memory. rss - # of bytes of anonymous and swap cache memory. mapped_file - # of bytes of mapped file (includes tmpfs/shmem) -pgpgin - # of pages paged in (equivalent to # of charging events). -pgpgout - # of pages paged out (equivalent to # of uncharging events). +pgpgin - # of charging events to the memory cgroup. 
The charging + event happens each time a page is accounted as either mapped + anon page(RSS) or cache page(Page Cache) to the cgroup. +pgpgout - # of uncharging events to the memory cgroup. The uncharging + event happens each time a page is unaccounted from the cgroup. swap - # of bytes of swap usage inactive_anon - # of bytes of anonymous memory and swap cache memory on LRU list. From ec0fffd84b162e0563a28a81aa049f946b31a8e2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:29 -0800 Subject: [PATCH 32/96] mm: oom_kill: remove memcg argument from oom_kill_task() The memcg argument of oom_kill_task() hasn't been used since 341aea2 'oom-kill: remove boost_dying_task_prio()'. Kill it. Signed-off-by: Johannes Weiner Acked-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Balbir Singh Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7c122faa05c5..05c0f27d4ed1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -434,7 +434,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, } #define K(x) ((x) << (PAGE_SHIFT-10)) -static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) +static int oom_kill_task(struct task_struct *p) { struct task_struct *q; struct mm_struct *mm; @@ -533,7 +533,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, } } while_each_thread(p, t); - return oom_kill_task(victim, mem); + return oom_kill_task(victim); } /* From 72835c86ca15d0126354b73d5f29ce9194931c9b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:32 -0800 Subject: [PATCH 33/96] mm: unify remaining mem_cont, mem, etc. 
variable names to memcg Signed-off-by: Johannes Weiner Acked-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Balbir Singh Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 16 ++++++------ include/linux/oom.h | 2 +- include/linux/rmap.h | 4 +-- mm/memcontrol.c | 52 ++++++++++++++++++++------------------ mm/oom_kill.c | 38 ++++++++++++++-------------- mm/rmap.c | 20 +++++++-------- mm/swapfile.c | 9 ++++--- mm/vmscan.c | 12 ++++----- 8 files changed, 78 insertions(+), 75 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index cee3761666f0..b80de520670b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -54,10 +54,10 @@ extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); /* for swap handling */ extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, gfp_t mask, struct mem_cgroup **ptr); + struct page *page, gfp_t mask, struct mem_cgroup **memcgp); extern void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *ptr); -extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr); + struct mem_cgroup *memcg); +extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg); extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); @@ -101,7 +101,7 @@ extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); extern int mem_cgroup_prepare_migration(struct page *page, - struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask); + struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask); extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok); @@ -186,17 +186,17 @@ static inline int mem_cgroup_cache_charge(struct page *page, } static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr) + struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp) { return 0; } static inline void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *ptr) + struct mem_cgroup *memcg) { } -static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr) +static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) { } @@ -275,7 +275,7 @@ static inline struct cgroup_subsys_state static inline int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, - struct mem_cgroup **ptr, gfp_t gfp_mask) + struct mem_cgroup **memcgp, gfp_t gfp_mask) { return 0; } diff --git a/include/linux/oom.h b/include/linux/oom.h index 6f9d04a85336..552fba9c7d5a 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -43,7 +43,7 @@ enum oom_constraint { extern void compare_swap_oom_score_adj(int old_val, int new_val); extern int test_set_oom_score_adj(int new_val); -extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, +extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1afb9954bbf1..1cdd62a2788a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -158,7 +158,7 @@ static inline void 
page_dup_rmap(struct page *page) * Called from mm/vmscan.c to handle paging out */ int page_referenced(struct page *, int is_locked, - struct mem_cgroup *cnt, unsigned long *vm_flags); + struct mem_cgroup *memcg, unsigned long *vm_flags); int page_referenced_one(struct page *, struct vm_area_struct *, unsigned long address, unsigned int *mapcount, unsigned long *vm_flags); @@ -236,7 +236,7 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *, #define anon_vma_link(vma) do {} while (0) static inline int page_referenced(struct page *page, int is_locked, - struct mem_cgroup *cnt, + struct mem_cgroup *memcg, unsigned long *vm_flags) { *vm_flags = 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index abb66a2cba65..aeb23933a052 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2844,12 +2844,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, */ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, - gfp_t mask, struct mem_cgroup **ptr) + gfp_t mask, struct mem_cgroup **memcgp) { struct mem_cgroup *memcg; int ret; - *ptr = NULL; + *memcgp = NULL; if (mem_cgroup_disabled()) return 0; @@ -2867,27 +2867,27 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, memcg = try_get_mem_cgroup_from_page(page); if (!memcg) goto charge_cur_mm; - *ptr = memcg; - ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); + *memcgp = memcg; + ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); css_put(&memcg->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); + return __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); } static void -__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, +__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, enum charge_type ctype) { if (mem_cgroup_disabled()) return; - if (!ptr) + if (!memcg) return; - cgroup_exclude_rmdir(&ptr->css); + cgroup_exclude_rmdir(&memcg->css); - __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); + __mem_cgroup_commit_charge_lrucare(page, memcg, ctype); /* * Now swap is on-memory. This means this page may be * counted both as mem and swap....double count. @@ -2897,21 +2897,22 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, */ if (do_swap_account && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; + struct mem_cgroup *swap_memcg; unsigned short id; - struct mem_cgroup *memcg; id = swap_cgroup_record(ent, 0); rcu_read_lock(); - memcg = mem_cgroup_lookup(id); - if (memcg) { + swap_memcg = mem_cgroup_lookup(id); + if (swap_memcg) { /* * This recorded memcg can be obsolete one. So, avoid * calling css_tryget */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); - mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_put(memcg); + if (!mem_cgroup_is_root(swap_memcg)) + res_counter_uncharge(&swap_memcg->memsw, + PAGE_SIZE); + mem_cgroup_swap_statistics(swap_memcg, false); + mem_cgroup_put(swap_memcg); } rcu_read_unlock(); } @@ -2920,13 +2921,14 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, * So, rmdir()->pre_destroy() can be called while we do this charge. * In that case, we need to call pre_destroy() again. check it here. 
*/ - cgroup_release_and_wakeup_rmdir(&ptr->css); + cgroup_release_and_wakeup_rmdir(&memcg->css); } -void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) +void mem_cgroup_commit_charge_swapin(struct page *page, + struct mem_cgroup *memcg) { - __mem_cgroup_commit_charge_swapin(page, ptr, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + __mem_cgroup_commit_charge_swapin(page, memcg, + MEM_CGROUP_CHARGE_TYPE_MAPPED); } void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) @@ -3255,14 +3257,14 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, * page belongs to. */ int mem_cgroup_prepare_migration(struct page *page, - struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) + struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) { struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; enum charge_type ctype; int ret = 0; - *ptr = NULL; + *memcgp = NULL; VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) @@ -3313,10 +3315,10 @@ int mem_cgroup_prepare_migration(struct page *page, if (!memcg) return 0; - *ptr = memcg; - ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); + *memcgp = memcg; + ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); css_put(&memcg->css);/* drop extra refcnt */ - if (ret || *ptr == NULL) { + if (ret || *memcgp == NULL) { if (PageAnon(page)) { lock_page_cgroup(pc); ClearPageCgroupMigration(pc); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 05c0f27d4ed1..2958fd8e7c9a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -152,7 +152,7 @@ struct task_struct *find_lock_task_mm(struct task_struct *p) /* return true if the task is not adequate as candidate victim task. */ static bool oom_unkillable_task(struct task_struct *p, - const struct mem_cgroup *mem, const nodemask_t *nodemask) + const struct mem_cgroup *memcg, const nodemask_t *nodemask) { if (is_global_init(p)) return true; @@ -160,7 +160,7 @@ static bool oom_unkillable_task(struct task_struct *p, return true; /* When mem_cgroup_out_of_memory() and p is not member of the group */ - if (mem && !task_in_mem_cgroup(p, mem)) + if (memcg && !task_in_mem_cgroup(p, memcg)) return true; /* p may not have freeable memory in nodemask */ @@ -179,12 +179,12 @@ static bool oom_unkillable_task(struct task_struct *p, * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. 
*/ -unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, +unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages) { long points; - if (oom_unkillable_task(p, mem, nodemask)) + if (oom_unkillable_task(p, memcg, nodemask)) return 0; p = find_lock_task_mm(p); @@ -308,7 +308,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, * (not docbooked, we don't want this one cluttering up the manual) */ static struct task_struct *select_bad_process(unsigned int *ppoints, - unsigned long totalpages, struct mem_cgroup *mem, + unsigned long totalpages, struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *g, *p; @@ -320,7 +320,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, if (p->exit_state) continue; - if (oom_unkillable_task(p, mem, nodemask)) + if (oom_unkillable_task(p, memcg, nodemask)) continue; /* @@ -364,7 +364,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, } } - points = oom_badness(p, mem, nodemask, totalpages); + points = oom_badness(p, memcg, nodemask, totalpages); if (points > *ppoints) { chosen = p; *ppoints = points; @@ -387,14 +387,14 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * * Call with tasklist_lock read-locked. */ -static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) +static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); for_each_process(p) { - if (oom_unkillable_task(p, mem, nodemask)) + if (oom_unkillable_task(p, memcg, nodemask)) continue; task = find_lock_task_mm(p); @@ -417,7 +417,7 @@ static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) } static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, - struct mem_cgroup *mem, const nodemask_t *nodemask) + struct mem_cgroup *memcg, const nodemask_t *nodemask) { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " @@ -427,10 +427,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); - mem_cgroup_print_oom_info(mem, p); + mem_cgroup_print_oom_info(memcg, p); show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) - dump_tasks(mem, nodemask); + dump_tasks(memcg, nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -484,7 +484,7 @@ static int oom_kill_task(struct task_struct *p) static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int points, unsigned long totalpages, - struct mem_cgroup *mem, nodemask_t *nodemask, + struct mem_cgroup *memcg, nodemask_t *nodemask, const char *message) { struct task_struct *victim = p; @@ -493,7 +493,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int victim_points = 0; if (printk_ratelimit()) - dump_header(p, gfp_mask, order, mem, nodemask); + dump_header(p, gfp_mask, order, memcg, nodemask); /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -524,7 +524,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * oom_badness() returns 0 if the thread is unkillable */ - child_points = oom_badness(child, mem, nodemask, + child_points = oom_badness(child, memcg, nodemask, totalpages); if (child_points > 
victim_points) { victim = child; @@ -561,7 +561,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, } #ifdef CONFIG_CGROUP_MEM_RES_CTLR -void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) +void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) { unsigned long limit; unsigned int points = 0; @@ -578,14 +578,14 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) } check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); - limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; + limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; read_lock(&tasklist_lock); retry: - p = select_bad_process(&points, limit, mem, NULL); + p = select_bad_process(&points, limit, memcg, NULL); if (!p || PTR_ERR(p) == -1UL) goto out; - if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL, + if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL, "Memory cgroup out of memory")) goto retry; out: diff --git a/mm/rmap.c b/mm/rmap.c index a2e5ce1fa081..c8454e06b6c8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -773,7 +773,7 @@ out: } static int page_referenced_anon(struct page *page, - struct mem_cgroup *mem_cont, + struct mem_cgroup *memcg, unsigned long *vm_flags) { unsigned int mapcount; @@ -796,7 +796,7 @@ static int page_referenced_anon(struct page *page, * counting on behalf of references from different * cgroups */ - if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) continue; referenced += page_referenced_one(page, vma, address, &mapcount, vm_flags); @@ -811,7 +811,7 @@ static int page_referenced_anon(struct page *page, /** * page_referenced_file - referenced check for object-based rmap * @page: the page we're checking references on. - * @mem_cont: target memory controller + * @memcg: target memory control group * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * For an object-based mapped page, find all the places it is mapped and @@ -822,7 +822,7 @@ static int page_referenced_anon(struct page *page, * This function is only called from page_referenced for object-based pages. 
*/ static int page_referenced_file(struct page *page, - struct mem_cgroup *mem_cont, + struct mem_cgroup *memcg, unsigned long *vm_flags) { unsigned int mapcount; @@ -864,7 +864,7 @@ static int page_referenced_file(struct page *page, * counting on behalf of references from different * cgroups */ - if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) continue; referenced += page_referenced_one(page, vma, address, &mapcount, vm_flags); @@ -880,7 +880,7 @@ static int page_referenced_file(struct page *page, * page_referenced - test if the page was referenced * @page: the page to test * @is_locked: caller holds lock on the page - * @mem_cont: target memory controller + * @memcg: target memory cgroup * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * Quick test_and_clear_referenced for all mappings to a page, @@ -888,7 +888,7 @@ static int page_referenced_file(struct page *page, */ int page_referenced(struct page *page, int is_locked, - struct mem_cgroup *mem_cont, + struct mem_cgroup *memcg, unsigned long *vm_flags) { int referenced = 0; @@ -904,13 +904,13 @@ int page_referenced(struct page *page, } } if (unlikely(PageKsm(page))) - referenced += page_referenced_ksm(page, mem_cont, + referenced += page_referenced_ksm(page, memcg, vm_flags); else if (PageAnon(page)) - referenced += page_referenced_anon(page, mem_cont, + referenced += page_referenced_anon(page, memcg, vm_flags); else if (page->mapping) - referenced += page_referenced_file(page, mem_cont, + referenced += page_referenced_file(page, memcg, vm_flags); if (we_locked) unlock_page(page); diff --git a/mm/swapfile.c b/mm/swapfile.c index 9520592d4231..d999f090dfda 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -847,12 +847,13 @@ unsigned int count_swap_pages(int type, int free) static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { - struct mem_cgroup *ptr; + struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; int ret = 1; - if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { + if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, + GFP_KERNEL, &memcg)) { ret = -ENOMEM; goto out_nolock; } @@ -860,7 +861,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { if (ret > 0) - mem_cgroup_cancel_charge_swapin(ptr); + mem_cgroup_cancel_charge_swapin(memcg); ret = 0; goto out; } @@ -871,7 +872,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); page_add_anon_rmap(page, vma, addr); - mem_cgroup_commit_charge_swapin(page, ptr); + mem_cgroup_commit_charge_swapin(page, memcg); swap_free(entry); /* * Move the page to the active list so it is not diff --git a/mm/vmscan.c b/mm/vmscan.c index 813aae820a27..e16ca8384ef7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2376,7 +2376,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR -unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, struct zone *zone, unsigned long *nr_scanned) @@ -2388,10 +2388,10 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_unmap = 1, .may_swap = !noswap, .order = 0, - .target_mem_cgroup = mem, + 
.target_mem_cgroup = memcg, }; struct mem_cgroup_zone mz = { - .mem_cgroup = mem, + .mem_cgroup = memcg, .zone = zone, }; @@ -2417,7 +2417,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, return sc.nr_reclaimed; } -unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap) { @@ -2430,7 +2430,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, .order = 0, - .target_mem_cgroup = mem_cont, + .target_mem_cgroup = memcg, .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), @@ -2444,7 +2444,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, * take care of from where we get pages. So the node where we start the * scan does not need to be the current node. */ - nid = mem_cgroup_select_victim_node(mem_cont); + nid = mem_cgroup_select_victim_node(memcg); zonelist = NODE_DATA(nid)->node_zonelists; From 0e574a932d2cab8eb3b02d21feb59f2c09154738 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:35 -0800 Subject: [PATCH 34/96] mm: memcg: clean up fault accounting The fault accounting functions have a single, memcg-internal user, so they don't need to be global. In fact, their one-line bodies can be directly folded into the caller. And since faults happen one at a time, use this_cpu_inc() directly instead of this_cpu_add(foo, 1). Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Acked-by: Balbir Singh Cc: David Rientjes Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index aeb23933a052..93cb16d2b96a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -655,16 +655,6 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } -void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val) -{ - this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); -} - -void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val) -{ - this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); -} - static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { @@ -978,11 +968,11 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) goto out; switch (idx) { - case PGMAJFAULT: - mem_cgroup_pgmajfault(memcg, 1); - break; case PGFAULT: - mem_cgroup_pgfault(memcg, 1); + this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); + break; + case PGMAJFAULT: + this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); break; default: BUG(); From cfa449461e67b60df986170eecb089831fa9e49a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:38 -0800 Subject: [PATCH 35/96] mm: memcg: lookup_page_cgroup (almost) never returns NULL Pages have their corresponding page_cgroup descriptors set up before they are used in userspace, and thus managed by a memory cgroup. The only time where lookup_page_cgroup() can return NULL is in the CONFIG_DEBUG_VM-only page sanity checking code that executes while feeding pages into the page allocator for the first time. 
Remove the NULL checks against lookup_page_cgroup() results from all callsites where we know that corresponding page_cgroup descriptors must be allocated, and add a comment to the callsite that actually does have to check the return value. [hughd@google.com: stop oops in mem_cgroup_update_page_stat()] Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Balbir Singh Cc: David Rientjes Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 93cb16d2b96a..a63ad141083c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1960,7 +1960,7 @@ void mem_cgroup_update_page_stat(struct page *page, bool need_unlock = false; unsigned long uninitialized_var(flags); - if (unlikely(!pc)) + if (mem_cgroup_disabled()) return; rcu_read_lock(); @@ -2735,8 +2735,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, } pc = lookup_page_cgroup(page); - BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ - ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); if (ret || !memcg) return ret; @@ -3008,7 +3006,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) * Check if our page_cgroup is valid */ pc = lookup_page_cgroup(page); - if (unlikely(!pc || !PageCgroupUsed(pc))) + if (unlikely(!PageCgroupUsed(pc))) return NULL; lock_page_cgroup(pc); @@ -3436,6 +3434,11 @@ static struct page_cgroup *lookup_page_cgroup_used(struct page *page) struct page_cgroup *pc; pc = lookup_page_cgroup(page); + /* + * Can be NULL while feeding pages into the page allocator for + * the first time, i.e. during boot or memory hotplug; + * or when mem_cgroup_disabled(). + */ if (likely(pc) && PageCgroupUsed(pc)) return pc; return NULL; From 00c54c0bac24bb02d2460c516da76651a7451286 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:40 -0800 Subject: [PATCH 36/96] mm: page_cgroup: check page_cgroup arrays in lookup_page_cgroup() only when necessary lookup_page_cgroup() is usually used only against pages that are used in userspace. The exception is the CONFIG_DEBUG_VM-only memcg check from the page allocator: it can run on pages without page_cgroup descriptors allocated when the pages are fed into the page allocator for the first time during boot or memory hotplug. Include the array check only when CONFIG_DEBUG_VM is set and save the unnecessary check in production kernels. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Balbir Singh Cc: David Rientjes Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index f0559e049e00..e910524e5a08 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -28,9 +28,16 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) struct page_cgroup *base; base = NODE_DATA(page_to_nid(page))->node_page_cgroup; +#ifdef CONFIG_DEBUG_VM + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_cgroup arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. 
+ */ if (unlikely(!base)) return NULL; - +#endif offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; return base + offset; } @@ -85,9 +92,16 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); - +#ifdef CONFIG_DEBUG_VM + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_cgroup arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. + */ if (!section->page_cgroup) return NULL; +#endif return section->page_cgroup + pfn; } From 7a0524cfc8f9f585471a31b1282a9ce4a1a7d444 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:43 -0800 Subject: [PATCH 37/96] mm: memcg: remove unneeded checks from newpage_charge() All callsites pass in freshly allocated pages and a valid mm. As a result, all checks pertaining to the page's mapcount, page->mapping or the fallback to init_mm are unneeded. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: David Rientjes Cc: Balbir Singh Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a63ad141083c..2450d89486d4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2748,19 +2748,11 @@ int mem_cgroup_newpage_charge(struct page *page, { if (mem_cgroup_disabled()) return 0; - /* - * If already mapped, we don't have to account. - * If page cache, page->mapping has address_space. - * But page->mapping may have out-of-use anon_vma pointer, - * detecit it by PageAnon() check. newly-mapped-anon's page->mapping - * is NULL. - */ - if (page_mapped(page) || (page->mapping && !PageAnon(page))) - return 0; - if (unlikely(!mm)) - mm = &init_mm; + VM_BUG_ON(page_mapped(page)); + VM_BUG_ON(page->mapping && !PageAnon(page)); + VM_BUG_ON(!mm); return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + MEM_CGROUP_CHARGE_TYPE_MAPPED); } static void From 40f23a21a8501c1b2c65c50c19b516488ac31313 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:45 -0800 Subject: [PATCH 38/96] mm: memcg: remove unneeded checks from uncharge_page() mem_cgroup_uncharge_page() is only called on either freshly allocated pages without page->mapping or on rmapped PageAnon() pages. There is no need to check for a page->mapping that is not an anon_vma. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Balbir Singh Cc: David Rientjes Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2450d89486d4..71a9774e6ead 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3061,8 +3061,7 @@ void mem_cgroup_uncharge_page(struct page *page) /* early check. 
*/ if (page_mapped(page)) return; - if (page->mapping && !PageAnon(page)) - return; + VM_BUG_ON(page->mapping && !PageAnon(page)); __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); } From 9fb4b7cc0724f178d4b24a2a566ea1e7cb120b82 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Thu, 12 Jan 2012 17:18:48 -0800 Subject: [PATCH 39/96] page_cgroup: add helper function to get swap_cgroup There are multiple places which need to get the swap_cgroup address, so add a helper function: static struct swap_cgroup *swap_cgroup_getsc(swp_entry_t ent, struct swap_cgroup_ctrl **ctrl); to simplify the code. Signed-off-by: Bob Liu Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 4 +-- mm/memcontrol.c | 4 +-- mm/page_cgroup.c | 56 +++++++++++++------------------------ 3 files changed, 24 insertions(+), 40 deletions(-) diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index aaa60da8783c..1153095ee457 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -149,7 +149,7 @@ static inline void __init page_cgroup_init_flatmem(void) extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new); extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); -extern unsigned short lookup_swap_cgroup(swp_entry_t ent); +extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); extern void swap_cgroup_swapoff(int type); #else @@ -161,7 +161,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) } static inline -unsigned short lookup_swap_cgroup(swp_entry_t ent) +unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 71a9774e6ead..4c53e971749e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2474,7 +2474,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) memcg = NULL; } else if (PageSwapCache(page)) { ent.val = page_private(page); - id = lookup_swap_cgroup(ent); + id = lookup_swap_cgroup_id(ent); rcu_read_lock(); memcg = mem_cgroup_lookup(id); if (memcg && !css_tryget(&memcg->css)) @@ -5264,7 +5264,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, } /* There is a swap entry and a page doesn't exist or isn't charged */ if (ent.val && !ret && - css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { + css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { ret = MC_TARGET_SWAP; if (target) target->ent = ent; diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index e910524e5a08..b99d19edf89b 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -334,7 +334,6 @@ struct swap_cgroup { unsigned short id; }; #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) -#define SC_POS_MASK (SC_PER_PAGE - 1) /* * SwapCgroup implements "lookup" and "exchange" operations. @@ -376,6 +375,21 @@ not_enough_page: return -ENOMEM; } +static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, + struct swap_cgroup_ctrl **ctrlp) +{ + pgoff_t offset = swp_offset(ent); + struct swap_cgroup_ctrl *ctrl; + struct page *mappage; + + ctrl = &swap_cgroup_ctrl[swp_type(ent)]; + if (ctrlp) + *ctrlp = ctrl; + + mappage = ctrl->map[offset / SC_PER_PAGE]; + return page_address(mappage) + offset % SC_PER_PAGE; +} + /** * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. 
* @end: swap entry to be cmpxchged @@ -388,21 +402,13 @@ not_enough_page: unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new) { - int type = swp_type(ent); - unsigned long offset = swp_offset(ent); - unsigned long idx = offset / SC_PER_PAGE; - unsigned long pos = offset & SC_POS_MASK; struct swap_cgroup_ctrl *ctrl; - struct page *mappage; struct swap_cgroup *sc; unsigned long flags; unsigned short retval; - ctrl = &swap_cgroup_ctrl[type]; + sc = lookup_swap_cgroup(ent, &ctrl); - mappage = ctrl->map[idx]; - sc = page_address(mappage); - sc += pos; spin_lock_irqsave(&ctrl->lock, flags); retval = sc->id; if (retval == old) @@ -423,21 +429,13 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, */ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) { - int type = swp_type(ent); - unsigned long offset = swp_offset(ent); - unsigned long idx = offset / SC_PER_PAGE; - unsigned long pos = offset & SC_POS_MASK; struct swap_cgroup_ctrl *ctrl; - struct page *mappage; struct swap_cgroup *sc; unsigned short old; unsigned long flags; - ctrl = &swap_cgroup_ctrl[type]; + sc = lookup_swap_cgroup(ent, &ctrl); - mappage = ctrl->map[idx]; - sc = page_address(mappage); - sc += pos; spin_lock_irqsave(&ctrl->lock, flags); old = sc->id; sc->id = id; @@ -447,28 +445,14 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) } /** - * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry + * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry * @ent: swap entry to be looked up. * * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) */ -unsigned short lookup_swap_cgroup(swp_entry_t ent) +unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { - int type = swp_type(ent); - unsigned long offset = swp_offset(ent); - unsigned long idx = offset / SC_PER_PAGE; - unsigned long pos = offset & SC_POS_MASK; - struct swap_cgroup_ctrl *ctrl; - struct page *mappage; - struct swap_cgroup *sc; - unsigned short ret; - - ctrl = &swap_cgroup_ctrl[type]; - mappage = ctrl->map[idx]; - sc = page_address(mappage); - sc += pos; - ret = sc->id; - return ret; + return lookup_swap_cgroup(ent, NULL)->id; } int swap_cgroup_swapon(int type, unsigned long max_pages) From c3cecc683446ad54ca587d7123bd3ce94bd7b8e1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Jan 2012 17:18:50 -0800 Subject: [PATCH 40/96] memcg: free entries in soft_limit_tree if allocation fails If we are not able to allocate tree nodes for all NUMA nodes then we should release those that were allocated. 
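For reference, the fix follows the usual allocate-then-unwind pattern: when one per-node allocation fails, every node that was already populated is freed and its slot cleared before an error is returned. A minimal, self-contained sketch of that pattern (plain userspace C; MAX_NODES and slot[] are illustrative names, not kernel identifiers):

	#include <stdlib.h>

	#define MAX_NODES 4
	static void *slot[MAX_NODES];

	static int init_all_nodes(void)
	{
		int node;

		for (node = 0; node < MAX_NODES; node++) {
			slot[node] = calloc(1, 64);	/* stands in for kzalloc_node() */
			if (!slot[node])
				goto err_cleanup;
		}
		return 0;

	err_cleanup:
		/* undo the allocations that did succeed; free(NULL) is a no-op */
		for (node = 0; node < MAX_NODES; node++) {
			free(slot[node]);
			slot[node] = NULL;
		}
		return 1;
	}

The actual patch below stops its cleanup loop at the first empty slot, since the allocations are performed in node order.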
Signed-off-by: Michal Hocko Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Hugh Dickins Cc: Balbir Singh Cc: David Rientjes Cc: Andrea Arcangeli Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4c53e971749e..2a1f7847b6ad 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4960,7 +4960,7 @@ static int mem_cgroup_soft_limit_tree_init(void) tmp = -1; rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); if (!rtpn) - return 1; + goto err_cleanup; soft_limit_tree.rb_tree_per_node[node] = rtpn; @@ -4971,6 +4971,16 @@ static int mem_cgroup_soft_limit_tree_init(void) } } return 0; + +err_cleanup: + for_each_node_state(node, N_POSSIBLE) { + if (!soft_limit_tree.rb_tree_per_node[node]) + break; + kfree(soft_limit_tree.rb_tree_per_node[node]); + soft_limit_tree.rb_tree_per_node[node] = NULL; + } + return 1; + } static struct cgroup_subsys_state * __ref From de077d222d5ca6108cab119a09593344c12100ab Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 12 Jan 2012 17:18:52 -0800 Subject: [PATCH 41/96] oom, memcg: fix exclusion of memcg threads after they have detached their mm The oom killer relies on logic that identifies threads that have already been oom killed when scanning the tasklist and, if found, deferring until such threads have exited. This is done by checking for any candidate threads that have the TIF_MEMDIE bit set. For memcg ooms, candidate threads are first found by calling task_in_mem_cgroup() since the oom killer should not defer if there's an oom killed thread in another memcg. Unfortunately, task_in_mem_cgroup() excludes threads if they have detached their mm in the process of exiting so TIF_MEMDIE is never detected for such conditions. This is different for global, mempolicy, and cpuset oom conditions where a detached mm is only excluded after checking for TIF_MEMDIE and deferring, if necessary, in select_bad_process(). The fix is to return true if a task has a detached mm but is still in the memcg or its hierarchy that is currently oom. This will allow the oom killer to appropriately defer rather than kill unnecessarily or, in the worst case, panic the machine if nothing else is available to kill. Signed-off-by: David Rientjes Acked-by: KOSAKI Motohiro Cc: Johannes Weiner Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2a1f7847b6ad..7bc30707d0c6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1238,10 +1238,21 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) struct task_struct *p; p = find_lock_task_mm(task); - if (!p) - return 0; - curr = try_get_mem_cgroup_from_mm(p->mm); - task_unlock(p); + if (p) { + curr = try_get_mem_cgroup_from_mm(p->mm); + task_unlock(p); + } else { + /* + * All threads may have already detached their mm's, but the oom + * killer still needs to detect if they have already been oom + * killed to prevent needlessly killing additional tasks. 
+ */ + task_lock(task); + curr = mem_cgroup_from_task(task); + if (curr) + css_get(&curr->css); + task_unlock(task); + } if (!curr) return 0; /* From dc67d50465f249bb357bf85b3ed1f642eb00130a Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 12 Jan 2012 17:18:55 -0800 Subject: [PATCH 42/96] memcg: simplify page cache charging This patch is a clean up. No functional/logical changes. Because of commit ef6a3c6311 ("mm: add replace_page_cache_page() function") , FUSE uses replace_page_cache() instead of add_to_page_cache(). Then, mem_cgroup_cache_charge() is not called against FUSE's pages from splice. So now, mem_cgroup_cache_charge() gets pages that are not on the LRU with the exception of PageSwapCache pages. For checking, WARN_ON_ONCE(PageLRU(page)) is added. Signed-off-by: KAMEZAWA Hiroyuki Cc: Miklos Szeredi Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Ying Han Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7bc30707d0c6..71bac4d720d7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2790,6 +2790,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { struct mem_cgroup *memcg = NULL; + enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; int ret; if (mem_cgroup_disabled()) @@ -2799,31 +2800,17 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (unlikely(!mm)) mm = &init_mm; + if (!page_is_file_cache(page)) + type = MEM_CGROUP_CHARGE_TYPE_SHMEM; - if (page_is_file_cache(page)) { - ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); - if (ret || !memcg) - return ret; - - /* - * FUSE reuses pages without going through the final - * put that would remove them from the LRU list, make - * sure that they get relinked properly. - */ - __mem_cgroup_commit_charge_lrucare(page, memcg, - MEM_CGROUP_CHARGE_TYPE_CACHE); - return ret; - } - /* shmem */ - if (PageSwapCache(page)) { + if (!PageSwapCache(page)) { + ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); + WARN_ON_ONCE(PageLRU(page)); + } else { /* page is swapcache/shmem */ ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); if (!ret) - __mem_cgroup_commit_charge_swapin(page, memcg, - MEM_CGROUP_CHARGE_TYPE_SHMEM); - } else - ret = mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_SHMEM); - + __mem_cgroup_commit_charge_swapin(page, memcg, type); + } return ret; } From 36b62ad539498d00c2d280a151abad5f7630fa73 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 12 Jan 2012 17:18:57 -0800 Subject: [PATCH 43/96] memcg: simplify corner case handling of LRU. This patch simplifies LRU handling of racy case (memcg+SwapCache). At charging, SwapCache tend to be on LRU already. So, before overwriting pc->mem_cgroup, the page must be removed from LRU and added to LRU later. This patch does spin_lock(zone->lru_lock); if (PageLRU(page)) remove from LRU overwrite pc->mem_cgroup if (PageLRU(page)) add to new LRU. spin_unlock(zone->lru_lock); And guarantee all pages are not on LRU at modifying pc->mem_cgroup. This patch also unfies lru handling of replace_page_cache() and swapin. 
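The sequence described above, written out against the zone->lru_lock API used throughout this series (a sketch of the protocol only, not the hunk that is applied below; the function name is made up):

	/*
	 * Protocol for charging a page that may already sit on an LRU list
	 * (SwapCache, FUSE splice buffers): unlink it, rewrite its owner,
	 * then relink it, all under zone->lru_lock.
	 */
	static void commit_charge_lrucare_sketch(struct zone *zone,
						 struct page *page,
						 struct page_cgroup *pc,
						 struct mem_cgroup *memcg)
	{
		unsigned long flags;
		bool removed = false;

		spin_lock_irqsave(&zone->lru_lock, flags);
		if (PageLRU(page)) {
			del_page_from_lru_list(zone, page, page_lru(page));
			ClearPageLRU(page);
			removed = true;
		}
		pc->mem_cgroup = memcg;	/* page is off every LRU at this point */
		if (removed) {
			add_page_to_lru_list(zone, page, page_lru(page));
			SetPageLRU(page);
		}
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}

The real __mem_cgroup_commit_charge_lrucare() in the diff below additionally sets the Used bit and updates the charge statistics via __mem_cgroup_commit_charge(); only the lock/unlink/relink ordering is shown here.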
Signed-off-by: KAMEZAWA Hiroyuki Cc: Miklos Szeredi Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Ying Han Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 109 +++++++----------------------------------------- 1 file changed, 16 insertions(+), 93 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 71bac4d720d7..d58bb5fa4403 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1136,86 +1136,6 @@ struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, return mem_cgroup_lru_add_list(zone, page, to); } -/* - * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed - * while it's linked to lru because the page may be reused after it's fully - * uncharged. To handle that, unlink page_cgroup from LRU when charge it again. - * It's done under lock_page and expected that zone->lru_lock isnever held. - */ -static void mem_cgroup_lru_del_before_commit(struct page *page) -{ - enum lru_list lru; - unsigned long flags; - struct zone *zone = page_zone(page); - struct page_cgroup *pc = lookup_page_cgroup(page); - - /* - * Doing this check without taking ->lru_lock seems wrong but this - * is safe. Because if page_cgroup's USED bit is unset, the page - * will not be added to any memcg's LRU. If page_cgroup's USED bit is - * set, the commit after this will fail, anyway. - * This all charge/uncharge is done under some mutual execustion. - * So, we don't need to taking care of changes in USED bit. - */ - if (likely(!PageLRU(page))) - return; - - spin_lock_irqsave(&zone->lru_lock, flags); - lru = page_lru(page); - /* - * The uncharged page could still be registered to the LRU of - * the stale pc->mem_cgroup. - * - * As pc->mem_cgroup is about to get overwritten, the old LRU - * accounting needs to be taken care of. Let root_mem_cgroup - * babysit the page until the new memcg is responsible for it. - * - * The PCG_USED bit is guarded by lock_page() as the page is - * swapcache/pagecache. - */ - if (PageLRU(page) && PageCgroupAcctLRU(pc) && !PageCgroupUsed(pc)) { - del_page_from_lru_list(zone, page, lru); - add_page_to_lru_list(zone, page, lru); - } - spin_unlock_irqrestore(&zone->lru_lock, flags); -} - -static void mem_cgroup_lru_add_after_commit(struct page *page) -{ - enum lru_list lru; - unsigned long flags; - struct zone *zone = page_zone(page); - struct page_cgroup *pc = lookup_page_cgroup(page); - /* - * putback: charge: - * SetPageLRU SetPageCgroupUsed - * smp_mb smp_mb - * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU - * - * Ensure that one of the two sides adds the page to the memcg - * LRU during a race. - */ - smp_mb(); - /* taking care of that the page is added to LRU while we commit it */ - if (likely(!PageLRU(page))) - return; - spin_lock_irqsave(&zone->lru_lock, flags); - lru = page_lru(page); - /* - * If the page is not on the LRU, someone will soon put it - * there. If it is, and also already accounted for on the - * memcg-side, it must be on the right lruvec as setting - * pc->mem_cgroup and PageCgroupUsed is properly ordered. - * Otherwise, root_mem_cgroup has been babysitting the page - * during the charge. Move it to the new memcg now. 
- */ - if (PageLRU(page) && !PageCgroupAcctLRU(pc)) { - del_page_from_lru_list(zone, page, lru); - add_page_to_lru_list(zone, page, lru); - } - spin_unlock_irqrestore(&zone->lru_lock, flags); -} - /* * Checks whether given mem is same or in the root_mem_cgroup's * hierarchy subtree @@ -2775,14 +2695,27 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg, enum charge_type ctype) { struct page_cgroup *pc = lookup_page_cgroup(page); + struct zone *zone = page_zone(page); + unsigned long flags; + bool removed = false; + /* * In some case, SwapCache, FUSE(splice_buf->radixtree), the page * is already on LRU. It means the page may on some other page_cgroup's * LRU. Take care of it. */ - mem_cgroup_lru_del_before_commit(page); + spin_lock_irqsave(&zone->lru_lock, flags); + if (PageLRU(page)) { + del_page_from_lru_list(zone, page, page_lru(page)); + ClearPageLRU(page); + removed = true; + } __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); - mem_cgroup_lru_add_after_commit(page); + if (removed) { + add_page_to_lru_list(zone, page, page_lru(page)); + SetPageLRU(page); + } + spin_unlock_irqrestore(&zone->lru_lock, flags); return; } @@ -3383,9 +3316,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, { struct mem_cgroup *memcg; struct page_cgroup *pc; - struct zone *zone; enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - unsigned long flags; if (mem_cgroup_disabled()) return; @@ -3401,20 +3332,12 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, if (PageSwapBacked(oldpage)) type = MEM_CGROUP_CHARGE_TYPE_SHMEM; - zone = page_zone(newpage); - pc = lookup_page_cgroup(newpage); /* * Even if newpage->mapping was NULL before starting replacement, * the newpage may be on LRU(or pagevec for LRU) already. We lock * LRU while we overwrite pc->mem_cgroup. */ - spin_lock_irqsave(&zone->lru_lock, flags); - if (PageLRU(newpage)) - del_page_from_lru_list(zone, newpage, page_lru(newpage)); - __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type); - if (PageLRU(newpage)) - add_page_to_lru_list(zone, newpage, page_lru(newpage)); - spin_unlock_irqrestore(&zone->lru_lock, flags); + __mem_cgroup_commit_charge_lrucare(newpage, memcg, type); } #ifdef CONFIG_DEBUG_VM From 4e5f01c2b9b94321992acb09c35d34f5ee5bb274 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 12 Jan 2012 17:18:58 -0800 Subject: [PATCH 44/96] memcg: clear pc->mem_cgroup if necessary. This is a preparation before removing a flag PCG_ACCT_LRU in page_cgroup and reducing atomic ops/complexity in memcg LRU handling. In some cases, pages are added to lru before charge to memcg and pages are not classfied to memory cgroup at lru addtion. Now, the lru where the page should be added is determined a bit in page_cgroup->flags and pc->mem_cgroup. I'd like to remove the check of flag. To handle the case pc->mem_cgroup may contain stale pointers if pages are added to LRU before classification. This patch resets pc->mem_cgroup to root_mem_cgroup before lru additions. 
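Put differently, the new rule is an ordering requirement on the allocation paths: the stale owner must be cleared before the page can reach any LRU list. A hedged sketch of how a call site such as migration or swap readahead is expected to use the helper (alloc_some_page() and add_to_lru() are placeholders for the real alloc_page_vma()/LRU-add paths, not kernel functions):

	struct page *prepare_new_page_sketch(void)
	{
		struct page *page = alloc_some_page();

		if (!page)
			return NULL;
		/*
		 * pc->mem_cgroup may still carry the owner from a previous
		 * allocation cycle; point it at root_mem_cgroup before the
		 * page can show up on an LRU list.
		 */
		mem_cgroup_reset_owner(page);
		add_to_lru(page);	/* safe now: the owner is valid */
		return page;
	}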
[akpm@linux-foundation.org: fix CONFIG_CGROUP_MEM_CONT=n build] [hughd@google.com: fix CONFIG_CGROUP_MEM_RES_CTLR=y CONFIG_CGROUP_MEM_RES_CTLR_SWAP=n build] [akpm@linux-foundation.org: ksm.c needs memcontrol.h, per Michal] [hughd@google.com: stop oops in mem_cgroup_reset_owner()] [hughd@google.com: fix page migration to reset_owner] Signed-off-by: KAMEZAWA Hiroyuki Cc: Miklos Szeredi Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Ying Han Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++++ mm/ksm.c | 11 +++++++++++ mm/memcontrol.c | 17 +++++++++++++++++ mm/migrate.c | 2 ++ mm/swap_state.c | 10 ++++++++++ 5 files changed, 45 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b80de520670b..4d34356fe644 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -129,6 +129,7 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, extern void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage); +extern void mem_cgroup_reset_owner(struct page *page); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; #endif @@ -391,6 +392,10 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage) { } + +static inline void mem_cgroup_reset_owner(struct page *page) +{ +} #endif /* CONFIG_CGROUP_MEM_CONT */ #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) diff --git a/mm/ksm.c b/mm/ksm.c index 310544a379ae..1925ffbfb27f 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -1571,6 +1572,16 @@ struct page *ksm_does_need_to_copy(struct page *page, new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (new_page) { + /* + * The memcg-specific accounting when moving + * pages around the LRU lists relies on the + * page's owner (memcg) to be valid. Usually, + * pages are assigned to a new owner before + * being put on the LRU list, but since this + * is not the case here, the stale owner from + * a previous allocation cycle must be reset. + */ + mem_cgroup_reset_owner(new_page); copy_user_highpage(new_page, page, address, vma); SetPageDirty(new_page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d58bb5fa4403..c74102d6eb5a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3050,6 +3050,23 @@ void mem_cgroup_uncharge_end(void) batch->memcg = NULL; } +/* + * A function for resetting pc->mem_cgroup for newly allocated pages. + * This function should be called if the newpage will be added to LRU + * before start accounting. + */ +void mem_cgroup_reset_owner(struct page *newpage) +{ + struct page_cgroup *pc; + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(newpage); + VM_BUG_ON(PageCgroupUsed(pc)); + pc->mem_cgroup = root_mem_cgroup; +} + #ifdef CONFIG_SWAP /* * called after __delete_from_swap_cache() and drop "page" account. diff --git a/mm/migrate.c b/mm/migrate.c index 89ea0854332e..fc391985899f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -777,6 +777,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!newpage) return -ENOMEM; + mem_cgroup_reset_owner(newpage); + if (page_count(page) == 1) { /* page was freed from under us. So we are done. 
*/ goto out; diff --git a/mm/swap_state.c b/mm/swap_state.c index ea6b32d61873..470038a91873 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -300,6 +300,16 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, new_page = alloc_page_vma(gfp_mask, vma, addr); if (!new_page) break; /* Out of memory */ + /* + * The memcg-specific accounting when moving + * pages around the LRU lists relies on the + * page's owner (memcg) to be valid. Usually, + * pages are assigned to a new owner before + * being put on the LRU list, but since this + * is not the case here, the stale owner from + * a previous allocation cycle must be reset. + */ + mem_cgroup_reset_owner(new_page); } /* From 38c5d72f3ebe5ddd57d2f08dc035070fc6c9a287 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 12 Jan 2012 17:19:01 -0800 Subject: [PATCH 45/96] memcg: simplify LRU handling by new rule Now, at LRU handling, memory cgroup needs to do complicated works to see valid pc->mem_cgroup, which may be overwritten. This patch is for relaxing the protocol. This patch guarantees - when pc->mem_cgroup is overwritten, page must not be on LRU. By this, LRU routine can believe pc->mem_cgroup and don't need to check bits on pc->flags. This new rule may adds small overheads to swapin. But in most case, lru handling gets faster. After this patch, PCG_ACCT_LRU bit is obsolete and removed. [akpm@linux-foundation.org: remove unneeded VM_BUG_ON(), restore hannes's christmas tree] [akpm@linux-foundation.org: clean up code comment] [hughd@google.com: fix NULL mem_cgroup_try_charge] Signed-off-by: KAMEZAWA Hiroyuki Cc: Miklos Szeredi Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Ying Han Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 8 --- mm/memcontrol.c | 123 ++++++++++++++++-------------------- 2 files changed, 54 insertions(+), 77 deletions(-) diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 1153095ee457..a2d11771c84b 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -10,8 +10,6 @@ enum { /* flags for mem_cgroup and file and I/O status */ PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ - /* No lock in page_cgroup */ - PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ __NR_PCG_FLAGS, }; @@ -75,12 +73,6 @@ TESTPCGFLAG(Used, USED) CLEARPCGFLAG(Used, USED) SETPCGFLAG(Used, USED) -SETPCGFLAG(AcctLRU, ACCT_LRU) -CLEARPCGFLAG(AcctLRU, ACCT_LRU) -TESTPCGFLAG(AcctLRU, ACCT_LRU) -TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU) - - SETPCGFLAG(FileMapped, FILE_MAPPED) CLEARPCGFLAG(FileMapped, FILE_MAPPED) TESTPCGFLAG(FileMapped, FILE_MAPPED) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c74102d6eb5a..ff051ee8fb4b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1040,30 +1040,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, return &zone->lruvec; pc = lookup_page_cgroup(page); - VM_BUG_ON(PageCgroupAcctLRU(pc)); - /* - * putback: charge: - * SetPageLRU SetPageCgroupUsed - * smp_mb smp_mb - * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU - * - * Ensure that one of the two sides adds the page to the memcg - * LRU during a race. - */ - smp_mb(); - /* - * If the page is uncharged, it may be freed soon, but it - * could also be swap cache (readahead, swapoff) that needs to - * be reclaimable in the future. 
root_mem_cgroup will babysit - * it for the time being. - */ - if (PageCgroupUsed(pc)) { - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - memcg = pc->mem_cgroup; - SetPageCgroupAcctLRU(pc); - } else - memcg = root_mem_cgroup; + memcg = pc->mem_cgroup; mz = page_cgroup_zoneinfo(memcg, page); /* compound_order() is stabilized through lru_lock */ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); @@ -1090,18 +1067,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); - /* - * root_mem_cgroup babysits uncharged LRU pages, but - * PageCgroupUsed is cleared when the page is about to get - * freed. PageCgroupAcctLRU remembers whether the - * LRU-accounting happened against pc->mem_cgroup or - * root_mem_cgroup. - */ - if (TestClearPageCgroupAcctLRU(pc)) { - VM_BUG_ON(!pc->mem_cgroup); - memcg = pc->mem_cgroup; - } else - memcg = root_mem_cgroup; + memcg = pc->mem_cgroup; + VM_BUG_ON(!memcg); mz = page_cgroup_zoneinfo(memcg, page); /* huge page split is done under lru_lock. so, we have no races. */ MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); @@ -2217,8 +2184,25 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, } /* - * Unlike exported interface, "oom" parameter is added. if oom==true, - * oom-killer can be invoked. + * __mem_cgroup_try_charge() does + * 1. detect memcg to be charged against from passed *mm and *ptr, + * 2. update res_counter + * 3. call memory reclaim if necessary. + * + * In some special case, if the task is fatal, fatal_signal_pending() or + * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup + * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon + * as possible without any hazards. 2: all pages should have a valid + * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg + * pointer, that is treated as a charge to root_mem_cgroup. + * + * So __mem_cgroup_try_charge() will return + * 0 ... on success, filling *ptr with a valid memcg pointer. + * -ENOMEM ... charge failure because of resource limits. + * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. + * + * Unlike the exported interface, an "oom" parameter is added. if oom==true, + * the oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, @@ -2247,7 +2231,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, * set, if so charge the init_mm (happens for pagecache usage). */ if (!*ptr && !mm) - goto bypass; + *ptr = root_mem_cgroup; again: if (*ptr) { /* css should be a valid one */ memcg = *ptr; @@ -2273,7 +2257,9 @@ again: * task-struct. So, mm->owner can be NULL. */ memcg = mem_cgroup_from_task(p); - if (!memcg || mem_cgroup_is_root(memcg)) { + if (!memcg) + memcg = root_mem_cgroup; + if (mem_cgroup_is_root(memcg)) { rcu_read_unlock(); goto done; } @@ -2348,8 +2334,8 @@ nomem: *ptr = NULL; return -ENOMEM; bypass: - *ptr = NULL; - return 0; + *ptr = root_mem_cgroup; + return -EINTR; } /* @@ -2457,6 +2443,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); unlock_page_cgroup(pc); + WARN_ON_ONCE(PageLRU(page)); /* * "charge_statistics" updated event counter. Then, check it. * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 
@@ -2468,7 +2455,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ - (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) + (1 << PCG_MIGRATION)) /* * Because tail pages are not marked as "used", set it. We're under * zone->lru_lock, 'splitting on pmd' and compound_lock. @@ -2478,7 +2465,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, void mem_cgroup_split_huge_fixup(struct page *head) { struct page_cgroup *head_pc = lookup_page_cgroup(head); + struct mem_cgroup_per_zone *mz; struct page_cgroup *pc; + enum lru_list lru; int i; if (mem_cgroup_disabled()) @@ -2487,23 +2476,15 @@ void mem_cgroup_split_huge_fixup(struct page *head) pc = head_pc + i; pc->mem_cgroup = head_pc->mem_cgroup; smp_wmb();/* see __commit_charge() */ - /* - * LRU flags cannot be copied because we need to add tail - * page to LRU by generic call and our hooks will be called. - */ pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; } - - if (PageCgroupAcctLRU(head_pc)) { - enum lru_list lru; - struct mem_cgroup_per_zone *mz; - /* - * We hold lru_lock, then, reduce counter directly. - */ - lru = page_lru(head); - mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); - MEM_CGROUP_ZSTAT(mz, lru) -= HPAGE_PMD_NR - 1; - } + /* + * Tail pages will be added to LRU. + * We hold lru_lock,then,reduce counter directly. + */ + lru = page_lru(head); + mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); + MEM_CGROUP_ZSTAT(mz, lru) -= HPAGE_PMD_NR - 1; } #endif @@ -2620,7 +2601,7 @@ static int mem_cgroup_move_parent(struct page *page, parent = mem_cgroup_from_cont(pcg); ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); - if (ret || !parent) + if (ret) goto put_back; if (nr_pages > 1) @@ -2667,9 +2648,8 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, pc = lookup_page_cgroup(page); ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); - if (ret || !memcg) + if (ret == -ENOMEM) return ret; - __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); return 0; } @@ -2736,10 +2716,9 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (!page_is_file_cache(page)) type = MEM_CGROUP_CHARGE_TYPE_SHMEM; - if (!PageSwapCache(page)) { + if (!PageSwapCache(page)) ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); - WARN_ON_ONCE(PageLRU(page)); - } else { /* page is swapcache/shmem */ + else { /* page is swapcache/shmem */ ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); if (!ret) __mem_cgroup_commit_charge_swapin(page, memcg, type); @@ -2781,11 +2760,16 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, *memcgp = memcg; ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); css_put(&memcg->css); + if (ret == -EINTR) + ret = 0; return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); + ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); + if (ret == -EINTR) + ret = 0; + return ret; } static void @@ -3245,7 +3229,7 @@ int mem_cgroup_prepare_migration(struct page *page, *memcgp = memcg; ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); css_put(&memcg->css);/* drop extra refcnt */ - if (ret || *memcgp == NULL) { + if (ret) { if (PageAnon(page)) { lock_page_cgroup(pc); ClearPageCgroupMigration(pc); @@ -3255,6 +3239,7 @@ int mem_cgroup_prepare_migration(struct page *page, */ mem_cgroup_uncharge_page(page); 
} + /* we'll need to revisit this error code (we have -EINTR) */ return -ENOMEM; } /* @@ -3674,7 +3659,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, pc = lookup_page_cgroup(page); ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); - if (ret == -ENOMEM) + if (ret == -ENOMEM || ret == -EINTR) break; if (ret == -EBUSY || ret == -EINVAL) { @@ -5065,9 +5050,9 @@ one_by_one: } ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &memcg, false); - if (ret || !memcg) + if (ret) /* mem_cgroup_clear_mc() will do uncharge later */ - return -ENOMEM; + return ret; mc.precharge++; } return ret; From 3ed28fa1080c73747ce17f2025b28b062fb5aa7f Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Thu, 12 Jan 2012 17:19:04 -0800 Subject: [PATCH 46/96] memcg: cleanup for_each_node_state() We already have for_each_node(node) define in nodemask.h, better to use it. Signed-off-by: Bob Liu Acked-by: Michal Hocko Cc: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ff051ee8fb4b..609c49f492e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -565,7 +565,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; - for_each_node_state(node, N_POSSIBLE) { + for_each_node(node) { for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = mem_cgroup_zoneinfo(memcg, node, zone); mctz = soft_limit_tree_node_zone(node, zone); @@ -4818,7 +4818,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) mem_cgroup_remove_from_trees(memcg); free_css_id(&mem_cgroup_subsys, &memcg->css); - for_each_node_state(node, N_POSSIBLE) + for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); free_percpu(memcg->stat); @@ -4877,7 +4877,7 @@ static int mem_cgroup_soft_limit_tree_init(void) struct mem_cgroup_tree_per_zone *rtpz; int tmp, node, zone; - for_each_node_state(node, N_POSSIBLE) { + for_each_node(node) { tmp = node; if (!node_state(node, N_NORMAL_MEMORY)) tmp = -1; @@ -4896,7 +4896,7 @@ static int mem_cgroup_soft_limit_tree_init(void) return 0; err_cleanup: - for_each_node_state(node, N_POSSIBLE) { + for_each_node(node) { if (!soft_limit_tree.rb_tree_per_node[node]) break; kfree(soft_limit_tree.rb_tree_per_node[node]); @@ -4917,7 +4917,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) if (!memcg) return ERR_PTR(error); - for_each_node_state(node, N_POSSIBLE) + for_each_node(node) if (alloc_mem_cgroup_per_zone_info(memcg, node)) goto free_out; From d0048b0e59c1218d62bb4d014f34bbd7e7c0a214 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Thu, 12 Jan 2012 17:19:07 -0800 Subject: [PATCH 47/96] page_alloc: break early in check_for_regular_memory() If there is a zone below ZONE_NORMAL has present_pages, we can set node state to N_NORMAL_MEMORY, no need to loop to end. 
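As a side note for illustration (not part of the patch): N_NORMAL_MEMORY is a single per-node bit in the global node_states[] nodemask, and its consumers only test or iterate that mask, so one node_set_state() per node already carries everything the remaining zone iterations could add. A minimal consumer-side sketch, assuming only the standard nodemask helpers:

	#include <linux/nodemask.h>
	#include <linux/printk.h>

	/* Sketch: walk the nodes that ended up flagged as having lowmem. */
	static void __maybe_unused show_lowmem_nodes(void)
	{
		int nid;

		for_each_node_state(nid, N_NORMAL_MEMORY)
			pr_info("node %d has memory at or below ZONE_NORMAL\n", nid);
	}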
Signed-off-by: Bob Liu Acked-by: Michal Hocko Cc: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 25c248eb7d5f..9404b38bcdc5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4642,8 +4642,10 @@ static void check_for_regular_memory(pg_data_t *pgdat) for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; - if (zone->present_pages) + if (zone->present_pages) { node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); + break; + } } #endif } From 0efc8eb9c6a177836dac88b2cbb8815f9e4f8d5a Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Thu, 12 Jan 2012 17:19:08 -0800 Subject: [PATCH 48/96] page_cgroup: drop multi CONFIG_MEMORY_HOTPLUG No need for two CONFIG_MEMORY_HOTPLUG blocks. Signed-off-by: Bob Liu Acked-by: Michal Hocko Cc: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index b99d19edf89b..de1616aa9b1e 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -124,22 +124,6 @@ static void *__meminit alloc_page_cgroup(size_t size, int nid) return addr; } -#ifdef CONFIG_MEMORY_HOTPLUG -static void free_page_cgroup(void *addr) -{ - if (is_vmalloc_addr(addr)) { - vfree(addr); - } else { - struct page *page = virt_to_page(addr); - size_t table_size = - sizeof(struct page_cgroup) * PAGES_PER_SECTION; - - BUG_ON(PageReserved(page)); - free_pages_exact(addr, table_size); - } -} -#endif - static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) { struct mem_section *section; @@ -176,6 +160,20 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) return 0; } #ifdef CONFIG_MEMORY_HOTPLUG +static void free_page_cgroup(void *addr) +{ + if (is_vmalloc_addr(addr)) { + vfree(addr); + } else { + struct page *page = virt_to_page(addr); + size_t table_size = + sizeof(struct page_cgroup) * PAGES_PER_SECTION; + + BUG_ON(PageReserved(page)); + free_pages_exact(addr, table_size); + } +} + void __free_page_cgroup(unsigned long pfn) { struct mem_section *ms; From 569e55900a5c3c30de6e25c3f259ae7c7dbadb96 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 12 Jan 2012 17:19:11 -0800 Subject: [PATCH 49/96] thp: improve the error code path Improve the error code path. Delete unnecessary sysfs file for example. Also remove the #ifdef xxx to make code better. 
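The structure of the fix is the familiar unwind-with-goto idiom: each setup step gets a label that tears down only what was already created, so a failure halfway through (say, the second sysfs group) removes the first group and drops the kobject instead of leaking them. A stripped-down, stand-alone sketch of the idiom, with hypothetical step_a()/step_b() helpers standing in for the kobject/sysfs calls:

	#include <stdio.h>

	/* Hypothetical setup steps standing in for kobject/sysfs group creation. */
	static int step_a(void) { return 0; }
	static int step_b(void) { return -1; }	/* pretend the second step fails */
	static void undo_a(void) { puts("undo step a"); }

	static int init_example(void)
	{
		int err;

		err = step_a();
		if (err)
			return err;		/* nothing to undo yet */

		err = step_b();
		if (err)
			goto cleanup_a;		/* unwind only what succeeded */

		return 0;

	cleanup_a:
		undo_a();
		return err;
	}

	int main(void)
	{
		return init_example() ? 1 : 0;
	}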
Signed-off-by: Shaohua Li Reviewed-by: Andrea Arcangeli Cc: David Rientjes Cc: Johannes Weiner Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 79 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index db522e160cca..763711121ef5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -487,41 +487,68 @@ static struct attribute_group khugepaged_attr_group = { .attrs = khugepaged_attr, .name = "khugepaged", }; + +static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) +{ + int err; + + *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!*hugepage_kobj)) { + printk(KERN_ERR "hugepage: failed kobject create\n"); + return -ENOMEM; + } + + err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto delete_obj; + } + + err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto remove_hp_group; + } + + return 0; + +remove_hp_group: + sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); +delete_obj: + kobject_put(*hugepage_kobj); + return err; +} + +static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) +{ + sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); + sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); + kobject_put(hugepage_kobj); +} +#else +static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) +{ + return 0; +} + +static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) +{ +} #endif /* CONFIG_SYSFS */ static int __init hugepage_init(void) { int err; -#ifdef CONFIG_SYSFS - static struct kobject *hugepage_kobj; -#endif + struct kobject *hugepage_kobj; - err = -EINVAL; if (!has_transparent_hugepage()) { transparent_hugepage_flags = 0; - goto out; + return -EINVAL; } -#ifdef CONFIG_SYSFS - err = -ENOMEM; - hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); - if (unlikely(!hugepage_kobj)) { - printk(KERN_ERR "hugepage: failed kobject create\n"); - goto out; - } - - err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); - if (err) { - printk(KERN_ERR "hugepage: failed register hugeage group\n"); - goto out; - } - - err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); - if (err) { - printk(KERN_ERR "hugepage: failed register hugeage group\n"); - goto out; - } -#endif + err = hugepage_init_sysfs(&hugepage_kobj); + if (err) + return err; err = khugepaged_slab_init(); if (err) @@ -545,7 +572,9 @@ static int __init hugepage_init(void) set_recommended_min_free_kbytes(); + return 0; out: + hugepage_exit_sysfs(hugepage_kobj); return err; } module_init(hugepage_init) From e5591307f0c1eb733d280a0b72473e01d7f88530 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 12 Jan 2012 17:19:13 -0800 Subject: [PATCH 50/96] thp: remove unnecessary tlb flush for mprotect change_protection() will do TLB flush later, don't need duplicate tlb flush. 
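The removed flush is redundant because the generic mprotect path performs a single range flush after all entries have been updated, so the per-pmd flush only added extra invalidations. A paraphrased, approximate sketch of the caller (not the verbatim mm/mprotect.c code; change_one_entry() is an invented placeholder for the page-table walk):

	static void change_protection_sketch(struct vm_area_struct *vma,
					     unsigned long start, unsigned long end,
					     pgprot_t newprot)
	{
		unsigned long addr = start;

		while (addr < end)
			/* updates a pte or huge pmd and returns the next address;
			 * huge pmds go through change_huge_pmd(), which after this
			 * patch no longer flushes on its own */
			addr = change_one_entry(vma, addr, newprot);

		/* one flush at the end covers every entry touched above */
		flush_tlb_range(vma, start, end);
	}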
Signed-off-by: Shaohua Li Reviewed-by: Andrea Arcangeli Cc: David Rientjes Cc: Johannes Weiner Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 763711121ef5..964fc5a2edd2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1145,7 +1145,6 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_modify(entry, newprot); set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); - flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); ret = 1; } } else From f21760b15dcd091e5afd38d0b97197b45f7ef2ea Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 12 Jan 2012 17:19:16 -0800 Subject: [PATCH 51/96] thp: add tlb_remove_pmd_tlb_entry We have tlb_remove_tlb_entry to indicate a pte tlb flush entry should be flushed, but not a corresponding API for pmd entry. This isn't a problem so far because THP is only for x86 currently and tlb_flush() under x86 will flush the entire TLB. But this is confusing and could be missed if THP is ported to another architecture. Also convert tlb->need_flush = 1 to a VM_BUG_ON(!tlb->need_flush) in __tlb_remove_page() as suggested by Andrea Arcangeli. The __tlb_remove_page() function is supposed to be called after tlb_remove_xxx_tlb_entry() and we can catch any misuse.
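The generic definition stays a no-op, but the dedicated hook gives an architecture with finer-grained invalidation somewhere to record a huge-page-sized range. A hypothetical example of what such an arch might put in its asm/tlb.h (tlb_track_pmd_range() is an invented name used purely for illustration, not an existing API):

	/* Hypothetical arch override -- illustration only. */
	#define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address)			\
		do {								\
			tlb_track_pmd_range(tlb, address,			\
					    (address) + HPAGE_PMD_SIZE);	\
		} while (0)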
+ */ +#ifndef __tlb_remove_pmd_tlb_entry +#define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0) +#endif + +#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ + do { \ + tlb->need_flush = 1; \ + __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ + } while (0) + #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb->need_flush = 1; \ diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a9ace9c32507..1b921299abc4 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -18,7 +18,7 @@ extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, unsigned int flags); extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, - pmd_t *pmd); + pmd_t *pmd, unsigned long addr); extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 964fc5a2edd2..5a595554bd8c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1026,7 +1026,7 @@ out: } int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, - pmd_t *pmd) + pmd_t *pmd, unsigned long addr) { int ret = 0; @@ -1042,6 +1042,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pgtable = get_pmd_huge_pte(tlb->mm); page = pmd_page(*pmd); pmd_clear(pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); page_remove_rmap(page); VM_BUG_ON(page_mapcount(page) < 0); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); diff --git a/mm/memory.c b/mm/memory.c index 829d43735402..5e30583c2605 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -293,7 +293,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { struct mmu_gather_batch *batch; - tlb->need_flush = 1; + VM_BUG_ON(!tlb->need_flush); if (tlb_fast_mode(tlb)) { free_page_and_swap_cache(page); @@ -1231,7 +1231,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (next-addr != HPAGE_PMD_SIZE) { VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); - } else if (zap_huge_pmd(tlb, vma, pmd)) + } else if (zap_huge_pmd(tlb, vma, pmd, addr)) continue; /* fall through */ } From 45676885b76237a4c236d26fe20a9b0cfdb2eb22 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 12 Jan 2012 17:19:18 -0800 Subject: [PATCH 52/96] thp: improve order in lru list for split huge page Put the tail subpages of an isolated hugepage under splitting in the lru reclaim head as they supposedly should be isolated too next. Queues the subpages in physical order in the lru for non isolated hugepages under splitting. That might provide some theoretical cache benefit to the buddy allocator later. 
Signed-off-by: Shaohua Li Signed-off-by: Andrea Arcangeli Cc: David Rientjes Cc: Johannes Weiner Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 ++--- mm/swap.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5a595554bd8c..76cc3f7dd4f0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1228,7 +1228,6 @@ static int __split_huge_page_splitting(struct page *page, static void __split_huge_page_refcount(struct page *page) { int i; - unsigned long head_index = page->index; struct zone *zone = page_zone(page); int zonestat; int tail_count = 0; @@ -1239,7 +1238,7 @@ static void __split_huge_page_refcount(struct page *page) /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(page); - for (i = 1; i < HPAGE_PMD_NR; i++) { + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { struct page *page_tail = page + i; /* tail_page->_mapcount cannot change */ @@ -1302,7 +1301,7 @@ static void __split_huge_page_refcount(struct page *page) BUG_ON(page_tail->mapping); page_tail->mapping = page->mapping; - page_tail->index = ++head_index; + page_tail->index = page->index + i; BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); diff --git a/mm/swap.c b/mm/swap.c index 126da2919f60..ddccf8e0b4ae 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -681,7 +681,7 @@ void lru_add_page_tail(struct zone* zone, if (likely(PageLRU(page))) list_add(&page_tail->lru, page->lru.prev); else - list_add(&page_tail->lru, &lruvec->lists[lru]); + list_add(&page_tail->lru, lruvec->lists[lru].prev); __mod_zone_page_state(zone, NR_LRU_BASE + lru, hpage_nr_pages(page_tail)); } else { From ea4d349ffa8028c655236497c2ba17c17aaa0d65 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Jan 2012 17:19:20 -0800 Subject: [PATCH 53/96] vmscan/trace: Add 'file' info to trace_mm_vmscan_lru_isolate() In trace_mm_vmscan_lru_isolate(), we don't output 'file' information to the trace event and it is a bit inconvenient for the user to get the real information(like pasted below). 
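The swap.c hunk below hinges on the semantics of the kernel's circular doubly-linked lists: list_add(new, head) inserts right after the head (the front of the list), while list_add(new, head->prev) inserts after the last element, i.e. at the tail, which is how the tail subpages end up queued in physical order. A small stand-alone demo of that difference, using a toy re-implementation of the list primitives rather than <linux/list.h>:

	#include <stdio.h>

	struct node { int val; struct node *prev, *next; };

	static void list_init(struct node *h) { h->prev = h->next = h; }

	/* Mirrors the kernel's list_add(): insert 'n' right after 'pos'. */
	static void list_add_after(struct node *n, struct node *pos)
	{
		n->next = pos->next;
		n->prev = pos;
		pos->next->prev = n;
		pos->next = n;
	}

	int main(void)
	{
		struct node head, a = { .val = 1 }, b = { .val = 2 };
		struct node *p;

		list_init(&head);
		list_add_after(&a, &head);	/* front insert: like list_add(&a, &head)     */
		list_add_after(&b, head.prev);	/* tail insert:  like list_add(&b, head.prev) */

		for (p = head.next; p != &head; p = p->next)
			printf("%d ", p->val);	/* prints "1 2": a at the front, b at the tail */
		printf("\n");
		return 0;
	}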
mm_vmscan_lru_isolate: isolate_mode=2 order=0 nr_requested=32 nr_scanned=32 nr_taken=32 contig_taken=0 contig_dirty=0 contig_failed=0 'active' can be obtained by analyzing mode(Thanks go to Minchan and Mel), So this patch adds 'file' to the trace event and it now looks like: mm_vmscan_lru_isolate: isolate_mode=2 order=0 nr_requested=32 nr_scanned=32 nr_taken=32 contig_taken=0 contig_dirty=0 contig_failed=0 file=0 Signed-off-by: Tao Ma Acked-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Cc: Mel Gorman Reviewed-by: Minchan Kim Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 22 ++++++++++++++-------- mm/vmscan.c | 2 +- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index edc4b3d25a2d..f64560e204bc 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -266,9 +266,10 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, unsigned long nr_lumpy_taken, unsigned long nr_lumpy_dirty, unsigned long nr_lumpy_failed, - isolate_mode_t isolate_mode), + isolate_mode_t isolate_mode, + int file), - TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode), + TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file), TP_STRUCT__entry( __field(int, order) @@ -279,6 +280,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, __field(unsigned long, nr_lumpy_dirty) __field(unsigned long, nr_lumpy_failed) __field(isolate_mode_t, isolate_mode) + __field(int, file) ), TP_fast_assign( @@ -290,9 +292,10 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, __entry->nr_lumpy_dirty = nr_lumpy_dirty; __entry->nr_lumpy_failed = nr_lumpy_failed; __entry->isolate_mode = isolate_mode; + __entry->file = file; ), - TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu contig_taken=%lu contig_dirty=%lu contig_failed=%lu", + TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu contig_taken=%lu contig_dirty=%lu contig_failed=%lu file=%d", __entry->isolate_mode, __entry->order, __entry->nr_requested, @@ -300,7 +303,8 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, __entry->nr_taken, __entry->nr_lumpy_taken, __entry->nr_lumpy_dirty, - __entry->nr_lumpy_failed) + __entry->nr_lumpy_failed, + __entry->file) ); DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, @@ -312,9 +316,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, unsigned long nr_lumpy_taken, unsigned long nr_lumpy_dirty, unsigned long nr_lumpy_failed, - isolate_mode_t isolate_mode), + isolate_mode_t isolate_mode, + int file), - TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode) + TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file) ); @@ -327,9 +332,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate, unsigned long nr_lumpy_taken, unsigned long nr_lumpy_dirty, unsigned long nr_lumpy_failed, - isolate_mode_t isolate_mode), + isolate_mode_t isolate_mode, + int file), - TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode) + TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file) ); diff 
--git a/mm/vmscan.c b/mm/vmscan.c index e16ca8384ef7..a85a261bf8f9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1234,7 +1234,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, nr_to_scan, scan, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, - mode); + mode, file); return nr_taken; } From a77ebd333cd810d7b680d544be88c875131c2bd3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:22 -0800 Subject: [PATCH 54/96] mm: compaction: allow compaction to isolate dirty pages Short summary: There are severe stalls when a USB stick using VFAT is used with THP enabled that are reduced by this series. If you are experiencing this problem, please test and report back and considering I have seen complaints from openSUSE and Fedora users on this as well as a few private mails, I'm guessing it's a widespread issue. This is a new type of USB-related stall because it is due to synchronous compaction writing where as in the past the big problem was dirty pages reaching the end of the LRU and being written by reclaim. Am cc'ing Andrew this time and this series would replace mm-do-not-stall-in-synchronous-compaction-for-thp-allocations.patch. I'm also cc'ing Dave Jones as he might have merged that patch to Fedora for wider testing and ideally it would be reverted and replaced by this series. That said, the later patches could really do with some review. If this series is not the answer then a new direction needs to be discussed because as it is, the stalls are unacceptable as the results in this leader show. For testers that try backporting this to 3.1, it won't work because there is a non-obvious dependency on not writing back pages in direct reclaim so you need those patches too. Changelog since V5 o Rebase to 3.2-rc5 o Tidy up the changelogs a bit Changelog since V4 o Added reviewed-bys, credited Andrea properly for sync-light o Allow dirty pages without mappings to be considered for migration o Bound the number of pages freed for compaction o Isolate PageReclaim pages on their own LRU list This is against 3.2-rc5 and follows on from discussions on "mm: Do not stall in synchronous compaction for THP allocations" and "[RFC PATCH 0/5] Reduce compaction-related stalls". Initially, the proposed patch eliminated stalls due to compaction which sometimes resulted in user-visible interactivity problems on browsers by simply never using sync compaction. The downside was that THP success allocation rates were lower because dirty pages were not being migrated as reported by Andrea. His approach at fixing this was nacked on the grounds that it reverted fixes from Rik merged that reduced the amount of pages reclaimed as it severely impacted his workloads performance. This series attempts to reconcile the requirements of maximising THP usage, without stalling in a user-visible fashion due to compaction or cheating by reclaiming an excessive number of pages. Patch 1 partially reverts commit 39deaf85 to allow migration to isolate dirty pages. This is because migration can move some dirty pages without blocking. Patch 2 notes that the /proc/sys/vm/compact_memory handler is not using synchronous compaction when it should be. This is unrelated to the reported stalls but is worth fixing. Patch 3 checks if we isolated a compound page during lumpy scan and account for it properly. For the most part, this affects tracing so it's unrelated to the stalls but worth fixing. 
Patch 4 notes that it is possible to abort reclaim early for compaction and return 0 to the page allocator potentially entering the "may oom" path. This has not been observed in practice but the rest of the series potentially makes it easier to happen. Patch 5 adds a sync parameter to the migratepage callback and gives the callback responsibility for migrating the page without blocking if sync==false. For example, fallback_migrate_page will not call writepage if sync==false. This increases the number of pages that can be handled by asynchronous compaction thereby reducing stalls. Patch 6 restores filter-awareness to isolate_lru_page for migration. In practice, it means that pages under writeback and pages without a ->migratepage callback will not be isolated for migration. Patch 7 avoids calling direct reclaim if compaction is deferred but makes sure that compaction is only deferred if sync compaction was used. Patch 8 introduces a sync-light migration mechanism that sync compaction uses. The objective is to allow some stalls but to not call ->writepage which can lead to significant user-visible stalls. Patch 9 notes that while we want to abort reclaim ASAP to allow compaction to go ahead, doing so leaves only a very small window of opportunity for compaction to run. This patch allows more pages to be freed by reclaim but bounds the number to a reasonable level based on the high watermark on each zone. Patch 10 allows slabs to be shrunk even after compaction_ready() is true for one zone. This is to avoid a problem whereby a single small zone can abort reclaim even though no pages have been reclaimed and no suitably large zone is in a usable state. Patch 11 fixes a problem with the rate of page scanning. As reclaim is rarely stalling on pages under writeback it means that scan rates are very high. This is particularly true for direct reclaim which is not calling writepage. The vmstat figures implied that much of this was busy work with PageReclaim pages marked for immediate reclaim. This patch is a prototype that moves these pages to their own LRU list. This has been tested and other than 2 USB keys getting trashed, nothing horrible fell out. That said, I am a bit unhappy with the rescue logic in patch 11 but did not find a better way around it. It does significantly reduce scan rates and System CPU time indicating it is the right direction to take. What is of critical importance is that stalls due to compaction are massively reduced even though sync compaction was still allowed. Testing from people complaining about stalls copying to USBs with THP enabled is particularly welcome. The following tests all involve THP usage and USB keys in some way. Each test follows this type of pattern 1. Read from some fast storage, be it raw device or file. Each time the copy finishes, start again until the test ends 2. Write a large file to a filesystem on a USB stick. Each time the copy finishes, start again until the test ends 3. When memory is low, start an alloc process that creates a mapping the size of physical memory to stress THP allocation. This is the "real" part of the test and the part that is meant to trigger stalls when THP is enabled. Copying continues in the background. 4. Record the CPU usage and execution time of the alloc process 5. Record the number of THP allocs and fallbacks as well as the number of THP pages in use at the end of the test just before alloc exited 6. Run the test 5 times to get an idea of variability 7.
Between each run, sync is run and caches dropped and the test waits until nr_dirty is a small number to avoid interference or caching between iterations that would skew the figures. The individual tests were then writebackCPDeviceBasevfat Disable THP, read from a raw device (sda), vfat on USB stick writebackCPDeviceBaseext4 Disable THP, read from a raw device (sda), ext4 on USB stick writebackCPDevicevfat THP enabled, read from a raw device (sda), vfat on USB stick writebackCPDeviceext4 THP enabled, read from a raw device (sda), ext4 on USB stick writebackCPFilevfat THP enabled, read from a file on fast storage and USB, both vfat writebackCPFileext4 THP enabled, read from a file on fast storage and USB, both ext4 The kernels tested were 3.1 3.1 vanilla 3.2-rc5 freemore Patches 1-10 immediate Patches 1-11 andrea The 8 patches Andrea posted as a basis of comparison The results are very long unfortunately. I'll start with the case where we are not using THP at all writebackCPDeviceBasevfat 3.1.0-vanilla rc5-vanilla freemore-v6r1 isolate-v6r1 andrea-v2r1 System Time 1.28 ( 0.00%) 54.49 (-4143.46%) 48.63 (-3687.69%) 4.69 ( -265.11%) 51.88 (-3940.81%) +/- 0.06 ( 0.00%) 2.45 (-4305.55%) 4.75 (-8430.57%) 7.46 (-13282.76%) 4.76 (-8440.70%) User Time 0.09 ( 0.00%) 0.05 ( 40.91%) 0.06 ( 29.55%) 0.07 ( 15.91%) 0.06 ( 27.27%) +/- 0.02 ( 0.00%) 0.01 ( 45.39%) 0.02 ( 25.07%) 0.00 ( 77.06%) 0.01 ( 52.24%) Elapsed Time 110.27 ( 0.00%) 56.38 ( 48.87%) 49.95 ( 54.70%) 11.77 ( 89.33%) 53.43 ( 51.54%) +/- 7.33 ( 0.00%) 3.77 ( 48.61%) 4.94 ( 32.63%) 6.71 ( 8.50%) 4.76 ( 35.03%) THP Active 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) +/- 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Fault Alloc 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) +/- 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Fault Fallback 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) +/- 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) The THP figures are obviously all 0 because THP was enabled. The main thing to watch is the elapsed times and how they compare to times when THP is enabled later. It's also important to note that elapsed time is improved by this series as System CPu time is much reduced. 
writebackCPDevicevfat 3.1.0-vanilla rc5-vanilla freemore-v6r1 isolate-v6r1 andrea-v2r1 System Time 1.22 ( 0.00%) 13.89 (-1040.72%) 46.40 (-3709.20%) 4.44 ( -264.37%) 47.37 (-3789.33%) +/- 0.06 ( 0.00%) 22.82 (-37635.56%) 3.84 (-6249.44%) 6.48 (-10618.92%) 6.60 (-10818.53%) User Time 0.06 ( 0.00%) 0.06 ( -6.90%) 0.05 ( 17.24%) 0.05 ( 13.79%) 0.04 ( 31.03%) +/- 0.01 ( 0.00%) 0.01 ( 33.33%) 0.01 ( 33.33%) 0.01 ( 39.14%) 0.01 ( 25.46%) Elapsed Time 10445.54 ( 0.00%) 2249.92 ( 78.46%) 70.06 ( 99.33%) 16.59 ( 99.84%) 472.43 ( 95.48%) +/- 643.98 ( 0.00%) 811.62 ( -26.03%) 10.02 ( 98.44%) 7.03 ( 98.91%) 59.99 ( 90.68%) THP Active 15.60 ( 0.00%) 35.20 ( 225.64%) 65.00 ( 416.67%) 70.80 ( 453.85%) 62.20 ( 398.72%) +/- 18.48 ( 0.00%) 51.29 ( 277.59%) 15.99 ( 86.52%) 37.91 ( 205.18%) 22.02 ( 119.18%) Fault Alloc 121.80 ( 0.00%) 76.60 ( 62.89%) 155.40 ( 127.59%) 181.20 ( 148.77%) 286.60 ( 235.30%) +/- 73.51 ( 0.00%) 61.11 ( 83.12%) 34.89 ( 47.46%) 31.88 ( 43.36%) 68.13 ( 92.68%) Fault Fallback 881.20 ( 0.00%) 926.60 ( -5.15%) 847.60 ( 3.81%) 822.00 ( 6.72%) 716.60 ( 18.68%) +/- 73.51 ( 0.00%) 61.26 ( 16.67%) 34.89 ( 52.54%) 31.65 ( 56.94%) 67.75 ( 7.84%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 3540.88 1945.37 716.04 64.97 1937.03 Total Elapsed Time (seconds) 52417.33 11425.90 501.02 230.95 2520.28 The first thing to note is the "Elapsed Time" for the vanilla kernels of 2249 seconds versus 56 with THP disabled which might explain the reports of USB stalls with THP enabled. Applying the patches brings performance in line with THP-disabled performance while isolating pages for immediate reclaim from the LRU cuts down System CPU time. The "Fault Alloc" success rate figures are also improved. The vanilla kernel only managed to allocate 76.6 pages on average over the course of 5 iterations where as applying the series allocated 181.20 on average albeit it is well within variance. It's worth noting that applies the series at least descreases the amount of variance which implies an improvement. Andrea's series had a higher success rate for THP allocations but at a severe cost to elapsed time which is still better than vanilla but still much worse than disabling THP altogether. One can bring my series close to Andrea's by removing this check /* * If compaction is deferred for high-order allocations, it is because * sync compaction recently failed. In this is the case and the caller * has requested the system not be heavily disrupted, fail the * allocation now instead of entering direct reclaim */ if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; I didn't include a patch that removed the above check because hurting overall performance to improve the THP figure is not what the average user wants. It's something to consider though if someone really wants to maximise THP usage no matter what it does to the workload initially. This is summary of vmstat figures from the same test. 
3.1.0-vanilla rc5-vanilla freemore-v6r1 isolate-v6r1 andrea-v2r1 Page Ins 3257266139 1111844061 17263623 10901575 161423219 Page Outs 81054922 30364312 3626530 3657687 8753730 Swap Ins 3294 2851 6560 4964 4592 Swap Outs 390073 528094 620197 790912 698285 Direct pages scanned 1077581700 3024951463 1764930052 115140570 5901188831 Kswapd pages scanned 34826043 7112868 2131265 1686942 1893966 Kswapd pages reclaimed 28950067 4911036 1246044 966475 1497726 Direct pages reclaimed 805148398 280167837 3623473 2215044 40809360 Kswapd efficiency 83% 69% 58% 57% 79% Kswapd velocity 664.399 622.521 4253.852 7304.360 751.490 Direct efficiency 74% 9% 0% 1% 0% Direct velocity 20557.737 264745.137 3522673.849 498551.938 2341481.435 Percentage direct scans 96% 99% 99% 98% 99% Page writes by reclaim 722646 529174 620319 791018 699198 Page writes file 332573 1080 122 106 913 Page writes anon 390073 528094 620197 790912 698285 Page reclaim immediate 0 2552514720 1635858848 111281140 5478375032 Page rescued immediate 0 0 0 87848 0 Slabs scanned 23552 23552 9216 8192 9216 Direct inode steals 231 0 0 0 0 Kswapd inode steals 0 0 0 0 0 Kswapd skipped wait 28076 786 0 61 6 THP fault alloc 609 383 753 906 1433 THP collapse alloc 12 6 0 0 6 THP splits 536 211 456 593 1136 THP fault fallback 4406 4633 4263 4110 3583 THP collapse fail 120 127 0 0 4 Compaction stalls 1810 728 623 779 3200 Compaction success 196 53 60 80 123 Compaction failures 1614 675 563 699 3077 Compaction pages moved 193158 53545 243185 333457 226688 Compaction move failure 9952 9396 16424 23676 45070 The main things to look at are 1. Page In/out figures are much reduced by the series. 2. Direct page scanning is incredibly high (264745.137 pages scanned per second on the vanilla kernel) but isolating PageReclaim pages on their own list reduces the number of pages scanned significantly. 3. The fact that "Page rescued immediate" is a positive number implies that we sometimes race removing pages from the LRU_IMMEDIATE list that need to be put back on a normal LRU but it happens only for 0.07% of the pages marked for immediate reclaim. writebackCPDeviceext4 3.1.0-vanilla rc5-vanilla freemore-v6r1 isolate-v6r1 andrea-v2r1 System Time 1.51 ( 0.00%) 1.77 ( -17.66%) 1.46 ( 2.92%) 1.15 ( 23.77%) 1.89 ( -25.63%) +/- 0.27 ( 0.00%) 0.67 ( -148.52%) 0.33 ( -22.76%) 0.30 ( -11.15%) 0.19 ( 30.16%) User Time 0.03 ( 0.00%) 0.04 ( -37.50%) 0.05 ( -62.50%) 0.07 ( -112.50%) 0.04 ( -18.75%) +/- 0.01 ( 0.00%) 0.02 ( -146.64%) 0.02 ( -97.91%) 0.02 ( -75.59%) 0.02 ( -63.30%) Elapsed Time 124.93 ( 0.00%) 114.49 ( 8.36%) 96.77 ( 22.55%) 27.48 ( 78.00%) 205.70 ( -64.65%) +/- 20.20 ( 0.00%) 74.39 ( -268.34%) 59.88 ( -196.48%) 7.72 ( 61.79%) 25.03 ( -23.95%) THP Active 161.80 ( 0.00%) 83.60 ( 51.67%) 141.20 ( 87.27%) 84.60 ( 52.29%) 82.60 ( 51.05%) +/- 71.95 ( 0.00%) 43.80 ( 60.88%) 26.91 ( 37.40%) 59.02 ( 82.03%) 52.13 ( 72.45%) Fault Alloc 471.40 ( 0.00%) 228.60 ( 48.49%) 282.20 ( 59.86%) 225.20 ( 47.77%) 388.40 ( 82.39%) +/- 88.07 ( 0.00%) 87.42 ( 99.26%) 73.79 ( 83.78%) 109.62 ( 124.47%) 82.62 ( 93.81%) Fault Fallback 531.60 ( 0.00%) 774.60 ( -45.71%) 720.80 ( -35.59%) 777.80 ( -46.31%) 614.80 ( -15.65%) +/- 88.07 ( 0.00%) 87.26 ( 0.92%) 73.79 ( 16.22%) 109.62 ( -24.47%) 82.29 ( 6.56%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 50.22 33.76 30.65 24.14 128.45 Total Elapsed Time (seconds) 1113.73 1132.19 1029.45 759.49 1707.26 Similar test but the USB stick is using ext4 instead of vfat. 
As ext4 does not use writepage for migration, the large stalls due to compaction when THP is enabled are not observed. Still, isolating PageReclaim pages on their own list helped completion time largely by reducing the number of pages scanned by direct reclaim although time spend in congestion_wait could also be a factor. Again, Andrea's series had far higher success rates for THP allocation at the cost of elapsed time. I didn't look too closely but a quick look at the vmstat figures tells me kswapd reclaimed 8 times more pages than the patch series and direct reclaim reclaimed roughly three times as many pages. It follows that if memory is aggressively reclaimed, there will be more available for THP. writebackCPFilevfat 3.1.0-vanilla rc5-vanilla freemore-v6r1 isolate-v6r1 andrea-v2r1 System Time 1.76 ( 0.00%) 29.10 (-1555.52%) 46.01 (-2517.18%) 4.79 ( -172.35%) 54.89 (-3022.53%) +/- 0.14 ( 0.00%) 25.61 (-18185.17%) 2.15 (-1434.83%) 6.60 (-4610.03%) 9.75 (-6863.76%) User Time 0.05 ( 0.00%) 0.07 ( -45.83%) 0.05 ( -4.17%) 0.06 ( -29.17%) 0.06 ( -16.67%) +/- 0.02 ( 0.00%) 0.02 ( 20.11%) 0.02 ( -3.14%) 0.01 ( 31.58%) 0.01 ( 47.41%) Elapsed Time 22520.79 ( 0.00%) 1082.85 ( 95.19%) 73.30 ( 99.67%) 32.43 ( 99.86%) 291.84 ( 98.70%) +/- 7277.23 ( 0.00%) 706.29 ( 90.29%) 19.05 ( 99.74%) 17.05 ( 99.77%) 125.55 ( 98.27%) THP Active 83.80 ( 0.00%) 12.80 ( 15.27%) 15.60 ( 18.62%) 13.00 ( 15.51%) 0.80 ( 0.95%) +/- 66.81 ( 0.00%) 20.19 ( 30.22%) 5.92 ( 8.86%) 15.06 ( 22.54%) 1.17 ( 1.75%) Fault Alloc 171.00 ( 0.00%) 67.80 ( 39.65%) 97.40 ( 56.96%) 125.60 ( 73.45%) 133.00 ( 77.78%) +/- 82.91 ( 0.00%) 30.69 ( 37.02%) 53.91 ( 65.02%) 55.05 ( 66.40%) 21.19 ( 25.56%) Fault Fallback 832.00 ( 0.00%) 935.20 ( -12.40%) 906.00 ( -8.89%) 877.40 ( -5.46%) 870.20 ( -4.59%) +/- 82.91 ( 0.00%) 30.69 ( 62.98%) 54.01 ( 34.86%) 55.05 ( 33.60%) 20.91 ( 74.78%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 7229.81 928.42 704.52 80.68 1330.76 Total Elapsed Time (seconds) 112849.04 5618.69 571.11 360.54 1664.28 In this case, the test is reading/writing only from filesystems but as it's vfat, it's slow due to calling writepage during compaction. Little to observe really - the time to complete the test goes way down with the series applied and THP allocation success rates go up in comparison to 3.2-rc5. The success rates are lower than 3.1.0 but the elapsed time for that kernel is abysmal so it is not really a sensible comparison. As before, Andrea's series allocates more THPs at the cost of overall performance. 
writebackCPFileext4 3.1.0-vanilla rc5-vanilla freemore-v6r1 isolate-v6r1 andrea-v2r1 System Time 1.51 ( 0.00%) 1.77 ( -17.66%) 1.46 ( 2.92%) 1.15 ( 23.77%) 1.89 ( -25.63%) +/- 0.27 ( 0.00%) 0.67 ( -148.52%) 0.33 ( -22.76%) 0.30 ( -11.15%) 0.19 ( 30.16%) User Time 0.03 ( 0.00%) 0.04 ( -37.50%) 0.05 ( -62.50%) 0.07 ( -112.50%) 0.04 ( -18.75%) +/- 0.01 ( 0.00%) 0.02 ( -146.64%) 0.02 ( -97.91%) 0.02 ( -75.59%) 0.02 ( -63.30%) Elapsed Time 124.93 ( 0.00%) 114.49 ( 8.36%) 96.77 ( 22.55%) 27.48 ( 78.00%) 205.70 ( -64.65%) +/- 20.20 ( 0.00%) 74.39 ( -268.34%) 59.88 ( -196.48%) 7.72 ( 61.79%) 25.03 ( -23.95%) THP Active 161.80 ( 0.00%) 83.60 ( 51.67%) 141.20 ( 87.27%) 84.60 ( 52.29%) 82.60 ( 51.05%) +/- 71.95 ( 0.00%) 43.80 ( 60.88%) 26.91 ( 37.40%) 59.02 ( 82.03%) 52.13 ( 72.45%) Fault Alloc 471.40 ( 0.00%) 228.60 ( 48.49%) 282.20 ( 59.86%) 225.20 ( 47.77%) 388.40 ( 82.39%) +/- 88.07 ( 0.00%) 87.42 ( 99.26%) 73.79 ( 83.78%) 109.62 ( 124.47%) 82.62 ( 93.81%) Fault Fallback 531.60 ( 0.00%) 774.60 ( -45.71%) 720.80 ( -35.59%) 777.80 ( -46.31%) 614.80 ( -15.65%) +/- 88.07 ( 0.00%) 87.26 ( 0.92%) 73.79 ( 16.22%) 109.62 ( -24.47%) 82.29 ( 6.56%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 50.22 33.76 30.65 24.14 128.45 Total Elapsed Time (seconds) 1113.73 1132.19 1029.45 759.49 1707.26 Same type of story - elapsed times go down. In this case, allocation success rates are roughtly the same. As before, Andrea's has higher success rates but takes a lot longer. Overall the series does reduce latencies and while the tests are inherency racy as alloc competes with the cp processes, the variability was included. The THP allocation rates are not as high as they could be but that is because we would have to be more aggressive about reclaim and compaction impacting overall performance. This patch: Commit 39deaf85 ("mm: compaction: make isolate_lru_page() filter-aware") noted that compaction does not migrate dirty or writeback pages and that is was meaningless to pick the page and re-add it to the LRU list. What was missed during review is that asynchronous migration moves dirty pages if their ->migratepage callback is migrate_page() because these can be moved without blocking. This potentially impacted hugepage allocation success rates by a factor depending on how many dirty pages are in the system. This patch partially reverts 39deaf85 to allow migration to isolate dirty pages again. This increases how much compaction disrupts the LRU but that is addressed later in the series. Signed-off-by: Mel Gorman Reviewed-by: Andrea Arcangeli Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index e6670c34eb49..396ea2b47f7e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -349,9 +349,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; } - if (!cc->sync) - mode |= ISOLATE_CLEAN; - /* Try isolate the page */ if (__isolate_lru_page(page, mode, 0) != 0) continue; From b16d3d5a5219d01e9be5e586e5d50fbf1ca955ea Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:26 -0800 Subject: [PATCH 55/96] mm: compaction: use synchronous compaction for /proc/sys/vm/compact_memory When asynchronous compaction was introduced, the /proc/sys/vm/compact_memory handler should have been updated to always use synchronous compaction. 
This did not happen so this patch addresses it. The assumption is if a user writes to /proc/sys/vm/compact_memory, they are willing for that process to stall. Signed-off-by: Mel Gorman Reviewed-by: Andrea Arcangeli Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/compaction.c b/mm/compaction.c index 396ea2b47f7e..d31e64becb38 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -668,6 +668,7 @@ static int compact_node(int nid) .nr_freepages = 0, .nr_migratepages = 0, .order = -1, + .sync = true, }; zone = &pgdat->node_zones[zoneid]; From 5013473152d1ac9d44d787fb02edda845fdf2cb3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 12 Jan 2012 17:19:29 -0800 Subject: [PATCH 56/96] mm: vmscan: check if we isolated a compound page during lumpy scan Properly take into account if we isolated a compound page during the lumpy scan in reclaim and skip over the tail pages when encountered. This corrects the values given to the tracepoint for number of lumpy pages isolated and will avoid breaking the loop early if compound pages smaller than the requested allocation size are requested. [mgorman@suse.de: Updated changelog] Signed-off-by: Andrea Arcangeli Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: Rik van Riel Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index a85a261bf8f9..866ab27c52a5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1196,13 +1196,17 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, break; if (__isolate_lru_page(cursor_page, mode, file) == 0) { + unsigned int isolated_pages; + mem_cgroup_lru_del(cursor_page); list_move(&cursor_page->lru, dst); - nr_taken += hpage_nr_pages(cursor_page); - nr_lumpy_taken++; + isolated_pages = hpage_nr_pages(cursor_page); + nr_taken += isolated_pages; + nr_lumpy_taken += isolated_pages; if (PageDirty(cursor_page)) - nr_lumpy_dirty++; + nr_lumpy_dirty += isolated_pages; scan++; + pfn += isolated_pages - 1; } else { /* * Check if the page is freed already. From 7335084d446b83cbcb15da80497d03f0c1dc9e21 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:33 -0800 Subject: [PATCH 57/96] mm: vmscan: do not OOM if aborting reclaim to start compaction During direct reclaim it is possible that reclaim will be aborted so that compaction can be attempted to satisfy a high-order allocation. If this decision is made before any pages are reclaimed, it is possible that 0 is returned to the page allocator potentially triggering an OOM. This has not been observed but it is a possibility so this patch addresses it. 
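The danger of returning 0 sits on the caller side: when direct reclaim reports no progress, the allocator's slow path is what can fall through to the OOM killer. A condensed, approximate sketch of that logic (paraphrased from the v3.2-era __alloc_pages_slowpath(), with unrelated arguments and checks elided; not verbatim kernel code):

	page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist,
					    high_zoneidx, nodemask, alloc_flags,
					    preferred_zone, migratetype,
					    &did_some_progress);
	if (page)
		goto got_pg;

	if (!did_some_progress) {
		/*
		 * Reclaim made no progress: __GFP_FS callers that may retry
		 * can end up in __alloc_pages_may_oom() here, which is why
		 * aborting reclaim for compaction must still report progress.
		 */
		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
			page = __alloc_pages_may_oom(gfp_mask, order, zonelist,
						     high_zoneidx, nodemask,
						     preferred_zone, migratetype);
	}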
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 866ab27c52a5..cb68c53db4ec 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2263,6 +2263,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; unsigned long writeback_threshold; + bool should_abort_reclaim; get_mems_allowed(); delayacct_freepages_start(); @@ -2274,7 +2275,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(sc->target_mem_cgroup); - if (shrink_zones(priority, zonelist, sc)) + should_abort_reclaim = shrink_zones(priority, zonelist, sc); + if (should_abort_reclaim) break; /* @@ -2342,6 +2344,10 @@ out: if (oom_killer_disabled) return 0; + /* Aborting reclaim to try compaction? don't OOM, then */ + if (should_abort_reclaim) + return 1; + /* top priority shrink_zones still had more to do? don't OOM, then */ if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) return 1; From b969c4ab9f182a6e1b2a0848be349f99714947b0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:34 -0800 Subject: [PATCH 58/96] mm: compaction: determine if dirty pages can be migrated without blocking within ->migratepage Asynchronous compaction is used when allocating transparent hugepages to avoid blocking for long periods of time. Due to reports of stalling, there was a debate on disabling synchronous compaction but this severely impacted allocation success rates. Part of the reason was that many dirty pages are skipped in asynchronous compaction by the following check; if (PageDirty(page) && !sync && mapping->a_ops->migratepage != migrate_page) rc = -EBUSY; This skips over all mapping aops using buffer_migrate_page() even though it is possible to migrate some of these pages without blocking. This patch updates the ->migratepage callback with a "sync" parameter. It is the responsibility of the callback to fail gracefully if migration would block. 
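With the extra parameter, each ->migratepage implementation decides for itself whether it can make progress without blocking when sync is false. A hypothetical filesystem callback honouring that contract could look like the sketch below (foofs_migrate_page is invented for illustration; the real conversions are the buffer_migrate_page()/nfs/btrfs changes in this patch):

	/* Hypothetical example only -- not part of the patch. */
	static int foofs_migrate_page(struct address_space *mapping,
				      struct page *newpage, struct page *page,
				      bool sync)
	{
		/* Migrating a dirty page here would require writeout, which
		 * blocks, so an async (sync == false) caller must back off. */
		if (!sync && PageDirty(page))
			return -EBUSY;

		/* Otherwise fall back to the generic non-blocking move. */
		return migrate_page(mapping, newpage, page, sync);
	}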
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/disk-io.c | 4 +- fs/hugetlbfs/inode.c | 3 +- fs/nfs/internal.h | 2 +- fs/nfs/write.c | 4 +- include/linux/fs.h | 9 ++- include/linux/migrate.h | 2 +- mm/migrate.c | 129 ++++++++++++++++++++++++++++------------ 7 files changed, 106 insertions(+), 47 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f99a099a7747..1375494c8cb6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -872,7 +872,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, #ifdef CONFIG_MIGRATION static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, bool sync) { /* * we can't safely write a btree page from here, @@ -887,7 +887,7 @@ static int btree_migratepage(struct address_space *mapping, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, sync); } #endif diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e425ad9d0490..06fd4608a990 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -583,7 +583,8 @@ static int hugetlbfs_set_page_dirty(struct page *page) } static int hugetlbfs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + bool sync) { int rc; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5ee92538b063..114398a15830 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -332,7 +332,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, bool); #else #define nfs_migrate_page NULL #endif diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0c3885255f97..889e98bc5a21 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1688,7 +1688,7 @@ out_error: #ifdef CONFIG_MIGRATION int nfs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page) + struct page *page, bool sync) { /* * If PagePrivate is set, then the page is currently associated with @@ -1703,7 +1703,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, nfs_fscache_release_page(page, GFP_KERNEL); - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, sync); } #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index a7409bc157e0..b92b73d0b2b9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -609,9 +609,12 @@ struct address_space_operations { loff_t offset, unsigned long nr_segs); int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, unsigned long *); - /* migrate the contents of a page to the specified target */ + /* + * migrate the contents of a page to the specified target. If sync + * is false, it must not block. 
+ */ int (*migratepage) (struct address_space *, - struct page *, struct page *); + struct page *, struct page *, bool); int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); @@ -2537,7 +2540,7 @@ extern int generic_check_addressable(unsigned, u64); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, bool); #else #define buffer_migrate_page NULL #endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index e39aeecfe9a2..14e6d2a88475 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -11,7 +11,7 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, bool); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, bool sync); diff --git a/mm/migrate.c b/mm/migrate.c index fc391985899f..4e86f3bacb85 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -216,6 +216,55 @@ out: pte_unmap_unlock(ptep, ptl); } +#ifdef CONFIG_BLOCK +/* Returns true if all buffers are successfully locked */ +static bool buffer_migrate_lock_buffers(struct buffer_head *head, bool sync) +{ + struct buffer_head *bh = head; + + /* Simple case, sync compaction */ + if (sync) { + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return true; + } + + /* async case, we cannot block on lock_buffer so use trylock_buffer */ + do { + get_bh(bh); + if (!trylock_buffer(bh)) { + /* + * We failed to lock the buffer and cannot stall in + * async migration. Release the taken locks + */ + struct buffer_head *failed_bh = bh; + put_bh(failed_bh); + bh = head; + while (bh != failed_bh) { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + } + return false; + } + + bh = bh->b_this_page; + } while (bh != head); + return true; +} +#else +static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, + bool sync) +{ + return true; +} +#endif /* CONFIG_BLOCK */ + /* * Replace the page in the mapping. * @@ -225,7 +274,8 @@ out: * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. */ static int migrate_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + struct buffer_head *head, bool sync) { int expected_count; void **pslot; @@ -254,6 +304,19 @@ static int migrate_page_move_mapping(struct address_space *mapping, return -EAGAIN; } + /* + * In the async migration case of moving a page with buffers, lock the + * buffers using trylock before the mapping is moved. If the mapping + * was moved, we later failed to lock the buffers and could not move + * the mapping back due to an elevated page count, we would have to + * block waiting on other references to be dropped. + */ + if (!sync && head && !buffer_migrate_lock_buffers(head, sync)) { + page_unfreeze_refs(page, expected_count); + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + /* * Now we know that no one else is looking at the page. */ @@ -409,13 +472,13 @@ EXPORT_SYMBOL(fail_migrate_page); * Pages are locked upon entry and exit. 
*/ int migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, bool sync) { int rc; BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page, NULL, sync); if (rc) return rc; @@ -432,28 +495,28 @@ EXPORT_SYMBOL(migrate_page); * exist. */ int buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, bool sync) { struct buffer_head *bh, *head; int rc; if (!page_has_buffers(page)) - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, sync); head = page_buffers(page); - rc = migrate_page_move_mapping(mapping, newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page, head, sync); if (rc) return rc; - bh = head; - do { - get_bh(bh); - lock_buffer(bh); - bh = bh->b_this_page; - - } while (bh != head); + /* + * In the async case, migrate_page_move_mapping locked the buffers + * with an IRQ-safe spinlock held. In the sync case, the buffers + * need to be locked now + */ + if (sync) + BUG_ON(!buffer_migrate_lock_buffers(head, sync)); ClearPagePrivate(page); set_page_private(newpage, page_private(page)); @@ -530,10 +593,13 @@ static int writeout(struct address_space *mapping, struct page *page) * Default handling if a filesystem does not provide a migration function. */ static int fallback_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, bool sync) { - if (PageDirty(page)) + if (PageDirty(page)) { + if (!sync) + return -EBUSY; return writeout(mapping, page); + } /* * Buffers may be managed in a filesystem specific way. @@ -543,7 +609,7 @@ static int fallback_migrate_page(struct address_space *mapping, !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, sync); } /* @@ -579,29 +645,18 @@ static int move_to_new_page(struct page *newpage, struct page *page, mapping = page_mapping(page); if (!mapping) - rc = migrate_page(mapping, newpage, page); - else { + rc = migrate_page(mapping, newpage, page, sync); + else if (mapping->a_ops->migratepage) /* - * Do not writeback pages if !sync and migratepage is - * not pointing to migrate_page() which is nonblocking - * (swapcache/tmpfs uses migratepage = migrate_page). + * Most pages have a mapping and most filesystems provide a + * migratepage callback. Anonymous pages are part of swap + * space which also has its own migratepage callback. This + * is the most common path for page migration. */ - if (PageDirty(page) && !sync && - mapping->a_ops->migratepage != migrate_page) - rc = -EBUSY; - else if (mapping->a_ops->migratepage) - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. 
- */ - rc = mapping->a_ops->migratepage(mapping, - newpage, page); - else - rc = fallback_migrate_page(mapping, newpage, page); - } + rc = mapping->a_ops->migratepage(mapping, + newpage, page, sync); + else + rc = fallback_migrate_page(mapping, newpage, page, sync); if (rc) { newpage->mapping = NULL; From c82449352854ff09e43062246af86bdeb628f0c3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:38 -0800 Subject: [PATCH 59/96] mm: compaction: make isolate_lru_page() filter-aware again Commit 39deaf85 ("mm: compaction: make isolate_lru_page() filter-aware") noted that compaction does not migrate dirty or writeback pages and that it was meaningless to pick the page and re-add it to the LRU list. This had to be partially reverted because some dirty pages can be migrated by compaction without blocking. This patch updates "mm: compaction: make isolate_lru_page" by skipping over pages that migration has no possibility of migrating, to minimise LRU disruption. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Reviewed-by: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ mm/compaction.c | 3 +++ mm/vmscan.c | 35 +++++++++++++++++++++++++++++++++-- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 42e544cd4c9f..2038b90ca6e3 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -177,6 +177,8 @@ struct lruvec { #define ISOLATE_CLEAN ((__force isolate_mode_t)0x4) /* Isolate unmapped file */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x8) +/* Isolate for asynchronous migration */ +#define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x10) /* LRU Isolation modes. */ typedef unsigned __bitwise__ isolate_mode_t; diff --git a/mm/compaction.c b/mm/compaction.c index d31e64becb38..fb291585e1bf 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -349,6 +349,9 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; } + if (!cc->sync) + mode |= ISOLATE_ASYNC_MIGRATE; + /* Try isolate the page */ if (__isolate_lru_page(page, mode, 0) != 0) continue; diff --git a/mm/vmscan.c b/mm/vmscan.c index cb68c53db4ec..efbcab1c8f54 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1075,8 +1075,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) ret = -EBUSY; - if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) - return ret; + /* + * To minimise LRU disruption, the caller can indicate that it only + * wants to isolate pages it will be able to operate on without + * blocking - clean pages for the most part. + * + * ISOLATE_CLEAN means that only clean pages should be isolated.
This + * is used by reclaim when it is cannot write to backing storage + * + * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages + * that it is possible to migrate without blocking + */ + if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { + /* All the caller can do on PageWriteback is block */ + if (PageWriteback(page)) + return ret; + + if (PageDirty(page)) { + struct address_space *mapping; + + /* ISOLATE_CLEAN means only clean pages */ + if (mode & ISOLATE_CLEAN) + return ret; + + /* + * Only pages without mappings or that have a + * ->migratepage callback are possible to migrate + * without blocking + */ + mapping = page_mapping(page); + if (mapping && !mapping->a_ops->migratepage) + return ret; + } + } if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) return ret; From 66199712e9eef5aede09dbcd9dfff87798a66917 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:41 -0800 Subject: [PATCH 60/96] mm: page allocator: do not call direct reclaim for THP allocations while compaction is deferred If compaction is deferred, direct reclaim is used to try to free enough pages for the allocation to succeed. For small high-orders, this has a reasonable chance of success. However, if the caller has specified __GFP_NO_KSWAPD to limit the disruption to the system, it makes more sense to fail the allocation rather than stall the caller in direct reclaim. This patch skips direct reclaim if compaction is deferred and the caller specifies __GFP_NO_KSWAPD. Async compaction only considers a subset of pages so it is possible for compaction to be deferred prematurely and not enter direct reclaim even in cases where it should. To compensate for this, this patch also defers compaction only if sync compaction failed. Signed-off-by: Mel Gorman Acked-by: Minchan Kim Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9404b38bcdc5..cb5723c491f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1981,14 +1981,20 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress, - bool sync_migration) + int migratetype, bool sync_migration, + bool *deferred_compaction, + unsigned long *did_some_progress) { struct page *page; - if (!order || compaction_deferred(preferred_zone)) + if (!order) return NULL; + if (compaction_deferred(preferred_zone)) { + *deferred_compaction = true; + return NULL; + } + current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration); @@ -2016,7 +2022,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * but not enough to satisfy watermarks. */ count_vm_event(COMPACTFAIL); - defer_compaction(preferred_zone); + + /* + * As async compaction considers a subset of pageblocks, only + * defer if the failure was a sync compaction failure. 
+ */ + if (sync_migration) + defer_compaction(preferred_zone); cond_resched(); } @@ -2028,8 +2040,9 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress, - bool sync_migration) + int migratetype, bool sync_migration, + bool *deferred_compaction, + unsigned long *did_some_progress) { return NULL; } @@ -2179,6 +2192,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed = 0; unsigned long did_some_progress; bool sync_migration = false; + bool deferred_compaction = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2259,12 +2273,22 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress, - sync_migration); + migratetype, sync_migration, + &deferred_compaction, + &did_some_progress); if (page) goto got_pg; sync_migration = true; + /* + * If compaction is deferred for high-order allocations, it is because + * sync compaction recently failed. In this is the case and the caller + * has requested the system not be heavily disrupted, fail the + * allocation now instead of entering direct reclaim + */ + if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) + goto nopage; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -2328,8 +2352,9 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress, - sync_migration); + migratetype, sync_migration, + &deferred_compaction, + &did_some_progress); if (page) goto got_pg; } From a6bc32b899223a877f595ef9ddc1e89ead5072b8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:43 -0800 Subject: [PATCH 61/96] mm: compaction: introduce sync-light migration for use by compaction This patch adds a lightweight sync migrate operation MIGRATE_SYNC_LIGHT mode that avoids writing back pages to backing storage. Async compaction maps to MIGRATE_ASYNC while sync compaction maps to MIGRATE_SYNC_LIGHT. For other migrate_pages users such as memory hotplug, MIGRATE_SYNC is used. This avoids sync compaction stalling for an excessive length of time, particularly when copying files to a USB stick where there might be a large number of dirty pages backed by a filesystem that does not support ->writepages. 
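[editor's sketch, not part of the patch: a rough picture of how the three modes end up being chosen by the callers converted below; all identifiers are taken from the hunks that follow, only the layout of the calls is condensed here]

	enum migrate_mode {
		MIGRATE_ASYNC,		/* never block */
		MIGRATE_SYNC_LIGHT,	/* may block, but not on ->writepage */
		MIGRATE_SYNC,		/* full synchronous migration */
	};

	/* compaction: at most light sync */
	err = migrate_pages(&cc->migratepages, compaction_alloc,
			(unsigned long)cc, false,
			cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);

	/* memory hotplug (and soft offline): full sync */
	ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
			true, MIGRATE_SYNC);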
[aarcange@redhat.com: This patch is heavily based on Andrea's work] [akpm@linux-foundation.org: fix fs/nfs/write.c build] [akpm@linux-foundation.org: fix fs/btrfs/disk-io.c build] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/disk-io.c | 5 +-- fs/hugetlbfs/inode.c | 2 +- fs/nfs/internal.h | 2 +- fs/nfs/write.c | 4 +-- include/linux/fs.h | 6 ++-- include/linux/migrate.h | 23 +++++++++--- mm/compaction.c | 2 +- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 78 +++++++++++++++++++++++------------------ 11 files changed, 76 insertions(+), 52 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1375494c8cb6..d8525662ca7a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -872,7 +872,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, #ifdef CONFIG_MIGRATION static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, bool sync) + struct page *newpage, struct page *page, + enum migrate_mode mode) { /* * we can't safely write a btree page from here, @@ -887,7 +888,7 @@ static int btree_migratepage(struct address_space *mapping, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page, sync); + return migrate_page(mapping, newpage, page, mode); } #endif diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 06fd4608a990..1e85a7ac0217 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -584,7 +584,7 @@ static int hugetlbfs_set_page_dirty(struct page *page) static int hugetlbfs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, - bool sync) + enum migrate_mode mode) { int rc; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 114398a15830..8102db9b926c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -332,7 +332,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, - struct page *, struct page *, bool); + struct page *, struct page *, enum migrate_mode); #else #define nfs_migrate_page NULL #endif diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 889e98bc5a21..834f0fe96f89 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1688,7 +1688,7 @@ out_error: #ifdef CONFIG_MIGRATION int nfs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, bool sync) + struct page *page, enum migrate_mode mode) { /* * If PagePrivate is set, then the page is currently associated with @@ -1703,7 +1703,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, nfs_fscache_release_page(page, GFP_KERNEL); - return migrate_page(mapping, newpage, page, sync); + return migrate_page(mapping, newpage, page, mode); } #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index b92b73d0b2b9..e694bd4434a4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -525,6 +525,7 @@ enum positive_aop_returns { struct page; struct address_space; struct writeback_control; +enum migrate_mode; struct iov_iter { const struct iovec *iov; @@ -614,7 +615,7 @@ struct address_space_operations { * is false, it must not block. 
*/ int (*migratepage) (struct address_space *, - struct page *, struct page *, bool); + struct page *, struct page *, enum migrate_mode); int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); @@ -2540,7 +2541,8 @@ extern int generic_check_addressable(unsigned, u64); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, - struct page *, struct page *, bool); + struct page *, struct page *, + enum migrate_mode); #else #define buffer_migrate_page NULL #endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 14e6d2a88475..eaf867412f7a 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -6,18 +6,31 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); +/* + * MIGRATE_ASYNC means never block + * MIGRATE_SYNC_LIGHT in the current implementation means to allow blocking + * on most operations but not ->writepage as the potential stall time + * is too significant + * MIGRATE_SYNC will block when migrating pages + */ +enum migrate_mode { + MIGRATE_ASYNC, + MIGRATE_SYNC_LIGHT, + MIGRATE_SYNC, +}; + #ifdef CONFIG_MIGRATION #define PAGE_MIGRATION 1 extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, - struct page *, struct page *, bool); + struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync); + enum migrate_mode mode); extern int migrate_huge_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync); + enum migrate_mode mode); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -36,10 +49,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync) { return -ENOSYS; } + enum migrate_mode mode) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync) { return -ENOSYS; } + enum migrate_mode mode) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } diff --git a/mm/compaction.c b/mm/compaction.c index fb291585e1bf..71a58f67f481 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -557,7 +557,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, (unsigned long)cc, false, - cc->sync); + cc->sync ? 
MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 06d3479513aa..56080ea36140 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1557,7 +1557,7 @@ int soft_offline_page(struct page *page, int flags) page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, - 0, true); + 0, MIGRATE_SYNC); if (ret) { putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2168489c0bc9..6629fafd6ce4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -809,7 +809,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } /* this function returns # of failed pages */ ret = migrate_pages(&source, hotremove_migrate_alloc, 0, - true, true); + true, MIGRATE_SYNC); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e3d58f088466..06b145fb64ab 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -942,7 +942,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, - false, true); + false, MIGRATE_SYNC); if (err) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 4e86f3bacb85..9871a56d82c3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -218,12 +218,13 @@ out: #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ -static bool buffer_migrate_lock_buffers(struct buffer_head *head, bool sync) +static bool buffer_migrate_lock_buffers(struct buffer_head *head, + enum migrate_mode mode) { struct buffer_head *bh = head; /* Simple case, sync compaction */ - if (sync) { + if (mode != MIGRATE_ASYNC) { do { get_bh(bh); lock_buffer(bh); @@ -259,7 +260,7 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head, bool sync) } #else static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, - bool sync) + enum migrate_mode mode) { return true; } @@ -275,7 +276,7 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, */ static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, - struct buffer_head *head, bool sync) + struct buffer_head *head, enum migrate_mode mode) { int expected_count; void **pslot; @@ -311,7 +312,8 @@ static int migrate_page_move_mapping(struct address_space *mapping, * the mapping back due to an elevated page count, we would have to * block waiting on other references to be dropped. */ - if (!sync && head && !buffer_migrate_lock_buffers(head, sync)) { + if (mode == MIGRATE_ASYNC && head && + !buffer_migrate_lock_buffers(head, mode)) { page_unfreeze_refs(page, expected_count); spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; @@ -472,13 +474,14 @@ EXPORT_SYMBOL(fail_migrate_page); * Pages are locked upon entry and exit. */ int migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, bool sync) + struct page *newpage, struct page *page, + enum migrate_mode mode) { int rc; BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page, NULL, sync); + rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); if (rc) return rc; @@ -495,17 +498,17 @@ EXPORT_SYMBOL(migrate_page); * exist. 
*/ int buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, bool sync) + struct page *newpage, struct page *page, enum migrate_mode mode) { struct buffer_head *bh, *head; int rc; if (!page_has_buffers(page)) - return migrate_page(mapping, newpage, page, sync); + return migrate_page(mapping, newpage, page, mode); head = page_buffers(page); - rc = migrate_page_move_mapping(mapping, newpage, page, head, sync); + rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); if (rc) return rc; @@ -515,8 +518,8 @@ int buffer_migrate_page(struct address_space *mapping, * with an IRQ-safe spinlock held. In the sync case, the buffers * need to be locked now */ - if (sync) - BUG_ON(!buffer_migrate_lock_buffers(head, sync)); + if (mode != MIGRATE_ASYNC) + BUG_ON(!buffer_migrate_lock_buffers(head, mode)); ClearPagePrivate(page); set_page_private(newpage, page_private(page)); @@ -593,10 +596,11 @@ static int writeout(struct address_space *mapping, struct page *page) * Default handling if a filesystem does not provide a migration function. */ static int fallback_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, bool sync) + struct page *newpage, struct page *page, enum migrate_mode mode) { if (PageDirty(page)) { - if (!sync) + /* Only writeback pages in full synchronous migration */ + if (mode != MIGRATE_SYNC) return -EBUSY; return writeout(mapping, page); } @@ -609,7 +613,7 @@ static int fallback_migrate_page(struct address_space *mapping, !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page, sync); + return migrate_page(mapping, newpage, page, mode); } /* @@ -624,7 +628,7 @@ static int fallback_migrate_page(struct address_space *mapping, * == 0 - success */ static int move_to_new_page(struct page *newpage, struct page *page, - int remap_swapcache, bool sync) + int remap_swapcache, enum migrate_mode mode) { struct address_space *mapping; int rc; @@ -645,7 +649,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, mapping = page_mapping(page); if (!mapping) - rc = migrate_page(mapping, newpage, page, sync); + rc = migrate_page(mapping, newpage, page, mode); else if (mapping->a_ops->migratepage) /* * Most pages have a mapping and most filesystems provide a @@ -654,9 +658,9 @@ static int move_to_new_page(struct page *newpage, struct page *page, * is the most common path for page migration. 
*/ rc = mapping->a_ops->migratepage(mapping, - newpage, page, sync); + newpage, page, mode); else - rc = fallback_migrate_page(mapping, newpage, page, sync); + rc = fallback_migrate_page(mapping, newpage, page, mode); if (rc) { newpage->mapping = NULL; @@ -671,7 +675,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, } static int __unmap_and_move(struct page *page, struct page *newpage, - int force, bool offlining, bool sync) + int force, bool offlining, enum migrate_mode mode) { int rc = -EAGAIN; int remap_swapcache = 1; @@ -680,7 +684,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, struct anon_vma *anon_vma = NULL; if (!trylock_page(page)) { - if (!force || !sync) + if (!force || mode == MIGRATE_ASYNC) goto out; /* @@ -726,10 +730,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (PageWriteback(page)) { /* - * For !sync, there is no point retrying as the retry loop - * is expected to be too short for PageWriteback to be cleared + * Only in the case of a full syncronous migration is it + * necessary to wait for PageWriteback. In the async case, + * the retry loop is too short and in the sync-light case, + * the overhead of stalling is too much */ - if (!sync) { + if (mode != MIGRATE_SYNC) { rc = -EBUSY; goto uncharge; } @@ -800,7 +806,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, skip_unmap: if (!page_mapped(page)) - rc = move_to_new_page(newpage, page, remap_swapcache, sync); + rc = move_to_new_page(newpage, page, remap_swapcache, mode); if (rc && remap_swapcache) remove_migration_ptes(page, page); @@ -823,7 +829,8 @@ out: * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, bool offlining, bool sync) + struct page *page, int force, bool offlining, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -843,7 +850,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (unlikely(split_huge_page(page))) goto out; - rc = __unmap_and_move(page, newpage, force, offlining, sync); + rc = __unmap_and_move(page, newpage, force, offlining, mode); out: if (rc != -EAGAIN) { /* @@ -891,7 +898,8 @@ out: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, bool offlining, bool sync) + int force, bool offlining, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -904,7 +912,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc = -EAGAIN; if (!trylock_page(hpage)) { - if (!force || !sync) + if (!force || mode != MIGRATE_SYNC) goto out; lock_page(hpage); } @@ -915,7 +923,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); if (!page_mapped(hpage)) - rc = move_to_new_page(new_hpage, hpage, 1, sync); + rc = move_to_new_page(new_hpage, hpage, 1, mode); if (rc) remove_migration_ptes(hpage, hpage); @@ -958,7 +966,7 @@ out: */ int migrate_pages(struct list_head *from, new_page_t get_new_page, unsigned long private, bool offlining, - bool sync) + enum migrate_mode mode) { int retry = 1; int nr_failed = 0; @@ -979,7 +987,7 @@ int migrate_pages(struct list_head *from, rc = unmap_and_move(get_new_page, private, page, pass > 2, offlining, - sync); + mode); switch(rc) { case -ENOMEM: @@ -1009,7 +1017,7 @@ out: int migrate_huge_pages(struct list_head *from, new_page_t get_new_page, unsigned long private, bool 
offlining, - bool sync) + enum migrate_mode mode) { int retry = 1; int nr_failed = 0; @@ -1026,7 +1034,7 @@ int migrate_huge_pages(struct list_head *from, rc = unmap_and_move_huge_page(get_new_page, private, page, pass > 2, offlining, - sync); + mode); switch(rc) { case -ENOMEM: @@ -1155,7 +1163,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0, true); + (unsigned long)pm, 0, MIGRATE_SYNC); if (err) putback_lru_pages(&pagelist); } From fe4b1b244bdb96136855f2c694071cb09d140766 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:45 -0800 Subject: [PATCH 62/96] mm: vmscan: when reclaiming for compaction, ensure there are sufficient free pages available In commit e0887c19 ("vmscan: limit direct reclaim for higher order allocations"), Rik noted that reclaim was too aggressive when THP was enabled. In his initial patch he used the number of free pages to decide if reclaim should abort for compaction. My feedback was that reclaim and compaction should be using the same logic when deciding if reclaim should be aborted. Unfortunately, this had the effect of reducing THP success rates when the workload included something like streaming reads that continually allocated pages. The window during which compaction could run and return a THP was too small. This patch combines Rik's two patches together. compaction_suitable() is still used to decide if reclaim should be aborted to allow compaction is used. However, it will also ensure that there is a reasonable buffer of free pages available. This improves upon the THP allocation success rates but bounds the number of pages that are freed for compaction. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index efbcab1c8f54..3b8ede882396 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2163,6 +2163,42 @@ static void shrink_zone(int priority, struct zone *zone, } while (memcg); } +/* Returns true if compaction should go ahead for a high-order request */ +static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +{ + unsigned long balance_gap, watermark; + bool watermark_ok; + + /* Do not consider compaction for orders reclaim is meant to satisfy */ + if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) + return false; + + /* + * Compaction takes time to run and there are potentially other + * callers using the pages just freed. 
Continue reclaiming until + * there is a buffer of free pages available to give compaction + * a reasonable chance of completing and allocating the page + */ + balance_gap = min(low_wmark_pages(zone), + (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + KSWAPD_ZONE_BALANCE_GAP_RATIO); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + + /* + * If compaction is deferred, reclaim up to a point where + * compaction will have a chance of success when re-enabled + */ + if (compaction_deferred(zone)) + return watermark_ok; + + /* If compaction is not ready to start, keep reclaiming */ + if (!compaction_suitable(zone, sc->order)) + return false; + + return watermark_ok; +} + /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation @@ -2180,8 +2216,8 @@ static void shrink_zone(int priority, struct zone *zone, * scan then give up on it. * * This function returns true if a zone is being reclaimed for a costly - * high-order allocation and compaction is either ready to begin or deferred. - * This indicates to the caller that it should retry the allocation or fail. + * high-order allocation and compaction is ready to begin. This indicates to + * the caller that it should retry the allocation or fail. */ static bool shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) @@ -2215,9 +2251,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, * noticable problem, like transparent huge page * allocations. */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER && - (compaction_suitable(zone, sc->order) || - compaction_deferred(zone))) { + if (compaction_ready(zone, sc)) { should_abort_reclaim = true; continue; } From 0cee34fd72c582b4f8ad8ce00645b75fb4168199 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:49 -0800 Subject: [PATCH 63/96] mm: vmscan: check if reclaim should really abort even if compaction_ready() is true for one zone If compaction can proceed for a given zone, shrink_zones() does not reclaim any more pages from it. After commit [e0c2327: vmscan: abort reclaim/compaction if compaction can proceed], do_try_to_free_pages() tries to finish as soon as possible once one zone can compact. This was intended to prevent slabs being shrunk unnecessarily but there are side-effects. One is that a small zone that is ready for compaction will abort reclaim even if the chances of successfully allocating a THP from that zone is small. It also means that reclaim can return too early even though sc->nr_to_reclaim pages were not reclaimed. This partially reverts the commit until it is proven that slabs are really being shrunk unnecessarily but preserves the check to return 1 to avoid OOM if reclaim was aborted prematurely. 
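[editor's sketch, not part of the patch: the resulting control flow, condensed from the hunks below -- a compaction-ready zone is now only noted by shrink_zones(), and reclaim carries on instead of breaking out of the priority loop]

	/* in shrink_zones(): remember, but keep reclaiming the remaining zones */
	if (compaction_ready(zone, sc)) {
		aborted_reclaim = true;
		continue;
	}

	/* in do_try_to_free_pages(), after the priority loop */
	if (aborted_reclaim)
		return 1;	/* aborted to try compaction: do not trigger the OOM killer */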
[aarcange@redhat.com: This patch replaces a revert from Andrea] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3b8ede882396..25f90383b391 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2217,7 +2217,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * * This function returns true if a zone is being reclaimed for a costly * high-order allocation and compaction is ready to begin. This indicates to - * the caller that it should retry the allocation or fail. + * the caller that it should consider retrying the allocation instead of + * further reclaim. */ static bool shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) @@ -2226,7 +2227,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; - bool should_abort_reclaim = false; + bool aborted_reclaim = false; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -2252,7 +2253,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, * allocations. */ if (compaction_ready(zone, sc)) { - should_abort_reclaim = true; + aborted_reclaim = true; continue; } } @@ -2274,7 +2275,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, shrink_zone(priority, zone, sc); } - return should_abort_reclaim; + return aborted_reclaim; } static bool zone_reclaimable(struct zone *zone) @@ -2328,7 +2329,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; unsigned long writeback_threshold; - bool should_abort_reclaim; + bool aborted_reclaim; get_mems_allowed(); delayacct_freepages_start(); @@ -2340,9 +2341,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(sc->target_mem_cgroup); - should_abort_reclaim = shrink_zones(priority, zonelist, sc); - if (should_abort_reclaim) - break; + aborted_reclaim = shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from @@ -2409,8 +2408,8 @@ out: if (oom_killer_disabled) return 0; - /* Aborting reclaim to try compaction? don't OOM, then */ - if (should_abort_reclaim) + /* Aborted reclaim to try compaction? don't OOM, then */ + if (aborted_reclaim) return 1; /* top priority shrink_zones still had more to do? don't OOM, then */ From 12d27107867fc7216e8faaff0b894b0f162dcf75 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 12 Jan 2012 17:19:52 -0800 Subject: [PATCH 64/96] memcg: fix split_huge_page_refcounts() This patch started off as a cleanup: __split_huge_page_refcounts() has to cope with two scenarios, when the hugepage being split is already on LRU, and when it is not; but why does it have to split that accounting across three different sites? Consolidate it in lru_add_page_tail(), handling evictable and unevictable alike, and use standard add_page_to_lru_list() when accounting is needed (when the head is not yet on LRU). 
But a recent regression in -next, I guess the removal of PageCgroupAcctLRU test from mem_cgroup_split_huge_fixup(), makes this now a necessary fix: under load, the MEM_CGROUP_ZSTAT count was wrapping to a huge number, messing up reclaim calculations and causing a freeze at rmdir of cgroup. Add a VM_BUG_ON to mem_cgroup_lru_del_list() when we're about to wrap that count - this has not been the only such incident. Document that lru_add_page_tail() is for Transparent HugePages by #ifdef around it. Signed-off-by: Hugh Dickins Cc: Daisuke Nishimura Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 10 ---------- mm/memcontrol.c | 12 ++---------- mm/swap.c | 29 +++++++++++++++++++---------- 3 files changed, 21 insertions(+), 30 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 76cc3f7dd4f0..b3ffc21ce801 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1229,7 +1229,6 @@ static void __split_huge_page_refcount(struct page *page) { int i; struct zone *zone = page_zone(page); - int zonestat; int tail_count = 0; /* prevent PageLRU to go away from under us, and freeze lru stats */ @@ -1317,15 +1316,6 @@ static void __split_huge_page_refcount(struct page *page) __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); - /* - * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, - * so adjust those appropriately if this page is on the LRU. - */ - if (PageLRU(page)) { - zonestat = NR_LRU_BASE + page_lru(page); - __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); - } - ClearPageCompound(page); compound_unlock(page); spin_unlock_irq(&zone->lru_lock); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 609c49f492e6..9f2f64697409 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1071,6 +1071,7 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) VM_BUG_ON(!memcg); mz = page_cgroup_zoneinfo(memcg, page); /* huge page split is done under lru_lock. so, we have no races. */ + VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); } @@ -2465,9 +2466,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, void mem_cgroup_split_huge_fixup(struct page *head) { struct page_cgroup *head_pc = lookup_page_cgroup(head); - struct mem_cgroup_per_zone *mz; struct page_cgroup *pc; - enum lru_list lru; int i; if (mem_cgroup_disabled()) @@ -2478,15 +2477,8 @@ void mem_cgroup_split_huge_fixup(struct page *head) smp_wmb();/* see __commit_charge() */ pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; } - /* - * Tail pages will be added to LRU. - * We hold lru_lock,then,reduce counter directly. 
- */ - lru = page_lru(head); - mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); - MEM_CGROUP_ZSTAT(mz, lru) -= HPAGE_PMD_NR - 1; } -#endif +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /** * mem_cgroup_move_account - move account of the page diff --git a/mm/swap.c b/mm/swap.c index ddccf8e0b4ae..db6defaf2e55 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -650,6 +650,7 @@ void __pagevec_release(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_release); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* used by __split_huge_page_refcount() */ void lru_add_page_tail(struct zone* zone, struct page *page, struct page *page_tail) @@ -666,8 +667,6 @@ void lru_add_page_tail(struct zone* zone, SetPageLRU(page_tail); if (page_evictable(page_tail, NULL)) { - struct lruvec *lruvec; - if (PageActive(page)) { SetPageActive(page_tail); active = 1; @@ -677,18 +676,28 @@ void lru_add_page_tail(struct zone* zone, lru = LRU_INACTIVE_ANON; } update_page_reclaim_stat(zone, page_tail, file, active); - lruvec = mem_cgroup_lru_add_list(zone, page_tail, lru); - if (likely(PageLRU(page))) - list_add(&page_tail->lru, page->lru.prev); - else - list_add(&page_tail->lru, lruvec->lists[lru].prev); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, - hpage_nr_pages(page_tail)); } else { SetPageUnevictable(page_tail); - add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); + lru = LRU_UNEVICTABLE; + } + + if (likely(PageLRU(page))) + list_add_tail(&page_tail->lru, &page->lru); + else { + struct list_head *list_head; + /* + * Head page has not yet been counted, as an hpage, + * so we must account for each subpage individually. + * + * Use the standard add function to put page_tail on the list, + * but then correct its position so they all end up in order. + */ + add_page_to_lru_list(zone, page_tail, lru); + list_head = page_tail->lru.prev; + list_move_tail(&page_tail->lru, list_head); } } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static void ____pagevec_lru_add_fn(struct page *page, void *arg) { From 90b3feaec8ffb167abd8903bf111605c2f035aa8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 12 Jan 2012 17:19:54 -0800 Subject: [PATCH 65/96] memcg: fix mem_cgroup_print_bad_page If DEBUG_VM, mem_cgroup_print_bad_page() is called whenever bad_page() shows a "Bad page state" message, removes page from circulation, adds a taint and continues. This is at a very low level, often when a spinlock is held (sometimes when page table lock is held, for example). We want to recover from this badness, not make it worse: we must not kmalloc memory here, we must not do a cgroup path lookup via dubious pointers. No doubt that code was useful to debug a particular case at one time, and may be again, but take it out of the mainline kernel. 
Signed-off-by: Hugh Dickins Cc: Daisuke Nishimura Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9f2f64697409..602207be9853 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3364,23 +3364,8 @@ void mem_cgroup_print_bad_page(struct page *page) pc = lookup_page_cgroup_used(page); if (pc) { - int ret = -1; - char *path; - - printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", + printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", pc, pc->flags, pc->mem_cgroup); - - path = kmalloc(PATH_MAX, GFP_KERNEL); - if (path) { - rcu_read_lock(); - ret = cgroup_path(pc->mem_cgroup->css.cgroup, - path, PATH_MAX); - rcu_read_unlock(); - } - - printk(KERN_CONT "(%s)\n", - (ret < 0) ? "cannot get the path" : path); - kfree(path); } } #endif From 2bcf887963812c075f80a14e1fad8ec7e1c67acf Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 12 Jan 2012 17:19:56 -0800 Subject: [PATCH 66/96] mm: take pagevecs off reclaim stack Replace pagevecs in putback_lru_pages() and move_active_pages_to_lru() by lists of pages_to_free: then apply Konstantin Khlebnikov's free_hot_cold_page_list() to them instead of pagevec_release(). Which simplifies the flow (no need to drop and retake lock whenever pagevec fills up) and reduces stale addresses in stack backtraces (which often showed through the pagevecs); but more importantly, removes another 120 bytes from the deepest stacks in page reclaim. Although I've not recently seen an actual stack overflow here with a vanilla kernel, move_active_pages_to_lru() has often featured in deep backtraces. However, free_hot_cold_page_list() does not handle compound pages (nor need it: a Transparent HugePage would have been split by the time it reaches the call in shrink_page_list()), but it is possible for putback_lru_pages() or move_active_pages_to_lru() to be left holding the last reference on a THP, so must exclude the unlikely compound case before putting on pages_to_free. Remove pagevec_strip(), its work now done in move_active_pages_to_lru(). The pagevec in scan_mapping_unevictable_pages() remains in mm/vmscan.c, but that is never on the reclaim path, and cannot be replaced by a list. 
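[editor's sketch, not part of the patch: the pattern both putback_lru_pages() and move_active_pages_to_lru() switch to, lifted from the hunks below; compound pages go through their destructor because free_hot_cold_page_list() does not handle them]

	if (put_page_testzero(page)) {
		__ClearPageLRU(page);
		__ClearPageActive(page);
		del_page_from_lru_list(zone, page, lru);

		if (unlikely(PageCompound(page))) {
			spin_unlock_irq(&zone->lru_lock);
			(*get_compound_page_dtor(page))(page);
			spin_lock_irq(&zone->lru_lock);
		} else
			list_add(&page->lru, &pages_to_free);
	}
	/* ... once the list has been walked and the lru_lock dropped ... */
	free_hot_cold_page_list(&pages_to_free, 1);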
Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Reviewed-by: Konstantin Khlebnikov Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 2 -- mm/swap.c | 19 -------------- mm/vmscan.c | 58 ++++++++++++++++++++++++++++------------- 3 files changed, 40 insertions(+), 39 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index ed17024d2ebe..9def9121f8a2 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -22,7 +22,6 @@ struct pagevec { void __pagevec_release(struct pagevec *pvec); void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); -void pagevec_strip(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, @@ -59,7 +58,6 @@ static inline unsigned pagevec_add(struct pagevec *pvec, struct page *page) return pagevec_space(pvec); } - static inline void pagevec_release(struct pagevec *pvec) { if (pagevec_count(pvec)) diff --git a/mm/swap.c b/mm/swap.c index db6defaf2e55..79c22a649a3e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -23,7 +23,6 @@ #include #include #include -#include /* for try_to_release_page() */ #include #include #include @@ -730,24 +729,6 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) EXPORT_SYMBOL(____pagevec_lru_add); -/* - * Try to drop buffers from the pages in a pagevec - */ -void pagevec_strip(struct pagevec *pvec) -{ - int i; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - - if (page_has_private(page) && trylock_page(page)) { - if (page_has_private(page)) - try_to_release_page(page, 0); - unlock_page(page); - } - } -} - /** * pagevec_lookup - gang pagecache lookup * @pvec: Where the resulting pages are placed diff --git a/mm/vmscan.c b/mm/vmscan.c index 25f90383b391..7724fb8e7498 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1398,12 +1398,10 @@ putback_lru_pages(struct mem_cgroup_zone *mz, struct scan_control *sc, struct list_head *page_list) { struct page *page; - struct pagevec pvec; + LIST_HEAD(pages_to_free); struct zone *zone = mz->zone; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); - pagevec_init(&pvec, 1); - /* * Put back any unfreeable pages. 
*/ @@ -1427,17 +1425,24 @@ putback_lru_pages(struct mem_cgroup_zone *mz, struct scan_control *sc, int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; } - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); + if (put_page_testzero(page)) { + __ClearPageLRU(page); + __ClearPageActive(page); + del_page_from_lru_list(zone, page, lru); + + if (unlikely(PageCompound(page))) { + spin_unlock_irq(&zone->lru_lock); + (*get_compound_page_dtor(page))(page); + spin_lock_irq(&zone->lru_lock); + } else + list_add(&page->lru, &pages_to_free); } } __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); spin_unlock_irq(&zone->lru_lock); - pagevec_release(&pvec); + free_hot_cold_page_list(&pages_to_free, 1); } static noinline_for_stack void @@ -1647,13 +1652,23 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, static void move_active_pages_to_lru(struct zone *zone, struct list_head *list, + struct list_head *pages_to_free, enum lru_list lru) { unsigned long pgmoved = 0; - struct pagevec pvec; struct page *page; - pagevec_init(&pvec, 1); + if (buffer_heads_over_limit) { + spin_unlock_irq(&zone->lru_lock); + list_for_each_entry(page, list, lru) { + if (page_has_private(page) && trylock_page(page)) { + if (page_has_private(page)) + try_to_release_page(page, 0); + unlock_page(page); + } + } + spin_lock_irq(&zone->lru_lock); + } while (!list_empty(list)) { struct lruvec *lruvec; @@ -1667,12 +1682,17 @@ static void move_active_pages_to_lru(struct zone *zone, list_move(&page->lru, &lruvec->lists[lru]); pgmoved += hpage_nr_pages(page); - if (!pagevec_add(&pvec, page) || list_empty(list)) { - spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); + if (put_page_testzero(page)) { + __ClearPageLRU(page); + __ClearPageActive(page); + del_page_from_lru_list(zone, page, lru); + + if (unlikely(PageCompound(page))) { + spin_unlock_irq(&zone->lru_lock); + (*get_compound_page_dtor(page))(page); + spin_lock_irq(&zone->lru_lock); + } else + list_add(&page->lru, pages_to_free); } } __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); @@ -1766,12 +1786,14 @@ static void shrink_active_list(unsigned long nr_pages, */ reclaim_stat->recent_rotated[file] += nr_rotated; - move_active_pages_to_lru(zone, &l_active, + move_active_pages_to_lru(zone, &l_active, &l_hold, LRU_ACTIVE + file * LRU_FILE); - move_active_pages_to_lru(zone, &l_inactive, + move_active_pages_to_lru(zone, &l_inactive, &l_hold, LRU_BASE + file * LRU_FILE); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); + + free_hot_cold_page_list(&l_hold, 1); } #ifdef CONFIG_SWAP From 5095ae83759f035c823fb375c6ed2de99c81d5ec Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 12 Jan 2012 17:19:58 -0800 Subject: [PATCH 67/96] mm: fewer underscores in ____pagevec_lru_add What's so special about ____pagevec_lru_add() that it needs four leading underscores? Nothing, it just helped to distinguish from __pagevec_lru_add() in 2.6.28 development. Cut two leading underscores. 
Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 10 +++++----- mm/swap.c | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 9def9121f8a2..2aa12b8499c0 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -21,7 +21,7 @@ struct pagevec { }; void __pagevec_release(struct pagevec *pvec); -void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); +void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, @@ -66,22 +66,22 @@ static inline void pagevec_release(struct pagevec *pvec) static inline void __pagevec_lru_add_anon(struct pagevec *pvec) { - ____pagevec_lru_add(pvec, LRU_INACTIVE_ANON); + __pagevec_lru_add(pvec, LRU_INACTIVE_ANON); } static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec) { - ____pagevec_lru_add(pvec, LRU_ACTIVE_ANON); + __pagevec_lru_add(pvec, LRU_ACTIVE_ANON); } static inline void __pagevec_lru_add_file(struct pagevec *pvec) { - ____pagevec_lru_add(pvec, LRU_INACTIVE_FILE); + __pagevec_lru_add(pvec, LRU_INACTIVE_FILE); } static inline void __pagevec_lru_add_active_file(struct pagevec *pvec) { - ____pagevec_lru_add(pvec, LRU_ACTIVE_FILE); + __pagevec_lru_add(pvec, LRU_ACTIVE_FILE); } static inline void pagevec_lru_add_file(struct pagevec *pvec) diff --git a/mm/swap.c b/mm/swap.c index 79c22a649a3e..e1cd623d9b2b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -378,7 +378,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru) page_cache_get(page); if (!pagevec_add(pvec, page)) - ____pagevec_lru_add(pvec, lru); + __pagevec_lru_add(pvec, lru); put_cpu_var(lru_add_pvecs); } EXPORT_SYMBOL(__lru_cache_add); @@ -506,7 +506,7 @@ static void drain_cpu_pagevecs(int cpu) for_each_lru(lru) { pvec = &pvecs[lru - LRU_BASE]; if (pagevec_count(pvec)) - ____pagevec_lru_add(pvec, lru); + __pagevec_lru_add(pvec, lru); } pvec = &per_cpu(lru_rotate_pvecs, cpu); @@ -698,7 +698,7 @@ void lru_add_page_tail(struct zone* zone, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void ____pagevec_lru_add_fn(struct page *page, void *arg) +static void __pagevec_lru_add_fn(struct page *page, void *arg) { enum lru_list lru = (enum lru_list)arg; struct zone *zone = page_zone(page); @@ -720,14 +720,14 @@ static void ____pagevec_lru_add_fn(struct page *page, void *arg) * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ -void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) +void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { VM_BUG_ON(is_unevictable_lru(lru)); - pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); + pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru); } -EXPORT_SYMBOL(____pagevec_lru_add); +EXPORT_SYMBOL(__pagevec_lru_add); /** * pagevec_lookup - gang pagecache lookup From 4d06f382c733f99ec67df006255e87525ac1efd3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 12 Jan 2012 17:20:00 -0800 Subject: [PATCH 68/96] mm: no blank line after EXPORT_SYMBOL in swap.c checkpatch rightly protests WARNING: EXPORT_SYMBOL(foo); should immediately follow its function/variable so fix the five offenders in mm/swap.c. 
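[editor's note, not part of the patch: the layout checkpatch asks for, which the diff below restores in mm/swap.c; the function body is elided here]

	void __pagevec_release(struct pagevec *pvec)
	{
		/* ... body unchanged ... */
	}
	EXPORT_SYMBOL(__pagevec_release);	/* immediately after the closing brace, no blank line */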
Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index e1cd623d9b2b..0d1b24b3fa87 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -369,7 +369,6 @@ void mark_page_accessed(struct page *page) SetPageReferenced(page); } } - EXPORT_SYMBOL(mark_page_accessed); void __lru_cache_add(struct page *page, enum lru_list lru) @@ -646,7 +645,6 @@ void __pagevec_release(struct pagevec *pvec) release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); pagevec_reinit(pvec); } - EXPORT_SYMBOL(__pagevec_release); #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -726,7 +724,6 @@ void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru); } - EXPORT_SYMBOL(__pagevec_lru_add); /** @@ -751,7 +748,6 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); return pagevec_count(pvec); } - EXPORT_SYMBOL(pagevec_lookup); unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, @@ -761,7 +757,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, nr_pages, pvec->pages); return pagevec_count(pvec); } - EXPORT_SYMBOL(pagevec_lookup_tag); /* From 4111304dab198c687bc60f2e235a9f7ee92c47c8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 12 Jan 2012 17:20:01 -0800 Subject: [PATCH 69/96] mm: enum lru_list lru Mostly we use "enum lru_list lru": change those few "l"s to "lru"s. Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 26 +++++++++++++------------- include/linux/mmzone.h | 16 ++++++++-------- mm/page_alloc.c | 6 +++--- mm/vmscan.c | 22 +++++++++++----------- 4 files changed, 35 insertions(+), 35 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 4e3478e71926..8f84d2e53d0f 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -22,21 +22,21 @@ static inline int page_is_file_cache(struct page *page) } static inline void -add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list lru) { struct lruvec *lruvec; - lruvec = mem_cgroup_lru_add_list(zone, page, l); - list_add(&page->lru, &lruvec->lists[l]); - __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); + lruvec = mem_cgroup_lru_add_list(zone, page, lru); + list_add(&page->lru, &lruvec->lists[lru]); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, hpage_nr_pages(page)); } static inline void -del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) +del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list lru) { - mem_cgroup_lru_del_list(page, l); + mem_cgroup_lru_del_list(page, lru); list_del(&page->lru); - __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, -hpage_nr_pages(page)); } /** @@ -57,21 +57,21 @@ static inline enum lru_list page_lru_base_type(struct page *page) static inline void del_page_from_lru(struct zone *zone, struct page *page) { - enum lru_list l; + enum lru_list lru; if (PageUnevictable(page)) { __ClearPageUnevictable(page); - l = LRU_UNEVICTABLE; + lru = LRU_UNEVICTABLE; } else { - l = page_lru_base_type(page); + lru = 
page_lru_base_type(page); if (PageActive(page)) { __ClearPageActive(page); - l += LRU_ACTIVE; + lru += LRU_ACTIVE; } } - mem_cgroup_lru_del_list(page, l); + mem_cgroup_lru_del_list(page, lru); list_del(&page->lru); - __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, -hpage_nr_pages(page)); } /** diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2038b90ca6e3..650ba2fb3301 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -140,23 +140,23 @@ enum lru_list { NR_LRU_LISTS }; -#define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++) +#define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) -#define for_each_evictable_lru(l) for (l = 0; l <= LRU_ACTIVE_FILE; l++) +#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) -static inline int is_file_lru(enum lru_list l) +static inline int is_file_lru(enum lru_list lru) { - return (l == LRU_INACTIVE_FILE || l == LRU_ACTIVE_FILE); + return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } -static inline int is_active_lru(enum lru_list l) +static inline int is_active_lru(enum lru_list lru) { - return (l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE); + return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } -static inline int is_unevictable_lru(enum lru_list l) +static inline int is_unevictable_lru(enum lru_list lru) { - return (l == LRU_UNEVICTABLE); + return (lru == LRU_UNEVICTABLE); } struct lruvec { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cb5723c491f0..0027d8f4a1bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4262,7 +4262,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, memmap_pages; - enum lru_list l; + enum lru_list lru; size = zone_spanned_pages_in_node(nid, j, zones_size); realsize = size - zone_absent_pages_in_node(nid, j, @@ -4312,8 +4312,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->zone_pgdat = pgdat; zone_pcp_init(zone); - for_each_lru(l) - INIT_LIST_HEAD(&zone->lruvec.lists[l]); + for_each_lru(lru) + INIT_LIST_HEAD(&zone->lruvec.lists[lru]); zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; zone->reclaim_stat.recent_scanned[0] = 0; diff --git a/mm/vmscan.c b/mm/vmscan.c index 7724fb8e7498..01466bf783fd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1920,7 +1920,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, unsigned long ap, fp; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); u64 fraction[2], denominator; - enum lru_list l; + enum lru_list lru; int noswap = 0; bool force_scan = false; @@ -2010,18 +2010,18 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, fraction[1] = fp; denominator = ap + fp + 1; out: - for_each_evictable_lru(l) { - int file = is_file_lru(l); + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); unsigned long scan; - scan = zone_nr_lru_pages(mz, l); + scan = zone_nr_lru_pages(mz, lru); if (priority || noswap) { scan >>= priority; if (!scan && force_scan) scan = SWAP_CLUSTER_MAX; scan = div64_u64(scan * fraction[file], denominator); } - nr[l] = scan; + nr[lru] = scan; } } @@ -2097,7 +2097,7 @@ static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, { unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; - enum lru_list l; + enum 
lru_list lru; unsigned long nr_reclaimed, nr_scanned; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; @@ -2110,13 +2110,13 @@ restart: blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { - for_each_evictable_lru(l) { - if (nr[l]) { + for_each_evictable_lru(lru) { + if (nr[lru]) { nr_to_scan = min_t(unsigned long, - nr[l], SWAP_CLUSTER_MAX); - nr[l] -= nr_to_scan; + nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; - nr_reclaimed += shrink_list(l, nr_to_scan, + nr_reclaimed += shrink_list(lru, nr_to_scan, mz, sc, priority); } } From 1c1c53d43b387d02174911ecb42ce846577b0ea0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 12 Jan 2012 17:20:04 -0800 Subject: [PATCH 70/96] mm: remove del_page_from_lru, add page_off_lru del_page_from_lru() repeats del_page_from_lru_list(), also working out which LRU the page was on, clearing the relevant bits. Decouple those functions: remove del_page_from_lru() and add page_off_lru(). Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 15 +++++++++------ mm/swap.c | 4 ++-- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8f84d2e53d0f..227fd3e9a9c9 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -54,8 +54,14 @@ static inline enum lru_list page_lru_base_type(struct page *page) return LRU_INACTIVE_ANON; } -static inline void -del_page_from_lru(struct zone *zone, struct page *page) +/** + * page_off_lru - which LRU list was page on? clearing its lru flags. + * @page: the page to test + * + * Returns the LRU list a page was on, as an index into the array of LRU + * lists; and clears its Unevictable or Active flags, ready for freeing. + */ +static inline enum lru_list page_off_lru(struct page *page) { enum lru_list lru; @@ -69,9 +75,7 @@ del_page_from_lru(struct zone *zone, struct page *page) lru += LRU_ACTIVE; } } - mem_cgroup_lru_del_list(page, lru); - list_del(&page->lru); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -hpage_nr_pages(page)); + return lru; } /** @@ -92,7 +96,6 @@ static inline enum lru_list page_lru(struct page *page) if (PageActive(page)) lru += LRU_ACTIVE; } - return lru; } diff --git a/mm/swap.c b/mm/swap.c index 0d1b24b3fa87..b0f529b38979 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -53,7 +53,7 @@ static void __page_cache_release(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); - del_page_from_lru(zone, page); + del_page_from_lru_list(zone, page, page_off_lru(page)); spin_unlock_irqrestore(&zone->lru_lock, flags); } } @@ -617,7 +617,7 @@ void release_pages(struct page **pages, int nr, int cold) } VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); - del_page_from_lru(zone, page); + del_page_from_lru_list(zone, page, page_off_lru(page)); } list_add(&page->lru, &pages_to_free); From f626012db08b9ea71363327d81fe60c2782eea9f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 12 Jan 2012 17:20:06 -0800 Subject: [PATCH 71/96] mm: remove isolate_pages() The isolate_pages() level in vmscan.c offers little but indirection: merge it into isolate_lru_pages() as the compiler does, and use the names nr_to_scan and nr_scanned in each case. 
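For reference, the "lru += LRU_ACTIVE" / "lru += LRU_FILE" arithmetic used throughout these patches relies on the layout of enum lru_list in include/linux/mmzone.h. A rough sketch of that layout as it stood around this series (illustrative only, not part of this patch):

	#define LRU_BASE	0
	#define LRU_ACTIVE	1
	#define LRU_FILE	2

	enum lru_list {
		LRU_INACTIVE_ANON = LRU_BASE,
		LRU_ACTIVE_ANON   = LRU_BASE + LRU_ACTIVE,
		LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
		LRU_ACTIVE_FILE   = LRU_BASE + LRU_FILE + LRU_ACTIVE,
		LRU_UNEVICTABLE,
		NR_LRU_LISTS
	};

So starting from LRU_BASE and adding LRU_ACTIVE and/or LRU_FILE indexes exactly one of the four evictable lists.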
Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 61 ++++++++++++++++++++++++----------------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 01466bf783fd..9aab5dc51718 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1136,25 +1136,36 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) * Appropriate locks must be held before calling this function. * * @nr_to_scan: The number of pages to look through on the list. - * @src: The LRU list to pull pages off. + * @mz: The mem_cgroup_zone to pull pages from. * @dst: The temp list to put pages on to. - * @scanned: The number of pages that were scanned. + * @nr_scanned: The number of pages that were scanned. * @order: The caller's attempted allocation order * @mode: One of the LRU isolation modes + * @active: True [1] if isolating active pages * @file: True [1] if isolating file [!anon] pages * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, - struct list_head *src, struct list_head *dst, - unsigned long *scanned, int order, isolate_mode_t mode, - int file) + struct mem_cgroup_zone *mz, struct list_head *dst, + unsigned long *nr_scanned, int order, isolate_mode_t mode, + int active, int file) { + struct lruvec *lruvec; + struct list_head *src; unsigned long nr_taken = 0; unsigned long nr_lumpy_taken = 0; unsigned long nr_lumpy_dirty = 0; unsigned long nr_lumpy_failed = 0; unsigned long scan; + int lru = LRU_BASE; + + lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup); + if (active) + lru += LRU_ACTIVE; + if (file) + lru += LRU_FILE; + src = &lruvec->lists[lru]; for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { struct page *page; @@ -1263,7 +1274,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, nr_lumpy_failed++; } - *scanned = scan; + *nr_scanned = scan; trace_mm_vmscan_lru_isolate(order, nr_to_scan, scan, @@ -1273,23 +1284,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, return nr_taken; } -static unsigned long isolate_pages(unsigned long nr, struct mem_cgroup_zone *mz, - struct list_head *dst, - unsigned long *scanned, int order, - isolate_mode_t mode, int active, int file) -{ - struct lruvec *lruvec; - int lru = LRU_BASE; - - lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup); - if (active) - lru += LRU_ACTIVE; - if (file) - lru += LRU_FILE; - return isolate_lru_pages(nr, &lruvec->lists[lru], dst, - scanned, order, mode, file); -} - /* * clear_active_flags() is a helper for shrink_active_list(), clearing * any active bits from the pages in the list. 
@@ -1559,9 +1553,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, spin_lock_irq(&zone->lru_lock); - nr_taken = isolate_pages(nr_to_scan, mz, &page_list, - &nr_scanned, sc->order, - reclaim_mode, 0, file); + nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, + &nr_scanned, sc->order, + reclaim_mode, 0, file); if (global_reclaim(sc)) { zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1700,13 +1694,13 @@ static void move_active_pages_to_lru(struct zone *zone, __count_vm_events(PGDEACTIVATE, pgmoved); } -static void shrink_active_list(unsigned long nr_pages, +static void shrink_active_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, struct scan_control *sc, int priority, int file) { unsigned long nr_taken; - unsigned long pgscanned; + unsigned long nr_scanned; unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_active); @@ -1726,16 +1720,15 @@ static void shrink_active_list(unsigned long nr_pages, spin_lock_irq(&zone->lru_lock); - nr_taken = isolate_pages(nr_pages, mz, &l_hold, - &pgscanned, sc->order, - reclaim_mode, 1, file); - + nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, + &nr_scanned, sc->order, + reclaim_mode, 1, file); if (global_reclaim(sc)) - zone->pages_scanned += pgscanned; + zone->pages_scanned += nr_scanned; reclaim_stat->recent_scanned[file] += nr_taken; - __count_zone_vm_events(PGREFILL, zone, pgscanned); + __count_zone_vm_events(PGREFILL, zone, nr_scanned); if (file) __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); else From 3f79768f239746d19accd88da96263ef35d6a219 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 12 Jan 2012 17:20:07 -0800 Subject: [PATCH 72/96] mm: rearrange putback_inactive_pages There is sometimes confusion between the global putback_lru_pages() in migrate.c and the static putback_lru_pages() in vmscan.c: rename the latter putback_inactive_pages(): it helps shrink_inactive_list() rather as move_active_pages_to_lru() helps shrink_active_list(). Remove unused scan_control arg from putback_inactive_pages() and from update_isolated_counts(). Move clear_active_flags() inside update_isolated_counts(). Move NR_ISOLATED accounting up into shrink_inactive_list() itself, so the balance is clearer. Do the spin_lock_irq() before calling putback_inactive_pages() and spin_unlock_irq() after return from it, so that it better matches update_isolated_counts() and move_active_pages_to_lru(). Signed-off-by: Hugh Dickins Cc: Johannes Weiner Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 96 ++++++++++++++++++++++++----------------------------- 1 file changed, 44 insertions(+), 52 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9aab5dc51718..2880396f7953 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1284,32 +1284,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, return nr_taken; } -/* - * clear_active_flags() is a helper for shrink_active_list(), clearing - * any active bits from the pages in the list. 
- */ -static unsigned long clear_active_flags(struct list_head *page_list, - unsigned int *count) -{ - int nr_active = 0; - int lru; - struct page *page; - - list_for_each_entry(page, page_list, lru) { - int numpages = hpage_nr_pages(page); - lru = page_lru_base_type(page); - if (PageActive(page)) { - lru += LRU_ACTIVE; - ClearPageActive(page); - nr_active += numpages; - } - if (count) - count[lru] += numpages; - } - - return nr_active; -} - /** * isolate_lru_page - tries to isolate a page from its LRU list * @page: page to isolate from its LRU list @@ -1383,26 +1357,21 @@ static int too_many_isolated(struct zone *zone, int file, return isolated > inactive; } -/* - * TODO: Try merging with migrations version of putback_lru_pages - */ static noinline_for_stack void -putback_lru_pages(struct mem_cgroup_zone *mz, struct scan_control *sc, - unsigned long nr_anon, unsigned long nr_file, - struct list_head *page_list) +putback_inactive_pages(struct mem_cgroup_zone *mz, + struct list_head *page_list) { - struct page *page; - LIST_HEAD(pages_to_free); - struct zone *zone = mz->zone; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); + struct zone *zone = mz->zone; + LIST_HEAD(pages_to_free); /* * Put back any unfreeable pages. */ - spin_lock(&zone->lru_lock); while (!list_empty(page_list)) { + struct page *page = lru_to_page(page_list); int lru; - page = lru_to_page(page_list); + VM_BUG_ON(PageLRU(page)); list_del(&page->lru); if (unlikely(!page_evictable(page, NULL))) { @@ -1432,26 +1401,40 @@ putback_lru_pages(struct mem_cgroup_zone *mz, struct scan_control *sc, list_add(&page->lru, &pages_to_free); } } - __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); - spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&pages_to_free, 1); + /* + * To save our caller's stack, now use input list for pages to free. 
+ */ + list_splice(&pages_to_free, page_list); } static noinline_for_stack void update_isolated_counts(struct mem_cgroup_zone *mz, - struct scan_control *sc, + struct list_head *page_list, unsigned long *nr_anon, - unsigned long *nr_file, - struct list_head *isolated_list) + unsigned long *nr_file) { - unsigned long nr_active; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); struct zone *zone = mz->zone; unsigned int count[NR_LRU_LISTS] = { 0, }; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); + unsigned long nr_active = 0; + struct page *page; + int lru; + + /* + * Count pages and clear active flags + */ + list_for_each_entry(page, page_list, lru) { + int numpages = hpage_nr_pages(page); + lru = page_lru_base_type(page); + if (PageActive(page)) { + lru += LRU_ACTIVE; + ClearPageActive(page); + nr_active += numpages; + } + count[lru] += numpages; + } - nr_active = clear_active_flags(isolated_list, count); __count_vm_events(PGDEACTIVATE, nr_active); __mod_zone_page_state(zone, NR_ACTIVE_FILE, @@ -1465,8 +1448,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz, *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; - __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); reclaim_stat->recent_scanned[0] += *nr_anon; reclaim_stat->recent_scanned[1] += *nr_file; @@ -1571,7 +1552,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, return 0; } - update_isolated_counts(mz, sc, &nr_anon, &nr_file, &page_list); + update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); + + __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); spin_unlock_irq(&zone->lru_lock); @@ -1585,12 +1569,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, priority, &nr_dirty, &nr_writeback); } - local_irq_disable(); + spin_lock_irq(&zone->lru_lock); + if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_reclaimed); __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); - putback_lru_pages(mz, sc, nr_anon, nr_file, &page_list); + putback_inactive_pages(mz, &page_list); + + __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + + spin_unlock_irq(&zone->lru_lock); + + free_hot_cold_page_list(&page_list, 1); /* * If reclaim is isolating dirty pages under writeback, it implies From f1db7afd917e54711798c64d78f8f5fb090f950d Mon Sep 17 00:00:00 2001 From: Kautuk Consul Date: Thu, 12 Jan 2012 17:20:08 -0800 Subject: [PATCH 73/96] mm/vmalloc.c: eliminate extra loop in pcpu_get_vm_areas error path If either of the vas or vms arrays are not properly kzalloced, then the code jumps to the err_free label. The err_free label runs a loop to check and free each of the array members of the vas and vms arrays which is not required for this situation as none of the array members have been allocated till this point. Eliminate the extra loop we have to go through by introducing a new label err_free2 and then jumping to it. 
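For illustration, a minimal user-space analogue of the same two-label unwind pattern (hypothetical names, not kernel code): jump past the per-member loop when the outer array itself could not be allocated, since there is nothing inside it to free yet.

	#include <stdlib.h>

	struct item { int v; };

	static struct item **alloc_items(int n)
	{
		struct item **arr = calloc(n, sizeof(*arr));
		int i;

		if (!arr)
			goto err_free2;		/* no members allocated yet */
		for (i = 0; i < n; i++) {
			arr[i] = calloc(1, sizeof(*arr[i]));
			if (!arr[i])
				goto err_free;	/* unwind whatever was allocated */
		}
		return arr;
	err_free:
		for (i = 0; i < n; i++)
			free(arr[i]);		/* free(NULL) is a no-op */
	err_free2:
		free(arr);
		return NULL;
	}

The kernel patch does the same for the vas and vms arrays, with kzalloc/kfree in place of calloc/free.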
[akpm@linux-foundation.org: remove now-unneeded tests] Signed-off-by: Kautuk Consul Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 877ca046f43d..86ce9a526c17 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2378,7 +2378,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); if (!vas || !vms) - goto err_free; + goto err_free2; for (area = 0; area < nr_vms; area++) { vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); @@ -2476,11 +2476,10 @@ found: err_free: for (area = 0; area < nr_vms; area++) { - if (vas) - kfree(vas[area]); - if (vms) - kfree(vms[area]); + kfree(vas[area]); + kfree(vms[area]); } +err_free2: kfree(vas); kfree(vms); return NULL; From 9512938b885304f72c847379611d6018064af840 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Thu, 12 Jan 2012 17:20:09 -0800 Subject: [PATCH 74/96] cpumask: update setup_node_to_cpumask_map() comments node_to_cpumask() has been replaced by cpumask_of_node(), and wholly removed since commit 29c337a0 ("cpumask: remove obsolete node_to_cpumask now everyone uses cpumask_of_node"). So update the comments for setup_node_to_cpumask_map(). Signed-off-by: Wanlong Gao Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/numa.c | 2 +- arch/x86/mm/numa.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 4ff3d8e411a7..3feefc3842a8 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -58,7 +58,7 @@ static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. * - * Note: node_to_cpumask() is not valid until after this is done. + * Note: cpumask_of_node() is not valid until after this is done. */ static void __init setup_node_to_cpumask_map(void) { diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 020cd2e80873..19d3fa08b119 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -110,7 +110,7 @@ void __cpuinit numa_clear_node(int cpu) * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. * - * Note: node_to_cpumask() is not valid until after this is done. + * Note: cpumask_of_node() is not valid until after this is done. * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) */ void __init setup_node_to_cpumask_map(void) From a3dd3323058d281abd584b15ad4c5b65064d7a61 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 12 Jan 2012 17:20:11 -0800 Subject: [PATCH 75/96] kexec: remove KMSG_DUMP_KEXEC KMSG_DUMP_KEXEC is useless because we already save kernel messages inside /proc/vmcore, and it is unsafe to allow modules to do other stuffs in a crash dump scenario. [akpm@linux-foundation.org: fix powerpc build] Signed-off-by: WANG Cong Reported-by: Vivek Goyal Acked-by: Vivek Goyal Acked-by: Jarod Wilson Cc: "Eric W. 
Biederman" Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/pseries/nvram.c | 1 - drivers/char/ramoops.c | 3 +-- drivers/mtd/mtdoops.c | 3 +-- include/linux/kmsg_dump.h | 1 - kernel/kexec.c | 3 --- 5 files changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 330a57b7c17c..36f957f31842 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -638,7 +638,6 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, /* These are almost always orderly shutdowns. */ return; case KMSG_DUMP_OOPS: - case KMSG_DUMP_KEXEC: break; case KMSG_DUMP_PANIC: panicking = true; diff --git a/drivers/char/ramoops.c b/drivers/char/ramoops.c index 7c7f42a1f880..feda90cdac9f 100644 --- a/drivers/char/ramoops.c +++ b/drivers/char/ramoops.c @@ -83,8 +83,7 @@ static void ramoops_do_dump(struct kmsg_dumper *dumper, struct timeval timestamp; if (reason != KMSG_DUMP_OOPS && - reason != KMSG_DUMP_PANIC && - reason != KMSG_DUMP_KEXEC) + reason != KMSG_DUMP_PANIC) return; /* Only dump oopses if dump_oops is set */ diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index db8e8272d69b..3ce99e00a49e 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -315,8 +315,7 @@ static void mtdoops_do_dump(struct kmsg_dumper *dumper, char *dst; if (reason != KMSG_DUMP_OOPS && - reason != KMSG_DUMP_PANIC && - reason != KMSG_DUMP_KEXEC) + reason != KMSG_DUMP_PANIC) return; /* Only dump oopses if dump_oops is set */ diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index ee0c952188de..fee66317e071 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -18,7 +18,6 @@ enum kmsg_dump_reason { KMSG_DUMP_OOPS, KMSG_DUMP_PANIC, - KMSG_DUMP_KEXEC, KMSG_DUMP_RESTART, KMSG_DUMP_HALT, KMSG_DUMP_POWEROFF, diff --git a/kernel/kexec.c b/kernel/kexec.c index 090ee10d9604..20ed47ae252f 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include @@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs) if (kexec_crash_image) { struct pt_regs fixed_regs; - kmsg_dump(KMSG_DUMP_KEXEC); - crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); From 6480e5a0923756b500634d9777ec4189492fbbfe Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Thu, 12 Jan 2012 17:20:14 -0800 Subject: [PATCH 76/96] kdump: add missing RAM resource in crash_shrink_memory() When shrinking crashkernel memory using /sys/kernel/kexec_crash_size for the newly added memory no RAM resource is created at the moment. Example: $ cat /proc/iomem 00000000-bfffffff : System RAM 00000000-005b7ac3 : Kernel code 005b7ac4-009743bf : Kernel data 009bb000-00a85c33 : Kernel bss c0000000-cfffffff : Crash kernel d0000000-ffffffff : System RAM $ echo 0 > /sys/kernel/kexec_crash_size $ cat /proc/iomem 00000000-bfffffff : System RAM 00000000-005b7ac3 : Kernel code 005b7ac4-009743bf : Kernel data 009bb000-00a85c33 : Kernel bss <<-- here is System RAM missing d0000000-ffffffff : System RAM One result of this bug is that the memory chunk can never be set offline using memory hotplug. With this patch I insert a new "System RAM" resource for the released memory. 
Then the upper example looks like the following: $ echo 0 > /sys/kernel/kexec_crash_size $ cat /proc/iomem 00000000-bfffffff : System RAM 00000000-005b7ac3 : Kernel code 005b7ac4-009743bf : Kernel data 009bb000-00a85c33 : Kernel bss c0000000-cfffffff : System RAM <<-- new rescoure d0000000-ffffffff : System RAM And now I can set chunk c0000000-cfffffff offline. Signed-off-by: Michael Holzheu Cc: Vivek Goyal Cc: "Eric W. Biederman" Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/kexec.c b/kernel/kexec.c index 20ed47ae252f..60bf181b3eae 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1129,6 +1129,7 @@ int crash_shrink_memory(unsigned long new_size) { int ret = 0; unsigned long start, end; + struct resource *ram_res; mutex_lock(&kexec_mutex); @@ -1146,6 +1147,12 @@ int crash_shrink_memory(unsigned long new_size) goto unlock; } + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) { + ret = -ENOMEM; + goto unlock; + } + start = roundup(start, KEXEC_CRASH_MEM_ALIGN); end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); @@ -1154,7 +1161,15 @@ int crash_shrink_memory(unsigned long new_size) if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); + + ram_res->start = end; + ram_res->end = crashk_res.end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->name = "System RAM"; + crashk_res.end = end - 1; + + insert_resource(&iomem_resource, ram_res); crash_unmap_reserved_pages(); unlock: From bec013c40bc89671d8d457944fdf7d2b8e79d651 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Thu, 12 Jan 2012 17:20:15 -0800 Subject: [PATCH 77/96] kdump: crashk_res init check for /sys/kernel/kexec_crash_size Currently it is possible to set the crash_size via the sysfs /sys/kernel/kexec_crash_size even if no crash kernel memory has been defined with the "crashkernel" parameter. In this case "crashk_res" is not initialized and crashk_res.start = crashk_res.end = 0. Unfortunately resource_size(&crashk_res) returns 1 in this case. This breaks the s390 implementation of crash_(un)map_reserved_pages(). To fix the problem the correct "old_size" is now calculated in crash_shrink_memory(). "old_size is set to "0" if crashk_res is not initialized. With this change crash_shrink_memory() will do nothing, when "crashk_res" is not initialized. It will return "0" for "echo 0 > /sys/kernel/kexec_crash_size" and -EINVAL for "echo [not zero] > /sys/kernel/kexec_crash_size". In addition to that this patch also simplifies the "ret = -EINVAL" vs. "ret = 0" logic as suggested by Simon Horman. Signed-off-by: Michael Holzheu Reviewed-by: Dave Young Reviewed-by: WANG Cong Reviewed-by: Simon Horman Cc: Vivek Goyal Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/kexec.c b/kernel/kexec.c index 60bf181b3eae..7b0886786701 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1129,6 +1129,7 @@ int crash_shrink_memory(unsigned long new_size) { int ret = 0; unsigned long start, end; + unsigned long old_size; struct resource *ram_res; mutex_lock(&kexec_mutex); @@ -1139,11 +1140,9 @@ int crash_shrink_memory(unsigned long new_size) } start = crashk_res.start; end = crashk_res.end; - - if (new_size >= end - start + 1) { - ret = -EINVAL; - if (new_size == end - start + 1) - ret = 0; + old_size = (end == 0) ? 
0 : end - start + 1; + if (new_size >= old_size) { + ret = (new_size == old_size) ? 0 : -EINVAL; goto unlock; } From 93e13a360ba331915220f82f6e9543df961ffa1f Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Thu, 12 Jan 2012 17:20:18 -0800 Subject: [PATCH 78/96] kdump: fix crash_kexec()/smp_send_stop() race in panic() When two CPUs call panic at the same time there is a possible race condition that can stop kdump. The first CPU calls crash_kexec() and the second CPU calls smp_send_stop() in panic() before crash_kexec() finished on the first CPU. So the second CPU stops the first CPU and therefore kdump fails: 1st CPU: panic()->crash_kexec()->mutex_trylock(&kexec_mutex)-> do kdump 2nd CPU: panic()->crash_kexec()->kexec_mutex already held by 1st CPU ->smp_send_stop()-> stop 1st CPU (stop kdump) This patch fixes the problem by introducing a spinlock in panic that allows only one CPU to process crash_kexec() and the subsequent panic code. All other CPUs call the weak function panic_smp_self_stop() that stops the CPU itself. This function can be overloaded by architecture code. For example "tile" can use their lower-power "nap" instruction for that. Signed-off-by: Michael Holzheu Acked-by: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/kernel/panic.c b/kernel/panic.c index 6fd09ed6fd90..5dce5404eeef 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -49,6 +49,15 @@ static long no_blink(int state) long (*panic_blink)(int state); EXPORT_SYMBOL(panic_blink); +/* + * Stop ourself in panic -- architecture code may override this + */ +void __weak panic_smp_self_stop(void) +{ + while (1) + cpu_relax(); +} + /** * panic - halt the system * @fmt: The text string to print @@ -59,6 +68,7 @@ EXPORT_SYMBOL(panic_blink); */ void panic(const char *fmt, ...) { + static DEFINE_SPINLOCK(panic_lock); static char buf[1024]; va_list args; long i, i_next = 0; @@ -68,8 +78,14 @@ void panic(const char *fmt, ...) * It's possible to come here directly from a panic-assertion and * not have preempt disabled. Some functions called from here want * preempt to be disabled. No point enabling it later though... + * + * Only one CPU is allowed to execute the panic code from here. For + * multiple parallel invocations of panic, all other CPUs either + * stop themself or will wait until they are stopped by the 1st CPU + * with smp_send_stop(). */ - preempt_disable(); + if (!spin_trylock(&panic_lock)) + panic_smp_self_stop(); console_verbose(); bust_spinlocks(1); From 1f536b9e9f85456df93614b3c2f6a1a2b7d7cb9b Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 12 Jan 2012 17:20:20 -0800 Subject: [PATCH 79/96] include/linux/crash_dump.h needs elf.h Building an ARM target we get the following warnings: CC arch/arm/kernel/setup.o In file included from arch/arm/kernel/setup.c:39: arch/arm/include/asm/elf.h:102:1: warning: "vmcore_elf64_check_arch" redefined In file included from arch/arm/kernel/setup.c:24: include/linux/crash_dump.h:30:1: warning: this is the location of the previous definition Quoting Russell King: "linux/crash_dump.h makes no attempt to include asm/elf.h, but it depends on stuff in asm/elf.h to determine how stuff inside this file is defined at parse time. So, if asm/elf.h is included after linux/crash_dump.h or not at all, you get a different result from the situation where asm/elf.h is included before." So add elf.h header to crash_dump.h to avoid this problem. 
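For illustration, the underlying hazard is the usual one with conditional macro definitions across headers (hypothetical macro names, not the actual crash_dump.h contents):

	/* generic header: provides a fallback only if nothing was defined yet */
	#ifndef check_arch
	#define check_arch(x)	(1)
	#endif

	/* arch header: provides the real definition */
	#define check_arch(x)	arch_check(x)

Whether the fallback or the arch definition wins, and whether the compiler warns about a redefinition, depends purely on which header is parsed first, which is why crash_dump.h must pull in elf.h itself rather than rely on its users to do so.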
The original discussion about this can be found at: http://www.spinics.net/lists/arm-kernel/msg154113.html Signed-off-by: Fabio Estevam Cc: Russell King Cc: [3.2.1] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_dump.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 5c4abce94ad1..b936763f2236 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -5,6 +5,7 @@ #include #include #include +#include #define ELFCORE_ADDR_MAX (-1ULL) #define ELFCORE_ADDR_ERR (-2ULL) From f5138e42211d4e8bfbd6ac5b3816348da1533433 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Thu, 12 Jan 2012 17:20:23 -0800 Subject: [PATCH 80/96] kdump: add udev events for memory online/offline Currently no udev events for memory hotplug "online" and "offline" are generated: # udevadm monitor # echo offline > /sys/devices/system/memory/memory4/state ==> No event When kdump is loaded, kexec detects the current memory configuration and stores it in the pre-allocated ELF core header. Therefore, for kdump it is necessary to reload the kdump kernel with kexec when the memory configuration changes (e.g. for online/offline hotplug memory). In order to do this automatically, udev rules should be used. This kernel patch adds udev events for "online" and "offline". Together with this kernel patch, the following udev rules for online/offline have to be added to "/etc/udev/rules.d/98-kexec.rules": SUBSYSTEM=="memory", ACTION=="online", PROGRAM="/etc/init.d/kdump restart" SUBSYSTEM=="memory", ACTION=="offline", PROGRAM="/etc/init.d/kdump restart" [sfr@canb.auug.org.au: fixups for class to subsystem conversion] Signed-off-by: Michael Holzheu Cc: Heiko Carstens Cc: Vivek Goyal Cc: "Eric W. Biederman" Cc: Kay Sievers Cc: Dave Hansen Cc: Martin Schwidefsky Cc: Greg KH Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f17e3ea041c0..ed5de58c340f 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -295,11 +295,22 @@ static int memory_block_change_state(struct memory_block *mem, ret = memory_block_action(mem->start_section_nr, to_state); - if (ret) + if (ret) { mem->state = from_state_req; - else - mem->state = to_state; + goto out; + } + mem->state = to_state; + switch (mem->state) { + case MEM_OFFLINE: + kobject_uevent(&mem->dev.kobj, KOBJ_OFFLINE); + break; + case MEM_ONLINE: + kobject_uevent(&mem->dev.kobj, KOBJ_ONLINE); + break; + default: + break; + } out: mutex_unlock(&mem->state_mutex); return ret; From b8f566b04d3cddd192cfd2418ae6d54ac6353792 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 12 Jan 2012 17:20:27 -0800 Subject: [PATCH 81/96] sysctl: add the kernel.ns_last_pid control The sysctl works on the current task's pid namespace, getting and setting its last_pid field. Writing is allowed for CAP_SYS_ADMIN-capable tasks thus making it possible to create a task with desired pid value. This ability is required badly for the checkpoint/restore in userspace. This approach suits all the parties for now. Signed-off-by: Pavel Emelyanov Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Cyrill Gorcunov Cc: "Eric W. 
Biederman" Cc: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 8 ++++++++ kernel/pid.c | 4 +++- kernel/pid_namespace.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 6d8cd8b2c30d..8c20fbd8b42d 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -415,6 +415,14 @@ PIDs of value pid_max or larger are not allocated. ============================================================== +ns_last_pid: + +The last pid allocated in the current (the one task using this sysctl +lives in) pid namespace. When selecting a pid for a next task on fork +kernel tries to allocate a number starting from this one. + +============================================================== + powersave-nap: (PPC only) If set, Linux-PPC will use the 'nap' mode of powersaving, diff --git a/kernel/pid.c b/kernel/pid.c index fa5f72227e5f..ce8e00deaccb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b) } /* - * We might be racing with someone else trying to set pid_ns->last_pid. + * We might be racing with someone else trying to set pid_ns->last_pid + * at the pid allocation time (there's also a sysctl for this, but racing + * with this one is OK, see comment in kernel/pid_namespace.c about it). * We want the winner to have the "later" value, because if the * "earlier" value prevails, then a pid may get reused immediately. * diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index e9c9adc84ca6..a8968396046d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) return; } +static int pid_ns_ctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Writing directly to ns' last_pid field is OK, since this field + * is volatile in a living namespace anyway and a code writing to + * it should synchronize its usage with external means. + */ + + tmp.data = ¤t->nsproxy->pid_ns->last_pid; + return proc_dointvec(&tmp, write, buffer, lenp, ppos); +} + +static struct ctl_table pid_ns_ctl_table[] = { + { + .procname = "ns_last_pid", + .maxlen = sizeof(int), + .mode = 0666, /* permissions are checked in the handler */ + .proc_handler = pid_ns_ctl_handler, + }, + { } +}; + +static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + register_sysctl_paths(kern_path, pid_ns_ctl_table); return 0; } From 6e6f0a1f0fa6bba1493c296eb30d1e176e1f8530 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Jan 2012 17:20:30 -0800 Subject: [PATCH 82/96] panic: don't print redundant backtraces on oops When an oops causes a panic and panic prints another backtrace it's pretty common to have the original oops data be scrolled away on a 80x50 screen. The second backtrace is quite redundant and not needed anyways. So don't print the panic backtrace when oops_in_progress is true. 
[akpm@linux-foundation.org: add comment] Signed-off-by: Andi Kleen Cc: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/panic.c b/kernel/panic.c index 5dce5404eeef..80aed44e345a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -94,7 +94,11 @@ void panic(const char *fmt, ...) va_end(args); printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); #ifdef CONFIG_DEBUG_BUGVERBOSE - dump_stack(); + /* + * Avoid nested stack-dumping if a panic occurs during oops processing + */ + if (!oops_in_progress) + dump_stack(); #endif /* From 45dac90f0ca7d9efeda8f3d416c808c71207bf20 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 12 Jan 2012 17:20:32 -0800 Subject: [PATCH 83/96] drivers/parport/parport_pc.c: fix warnings drivers/parport/parport_pc.c: In function '__check_irq': drivers/parport/parport_pc.c:3415: warning: return from incompatible pointer type drivers/parport/parport_pc.c: In function '__check_dma': drivers/parport/parport_pc.c:3417: warning: return from incompatible pointer type Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/parport/parport_pc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c index d0b597b50398..0cb64f50cecd 100644 --- a/drivers/parport/parport_pc.c +++ b/drivers/parport/parport_pc.c @@ -3404,8 +3404,8 @@ static int __init parport_init_mode_setup(char *str) #endif #ifdef MODULE -static const char *irq[PARPORT_PC_MAX_PORTS]; -static const char *dma[PARPORT_PC_MAX_PORTS]; +static char *irq[PARPORT_PC_MAX_PORTS]; +static char *dma[PARPORT_PC_MAX_PORTS]; MODULE_PARM_DESC(io, "Base I/O address (SPP regs)"); module_param_array(io, int, NULL, 0); From ae55e1aaa7e2e57e538cb98cf617f511c5dc4f73 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Jan 2012 17:20:33 -0800 Subject: [PATCH 84/96] fs/direct-io.c: calculate fs_count correctly in get_more_blocks() In get_more_blocks(), we use dio_count to calcuate fs_count and do some tricky things to increase fs_count if dio_count isn't aligned. But actually it still has some corner cases that can't be coverd. See the following example: dio_write foo -s 1024 -w 4096 (direct write 4096 bytes at offset 1024). The same goes if the offset isn't aligned to fs_blocksize. In this case, the old calculation counts fs_count to be 1, but actually we will write into 2 different blocks (if fs_blocksize=4096). The old code just works, since it will call get_block twice (and may have to allocate and create extents twice for filesystems like ext4). So we'd better call get_block just once with the proper fs_count. 
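Working the quoted example through both calculations, assuming a 512-byte logical block device (so blkbits = 9 and blkfactor = 12 - 9 = 3 for a 4096-byte filesystem block):

	offset = 1024, count = 4096
	block_in_file          = 1024 >> 9 = 2
	final_block_in_request = (1024 + 4096) >> 9 = 10

	old: dio_count = 10 - 2 = 8; fs_count = 8 >> 3 = 1; (8 & 7) == 0, so no round-up -> 1
	new: fs_startblk = 2 >> 3 = 0; fs_endblk = (10 - 1) >> 3 = 1; fs_count = 1 - 0 + 1 -> 2

The write covers file bytes 1024..5119, i.e. filesystem blocks 0 and 1, so the new result of 2 matches the blocks actually touched.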
Signed-off-by: Tao Ma Cc: "Theodore Ts'o" Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index d740ab67ff6e..389863f59cda 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -580,9 +580,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, { int ret; sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ + sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ - unsigned long dio_count;/* Number of dio_block-sized blocks */ - unsigned long blkmask; int create; /* @@ -593,11 +592,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, if (ret == 0) { BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); fs_startblk = sdio->block_in_file >> sdio->blkfactor; - dio_count = sdio->final_block_in_request - sdio->block_in_file; - fs_count = dio_count >> sdio->blkfactor; - blkmask = (1 << sdio->blkfactor) - 1; - if (dio_count & blkmask) - fs_count++; + fs_endblk = (sdio->final_block_in_request - 1) >> + sdio->blkfactor; + fs_count = fs_endblk - fs_startblk + 1; map_bh->b_state = 0; map_bh->b_size = fs_count << dio->inode->i_blkbits; From 87192a2a49c475cf322cb143e0fa63b0102d8567 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Jan 2012 17:20:34 -0800 Subject: [PATCH 85/96] vfs: cache request_queue in struct block_device This makes it possible to get from the inode to the request_queue with one less cache miss. Used in followon optimization. The livetime of the pointer is the same as the gendisk. This assumes that the queue will always stay the same in the gendisk while it's visible to block_devices. I think that's safe correct? Signed-off-by: Andi Kleen Acked-by: Jeff Moyer Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 3 +++ include/linux/fs.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/fs/block_dev.c b/fs/block_dev.c index afe74dda632b..0e575d1304b4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1139,6 +1139,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { bdev->bd_disk = disk; + bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; if (!partno) { struct backing_dev_info *bdi; @@ -1159,6 +1160,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; + bdev->bd_queue = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); put_disk(disk); @@ -1232,6 +1234,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; + bdev->bd_queue = NULL; bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); diff --git a/include/linux/fs.h b/include/linux/fs.h index e694bd4434a4..4bc8169fb5a1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -660,6 +660,7 @@ struct address_space { * must be enforced here for CRIS, to let the least significant bit * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. 
*/ +struct request_queue; struct block_device { dev_t bd_dev; /* not a kdev_t - it's a search key */ @@ -682,6 +683,7 @@ struct block_device { unsigned bd_part_count; int bd_invalidated; struct gendisk * bd_disk; + struct request_queue * bd_queue; struct list_head bd_list; /* * Private data. You must have bd_claim'ed the block_device From 65dd2aa90aa17a26703c28652408192856aa0396 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Jan 2012 17:20:35 -0800 Subject: [PATCH 86/96] dio: optimize cache misses in the submission path Some investigation of a transaction processing workload showed that a major consumer of cycles in __blockdev_direct_IO is the cache miss while accessing the block size. This is because it has to walk the chain from block_dev to gendisk to queue. The block size is needed early on to check alignment and sizes. It's only done if the check for the inode block size fails. But the costly block device state is unconditionally fetched. - Reorganize the code to only fetch block dev state when actually needed. Then do a prefetch on the block dev early on in the direct IO path. This is worth it, because there is substantial code run before we actually touch the block dev now. - I also added some unlikelies to make it clear the compiler that block device fetch code is not normally executed. This gave a small, but measurable improvement on a large database benchmark (about 0.3%) [akpm@linux-foundation.org: coding-style fixes] [sfr@canb.auug.org.au: using prefetch requires including prefetch.h] Signed-off-by: Andi Kleen Cc: Jeff Moyer Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 46 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 389863f59cda..4a588dbd11bf 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -36,6 +36,7 @@ #include #include #include +#include /* * How many user pages to map in one call to get_user_pages(). This determines @@ -1087,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio) * individual fields and will generate much worse code. This is important * for the whole file. */ -ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +static inline ssize_t +do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) @@ -1097,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, size_t size; unsigned long addr; unsigned blkbits = inode->i_blkbits; - unsigned bdev_blkbits = 0; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; loff_t end = offset; @@ -1110,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (rw & WRITE) rw = WRITE_ODIRECT; - if (bdev) - bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); + /* + * Avoid references to bdev if not absolutely needed to give + * the early prefetch in the caller enough time. 
+ */ if (offset & blocksize_mask) { if (bdev) - blkbits = bdev_blkbits; + blkbits = blksize_bits(bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; if (offset & blocksize_mask) goto out; @@ -1126,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, addr = (unsigned long)iov[seg].iov_base; size = iov[seg].iov_len; end += size; - if ((addr & blocksize_mask) || (size & blocksize_mask)) { + if (unlikely((addr & blocksize_mask) || + (size & blocksize_mask))) { if (bdev) - blkbits = bdev_blkbits; + blkbits = blksize_bits( + bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; - if ((addr & blocksize_mask) || (size & blocksize_mask)) + if ((addr & blocksize_mask) || (size & blocksize_mask)) goto out; } } @@ -1313,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, out: return retval; } + +ssize_t +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) +{ + /* + * The block device state is needed in the end to finally + * submit everything. Since it's likely to be cache cold + * prefetch it here as first thing to hide some of the + * latency. + * + * Attempt to prefetch the pieces we likely need later. + */ + prefetch(&bdev->bd_disk->part_tbl); + prefetch(bdev->bd_queue); + prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); + + return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, + submit_io, flags); +} + EXPORT_SYMBOL(__blockdev_direct_IO); static __init int dio_init(void) From 928da837aca77a9d3cb5076bf07b3224b1ba293b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 12 Jan 2012 17:20:39 -0800 Subject: [PATCH 87/96] radix_tree: remove radix_tree_indirect_to_ptr() It is not used anymore, remove it Signed-off-by: Xiao Guangrong Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 9d4539c52e53..07e360b1b282 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -49,9 +49,6 @@ #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 -#define radix_tree_indirect_to_ptr(ptr) \ - radix_tree_indirect_to_ptr((void __force *)(ptr)) - static inline int radix_tree_is_indirect_ptr(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); From e2bdb933ab8b7db71c318a4ddcf78a9fffd61ecb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 12 Jan 2012 17:20:41 -0800 Subject: [PATCH 88/96] radix_tree: take radix_tree_path off stack Down, down in the deepest depths of GFP_NOIO page reclaim, we have shrink_page_list() calling __remove_mapping() calling __delete_from_ swap_cache() or __delete_from_page_cache(). You would not expect those to need much stack, but in fact they call radix_tree_delete(): which declares a 192-byte radix_tree_path array on its stack (to record the node,offsets it visits when descending, in case it needs to ascend to update them). And if any tag is still set [1], that calls radix_tree_tag_clear(), which declares a further such 192-byte radix_tree_path array on the stack. (At least we have interrupts disabled here, so won't then be pushing registers too.) 
That was probably a good choice when most users were 32-bit (array of half the size), and adding fields to radix_tree_node would have bloated it unnecessarily. But nowadays many are 64-bit, and each radix_tree_node contains a struct rcu_head, which is only used when freeing; whereas the radix_tree_path info is only used for updating the tree (deleting, clearing tags or setting tags if tagged) when a lock must be held, of no interest when accessing the tree locklessly. So add a parent pointer to the radix_tree_node, in union with the rcu_head, and remove all uses of the radix_tree_path. There would be space in that union to save the offset when descending as before (we can argue that a lock must already be held to exclude other users), but recalculating it when ascending is both easy (a constant shift and a constant mask) and uncommon, so it seems better just to do that. Two little optimizations: no need to decrement height when descending, adjusting shift is enough; and once radix_tree_tag_if_tagged() has set tag on a node and its ancestors, it need not ascend from that node again. perf on the radix tree test harness reports radix_tree_insert() as 2% slower (now having to set parent), but radix_tree_delete() 24% faster. Surely that's an exaggeration from rtth's artificially low map shift 3, but forcing it back to 6 still rates radix_tree_delete() 8% faster. [1] Can a pagecache tag (dirty, writeback or towrite) actually still be set at the time of radix_tree_delete()? Perhaps not if the filesystem is well-behaved. But although I've not tracked any stack overflow down to this cause, I have observed a curious case in which a dirty tag is set and left set on tmpfs: page migration's migrate_page_copy() happens to use __set_page_dirty_nobuffers() to set PageDirty on the newpage, and that sets PAGECACHE_TAG_DIRTY as a side-effect - harmless to a filesystem which doesn't use tags, except for this stack depth issue. Signed-off-by: Hugh Dickins Cc: Jan Kara Cc: Dave Chinner Cc: Mel Gorman Cc: Nai Xia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 158 +++++++++++++++++++++++------------------------ 1 file changed, 78 insertions(+), 80 deletions(-) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index d9df7454519c..dc63d0818394 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -48,16 +48,14 @@ struct radix_tree_node { unsigned int height; /* Height from the bottom */ unsigned int count; - struct rcu_head rcu_head; + union { + struct radix_tree_node *parent; /* Used when ascending tree */ + struct rcu_head rcu_head; /* Used when freeing node */ + }; void __rcu *slots[RADIX_TREE_MAP_SIZE]; unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; -struct radix_tree_path { - struct radix_tree_node *node; - int offset; -}; - #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) @@ -256,6 +254,7 @@ static inline unsigned long radix_tree_maxindex(unsigned int height) static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) { struct radix_tree_node *node; + struct radix_tree_node *slot; unsigned int height; int tag; @@ -274,18 +273,23 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) if (!(node = radix_tree_node_alloc(root))) return -ENOMEM; - /* Increase the height. 
*/ - node->slots[0] = indirect_to_ptr(root->rnode); - /* Propagate the aggregated tag info into the new root */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { if (root_tag_get(root, tag)) tag_set(node, tag, 0); } + /* Increase the height. */ newheight = root->height+1; node->height = newheight; node->count = 1; + node->parent = NULL; + slot = root->rnode; + if (newheight > 1) { + slot = indirect_to_ptr(slot); + slot->parent = node; + } + node->slots[0] = slot; node = ptr_to_indirect(node); rcu_assign_pointer(root->rnode, node); root->height = newheight; @@ -331,6 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root, if (!(slot = radix_tree_node_alloc(root))) return -ENOMEM; slot->height = height; + slot->parent = node; if (node) { rcu_assign_pointer(node->slots[offset], slot); node->count++; @@ -504,47 +509,41 @@ EXPORT_SYMBOL(radix_tree_tag_set); void *radix_tree_tag_clear(struct radix_tree_root *root, unsigned long index, unsigned int tag) { - /* - * The radix tree path needs to be one longer than the maximum path - * since the "list" is null terminated. - */ - struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path; + struct radix_tree_node *node = NULL; struct radix_tree_node *slot = NULL; unsigned int height, shift; + int uninitialized_var(offset); height = root->height; if (index > radix_tree_maxindex(height)) goto out; - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - pathp->node = NULL; + shift = height * RADIX_TREE_MAP_SHIFT; slot = indirect_to_ptr(root->rnode); - while (height > 0) { - int offset; - + while (shift) { if (slot == NULL) goto out; - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - pathp[1].offset = offset; - pathp[1].node = slot; - slot = slot->slots[offset]; - pathp++; shift -= RADIX_TREE_MAP_SHIFT; - height--; + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + node = slot; + slot = slot->slots[offset]; } if (slot == NULL) goto out; - while (pathp->node) { - if (!tag_get(pathp->node, tag, pathp->offset)) + while (node) { + if (!tag_get(node, tag, offset)) goto out; - tag_clear(pathp->node, tag, pathp->offset); - if (any_tag_set(pathp->node, tag)) + tag_clear(node, tag, offset); + if (any_tag_set(node, tag)) goto out; - pathp--; + + index >>= RADIX_TREE_MAP_SHIFT; + offset = index & RADIX_TREE_MAP_MASK; + node = node->parent; } /* clear the root's tag bit */ @@ -646,8 +645,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, unsigned int iftag, unsigned int settag) { unsigned int height = root->height; - struct radix_tree_path path[height]; - struct radix_tree_path *pathp = path; + struct radix_tree_node *node = NULL; struct radix_tree_node *slot; unsigned int shift; unsigned long tagged = 0; @@ -671,14 +669,8 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, shift = (height - 1) * RADIX_TREE_MAP_SHIFT; slot = indirect_to_ptr(root->rnode); - /* - * we fill the path from (root->height - 2) to 0, leaving the index at - * (root->height - 1) as a terminator. Zero the node in the terminator - * so that we can use this to end walk loops back up the path. 
- */ - path[height - 1].node = NULL; - for (;;) { + unsigned long upindex; int offset; offset = (index >> shift) & RADIX_TREE_MAP_MASK; @@ -686,12 +678,10 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, goto next; if (!tag_get(slot, iftag, offset)) goto next; - if (height > 1) { + if (shift) { /* Go down one level */ - height--; shift -= RADIX_TREE_MAP_SHIFT; - path[height - 1].node = slot; - path[height - 1].offset = offset; + node = slot; slot = slot->slots[offset]; continue; } @@ -701,15 +691,27 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, tag_set(slot, settag, offset); /* walk back up the path tagging interior nodes */ - pathp = &path[0]; - while (pathp->node) { + upindex = index; + while (node) { + upindex >>= RADIX_TREE_MAP_SHIFT; + offset = upindex & RADIX_TREE_MAP_MASK; + /* stop if we find a node with the tag already set */ - if (tag_get(pathp->node, settag, pathp->offset)) + if (tag_get(node, settag, offset)) break; - tag_set(pathp->node, settag, pathp->offset); - pathp++; + tag_set(node, settag, offset); + node = node->parent; } + /* + * Small optimization: now clear that node pointer. + * Since all of this slot's ancestors now have the tag set + * from setting it above, we have no further need to walk + * back up the tree setting tags, until we update slot to + * point to another radix_tree_node. + */ + node = NULL; + next: /* Go to next item at level determined by 'shift' */ index = ((index >> shift) + 1) << shift; @@ -724,8 +726,7 @@ next: * last_index is guaranteed to be in the tree, what * we do below cannot wander astray. */ - slot = path[height - 1].node; - height++; + slot = slot->parent; shift += RADIX_TREE_MAP_SHIFT; } } @@ -1299,7 +1300,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) /* try to shrink tree height */ while (root->height > 0) { struct radix_tree_node *to_free = root->rnode; - void *newptr; + struct radix_tree_node *slot; BUG_ON(!radix_tree_is_indirect_ptr(to_free)); to_free = indirect_to_ptr(to_free); @@ -1320,10 +1321,12 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) * (to_free->slots[0]), it will be safe to dereference the new * one (root->rnode) as far as dependent read barriers go. */ - newptr = to_free->slots[0]; - if (root->height > 1) - newptr = ptr_to_indirect(newptr); - root->rnode = newptr; + slot = to_free->slots[0]; + if (root->height > 1) { + slot->parent = NULL; + slot = ptr_to_indirect(slot); + } + root->rnode = slot; root->height--; /* @@ -1363,16 +1366,12 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) */ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) { - /* - * The radix tree path needs to be one longer than the maximum path - * since the "list" is null terminated. 
- */ - struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path; + struct radix_tree_node *node = NULL; struct radix_tree_node *slot = NULL; struct radix_tree_node *to_free; unsigned int height, shift; int tag; - int offset; + int uninitialized_var(offset); height = root->height; if (index > radix_tree_maxindex(height)) @@ -1385,39 +1384,35 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) goto out; } slot = indirect_to_ptr(slot); - - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - pathp->node = NULL; + shift = height * RADIX_TREE_MAP_SHIFT; do { if (slot == NULL) goto out; - pathp++; - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - pathp->offset = offset; - pathp->node = slot; - slot = slot->slots[offset]; shift -= RADIX_TREE_MAP_SHIFT; - height--; - } while (height > 0); + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + node = slot; + slot = slot->slots[offset]; + } while (shift); if (slot == NULL) goto out; /* - * Clear all tags associated with the just-deleted item + * Clear all tags associated with the item to be deleted. + * This way of doing it would be inefficient, but seldom is any set. */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (tag_get(pathp->node, tag, pathp->offset)) + if (tag_get(node, tag, offset)) radix_tree_tag_clear(root, index, tag); } to_free = NULL; /* Now free the nodes we do not need anymore */ - while (pathp->node) { - pathp->node->slots[pathp->offset] = NULL; - pathp->node->count--; + while (node) { + node->slots[offset] = NULL; + node->count--; /* * Queue the node for deferred freeing after the * last reference to it disappears (set NULL, above). @@ -1425,17 +1420,20 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) if (to_free) radix_tree_node_free(to_free); - if (pathp->node->count) { - if (pathp->node == indirect_to_ptr(root->rnode)) + if (node->count) { + if (node == indirect_to_ptr(root->rnode)) radix_tree_shrink(root); goto out; } /* Node with zero slots in use so free it */ - to_free = pathp->node; - pathp--; + to_free = node; + index >>= RADIX_TREE_MAP_SHIFT; + offset = index & RADIX_TREE_MAP_MASK; + node = node->parent; } + root_tag_clear_all(root); root->height = 0; root->rnode = NULL; From 274343ad3e63c4dcee6744a75b5553940de4a0f6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 12 Jan 2012 17:20:44 -0800 Subject: [PATCH 89/96] selftests: new very basic kernel selftests directory Bring a new kernel selftests directory in tools/testing/selftests. To add a new selftest, create a subdirectory with the sources and a makefile that creates a target named "run_test" then add the subdirectory name to the TARGET var in tools/testing/selftests/Makefile and tools/testing/selftests/run_tests script. This can help centralizing and maintaining any useful selftest that developers usually tend to let rust in peace on some random server. Suggested-by: Andrew Morton Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Jason Wessel Cc: Will Deacon Cc: Steven Rostedt Cc: Michal Marek Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/Makefile | 11 +++++++++++ tools/testing/selftests/run_tests | 8 ++++++++ 2 files changed, 19 insertions(+) create mode 100644 tools/testing/selftests/Makefile create mode 100644 tools/testing/selftests/run_tests diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile new file mode 100644 index 000000000000..298a5c0c1ead --- /dev/null +++ b/tools/testing/selftests/Makefile @@ -0,0 +1,11 @@ +TARGETS = + +all: + for TARGET in $(TARGETS); do \ + make -C $$TARGET; \ + done; + +clean: + for TARGET in $(TARGETS); do \ + make -C $$TARGET clean; \ + done; diff --git a/tools/testing/selftests/run_tests b/tools/testing/selftests/run_tests new file mode 100644 index 000000000000..701960d9e531 --- /dev/null +++ b/tools/testing/selftests/run_tests @@ -0,0 +1,8 @@ +#!/bin/bash + +TARGETS= + +for TARGET in $TARGETS +do + $TARGET/run_test +done From 85bbddc37b2bf947a577d572b1c4c23bf829217f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 12 Jan 2012 17:20:46 -0800 Subject: [PATCH 90/96] selftests: new x86 breakpoints selftest Bring a first selftest in the relevant directory. This tests several combinations of breakpoints and watchpoints in x86, as well as icebp traps and int3 traps. Given the amount of breakpoint regressions we raised after we merged the generic breakpoint infrastructure, such selftest became necessary and can still serve today as a basis for new patches that touch the do_debug() path. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Jason Wessel Cc: Will Deacon Cc: Michal Marek Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/Makefile | 2 +- tools/testing/selftests/breakpoints/Makefile | 20 + .../selftests/breakpoints/breakpoint_test.c | 394 ++++++++++++++++++ tools/testing/selftests/run_tests | 2 +- 4 files changed, 416 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/breakpoints/Makefile create mode 100644 tools/testing/selftests/breakpoints/breakpoint_test.c diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 298a5c0c1ead..4ec84018cc13 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,4 +1,4 @@ -TARGETS = +TARGETS = breakpoints all: for TARGET in $(TARGETS); do \ diff --git a/tools/testing/selftests/breakpoints/Makefile b/tools/testing/selftests/breakpoints/Makefile new file mode 100644 index 000000000000..f362722cdce7 --- /dev/null +++ b/tools/testing/selftests/breakpoints/Makefile @@ -0,0 +1,20 @@ +# Taken from perf makefile +uname_M := $(shell uname -m 2>/dev/null || echo not) +ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) +ifeq ($(ARCH),i386) + ARCH := x86 +endif +ifeq ($(ARCH),x86_64) + ARCH := x86 +endif + + +all: +ifeq ($(ARCH),x86) + gcc breakpoint_test.c -o run_test +else + echo "Not an x86 target, can't build breakpoints selftests" +endif + +clean: + rm -fr run_test diff --git a/tools/testing/selftests/breakpoints/breakpoint_test.c b/tools/testing/selftests/breakpoints/breakpoint_test.c new file mode 100644 index 000000000000..a0743f3b2b57 --- /dev/null +++ b/tools/testing/selftests/breakpoints/breakpoint_test.c @@ -0,0 +1,394 @@ +/* + * Copyright (C) 2011 Red Hat, Inc., Frederic Weisbecker + * + * Licensed under the terms of the GNU GPL License 
version 2 + * + * Selftests for breakpoints (and more generally the do_debug() path) in x86. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Breakpoint access modes */ +enum { + BP_X = 1, + BP_RW = 2, + BP_W = 4, +}; + +static pid_t child_pid; + +/* + * Ensures the child and parent are always "talking" about + * the same test sequence. (ie: that we haven't forgotten + * to call check_trapped() somewhere). + */ +static int nr_tests; + +static void set_breakpoint_addr(void *addr, int n) +{ + int ret; + + ret = ptrace(PTRACE_POKEUSER, child_pid, + offsetof(struct user, u_debugreg[n]), addr); + if (ret) { + perror("Can't set breakpoint addr\n"); + exit(-1); + } +} + +static void toggle_breakpoint(int n, int type, int len, + int local, int global, int set) +{ + int ret; + + int xtype, xlen; + unsigned long vdr7, dr7; + + switch (type) { + case BP_X: + xtype = 0; + break; + case BP_W: + xtype = 1; + break; + case BP_RW: + xtype = 3; + break; + } + + switch (len) { + case 1: + xlen = 0; + break; + case 2: + xlen = 4; + break; + case 4: + xlen = 0xc; + break; + case 8: + xlen = 8; + break; + } + + dr7 = ptrace(PTRACE_PEEKUSER, child_pid, + offsetof(struct user, u_debugreg[7]), 0); + + vdr7 = (xlen | xtype) << 16; + vdr7 <<= 4 * n; + + if (local) { + vdr7 |= 1 << (2 * n); + vdr7 |= 1 << 8; + } + if (global) { + vdr7 |= 2 << (2 * n); + vdr7 |= 1 << 9; + } + + if (set) + dr7 |= vdr7; + else + dr7 &= ~vdr7; + + ret = ptrace(PTRACE_POKEUSER, child_pid, + offsetof(struct user, u_debugreg[7]), dr7); + if (ret) { + perror("Can't set dr7"); + exit(-1); + } +} + +/* Dummy variables to test read/write accesses */ +static unsigned long long dummy_var[4]; + +/* Dummy functions to test execution accesses */ +static void dummy_func(void) { } +static void dummy_func1(void) { } +static void dummy_func2(void) { } +static void dummy_func3(void) { } + +static void (*dummy_funcs[])(void) = { + dummy_func, + dummy_func1, + dummy_func2, + dummy_func3, +}; + +static int trapped; + +static void check_trapped(void) +{ + /* + * If we haven't trapped, wake up the parent + * so that it notices the failure. + */ + if (!trapped) + kill(getpid(), SIGUSR1); + trapped = 0; + + nr_tests++; +} + +static void write_var(int len) +{ + char *pcval; short *psval; int *pival; long long *plval; + int i; + + for (i = 0; i < 4; i++) { + switch (len) { + case 1: + pcval = (char *)&dummy_var[i]; + *pcval = 0xff; + break; + case 2: + psval = (short *)&dummy_var[i]; + *psval = 0xffff; + break; + case 4: + pival = (int *)&dummy_var[i]; + *pival = 0xffffffff; + break; + case 8: + plval = (long long *)&dummy_var[i]; + *plval = 0xffffffffffffffffLL; + break; + } + check_trapped(); + } +} + +static void read_var(int len) +{ + char cval; short sval; int ival; long long lval; + int i; + + for (i = 0; i < 4; i++) { + switch (len) { + case 1: + cval = *(char *)&dummy_var[i]; + break; + case 2: + sval = *(short *)&dummy_var[i]; + break; + case 4: + ival = *(int *)&dummy_var[i]; + break; + case 8: + lval = *(long long *)&dummy_var[i]; + break; + } + check_trapped(); + } +} + +/* + * Do the r/w/x accesses to trigger the breakpoints. And run + * the usual traps. 
+ */ +static void trigger_tests(void) +{ + int len, local, global, i; + char val; + int ret; + + ret = ptrace(PTRACE_TRACEME, 0, NULL, 0); + if (ret) { + perror("Can't be traced?\n"); + return; + } + + /* Wake up father so that it sets up the first test */ + kill(getpid(), SIGUSR1); + + /* Test instruction breakpoints */ + for (local = 0; local < 2; local++) { + for (global = 0; global < 2; global++) { + if (!local && !global) + continue; + + for (i = 0; i < 4; i++) { + dummy_funcs[i](); + check_trapped(); + } + } + } + + /* Test write watchpoints */ + for (len = 1; len <= sizeof(long); len <<= 1) { + for (local = 0; local < 2; local++) { + for (global = 0; global < 2; global++) { + if (!local && !global) + continue; + write_var(len); + } + } + } + + /* Test read/write watchpoints (on read accesses) */ + for (len = 1; len <= sizeof(long); len <<= 1) { + for (local = 0; local < 2; local++) { + for (global = 0; global < 2; global++) { + if (!local && !global) + continue; + read_var(len); + } + } + } + + /* Icebp trap */ + asm(".byte 0xf1\n"); + check_trapped(); + + /* Int 3 trap */ + asm("int $3\n"); + check_trapped(); + + kill(getpid(), SIGUSR1); +} + +static void check_success(const char *msg) +{ + const char *msg2; + int child_nr_tests; + int status; + + /* Wait for the child to SIGTRAP */ + wait(&status); + + msg2 = "Failed"; + + if (WSTOPSIG(status) == SIGTRAP) { + child_nr_tests = ptrace(PTRACE_PEEKDATA, child_pid, + &nr_tests, 0); + if (child_nr_tests == nr_tests) + msg2 = "Ok"; + if (ptrace(PTRACE_POKEDATA, child_pid, &trapped, 1)) { + perror("Can't poke\n"); + exit(-1); + } + } + + nr_tests++; + + printf("%s [%s]\n", msg, msg2); +} + +static void launch_instruction_breakpoints(char *buf, int local, int global) +{ + int i; + + for (i = 0; i < 4; i++) { + set_breakpoint_addr(dummy_funcs[i], i); + toggle_breakpoint(i, BP_X, 1, local, global, 1); + ptrace(PTRACE_CONT, child_pid, NULL, 0); + sprintf(buf, "Test breakpoint %d with local: %d global: %d", + i, local, global); + check_success(buf); + toggle_breakpoint(i, BP_X, 1, local, global, 0); + } +} + +static void launch_watchpoints(char *buf, int mode, int len, + int local, int global) +{ + const char *mode_str; + int i; + + if (mode == BP_W) + mode_str = "write"; + else + mode_str = "read"; + + for (i = 0; i < 4; i++) { + set_breakpoint_addr(&dummy_var[i], i); + toggle_breakpoint(i, mode, len, local, global, 1); + ptrace(PTRACE_CONT, child_pid, NULL, 0); + sprintf(buf, "Test %s watchpoint %d with len: %d local: " + "%d global: %d", mode_str, i, len, local, global); + check_success(buf); + toggle_breakpoint(i, mode, len, local, global, 0); + } +} + +/* Set the breakpoints and check the child successfully trigger them */ +static void launch_tests(void) +{ + char buf[1024]; + int len, local, global, i; + + /* Instruction breakpoints */ + for (local = 0; local < 2; local++) { + for (global = 0; global < 2; global++) { + if (!local && !global) + continue; + launch_instruction_breakpoints(buf, local, global); + } + } + + /* Write watchpoint */ + for (len = 1; len <= sizeof(long); len <<= 1) { + for (local = 0; local < 2; local++) { + for (global = 0; global < 2; global++) { + if (!local && !global) + continue; + launch_watchpoints(buf, BP_W, len, + local, global); + } + } + } + + /* Read-Write watchpoint */ + for (len = 1; len <= sizeof(long); len <<= 1) { + for (local = 0; local < 2; local++) { + for (global = 0; global < 2; global++) { + if (!local && !global) + continue; + launch_watchpoints(buf, BP_RW, len, + local, global); + } + } + } 
+ + /* Icebp traps */ + ptrace(PTRACE_CONT, child_pid, NULL, 0); + check_success("Test icebp"); + + /* Int 3 traps */ + ptrace(PTRACE_CONT, child_pid, NULL, 0); + check_success("Test int 3 trap"); + + ptrace(PTRACE_CONT, child_pid, NULL, 0); +} + +int main(int argc, char **argv) +{ + pid_t pid; + int ret; + + pid = fork(); + if (!pid) { + trigger_tests(); + return 0; + } + + child_pid = pid; + + wait(NULL); + + launch_tests(); + + wait(NULL); + + return 0; +} diff --git a/tools/testing/selftests/run_tests b/tools/testing/selftests/run_tests index 701960d9e531..320718a4e6bf 100644 --- a/tools/testing/selftests/run_tests +++ b/tools/testing/selftests/run_tests @@ -1,6 +1,6 @@ #!/bin/bash -TARGETS= +TARGETS=breakpoints for TARGET in $TARGETS do From 067bce1a06c1f84146f873a598cd7c3a28eee1d5 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 12 Jan 2012 17:20:49 -0800 Subject: [PATCH 91/96] c/r: introduce CHECKPOINT_RESTORE symbol For checkpoint/restore we need auxiliary features compiled into the kernel, such as additional prctl codes, /proc/$pid/map_files and so on, but at the same time these features are not mandatory for a regular kernel, so the CHECKPOINT_RESTORE config symbol should provide a way to disable them all at once if one wishes to get rid of the additional functionality. Signed-off-by: Cyrill Gorcunov Cc: Tejun Heo Cc: Andrew Vagin Cc: Serge Hallyn Cc: Vasiliy Kulikov Reviewed-by: Kees Cook Cc: KAMEZAWA Hiroyuki Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 018d206c21f7..6ac2236244c3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -783,6 +783,17 @@ config DEBUG_BLK_CGROUP endif # CGROUPS +config CHECKPOINT_RESTORE + bool "Checkpoint/restore support" if EXPERT + default n + help + Enables additional kernel features in a sake of checkpoint/restore. + In particular it adds auxiliary prctl codes to setup process text, + data and heap segment sizes, and a few additional /proc filesystem + entries. + + If unsure, say N here. + menuconfig NAMESPACES bool "Namespaces support" if EXPERT default !EXPERT From b3f7f573a20081910e34e99cbc91831f4f02f1ff Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 12 Jan 2012 17:20:53 -0800 Subject: [PATCH 92/96] c/r: procfs: add start_data, end_data, start_brk members to /proc/$pid/stat v4 The mm->start_code/end_code, mm->start_data/end_data and mm->start_brk values are involved in the calculation of the program text/data segment sizes (which can be seen in /proc/$pid/statm) and in the final address allowed for the brk() call. For restore we need to know all these values. While mm->start_code/end_code are already present in /proc/$pid/stat, the remaining members are not, so this patch brings them in. The restore procedure for these members is addressed in another patch using prctl(). Signed-off-by: Cyrill Gorcunov Acked-by: Serge Hallyn Reviewed-by: Kees Cook Reviewed-by: KAMEZAWA Hiroyuki Cc: Alexey Dobriyan Cc: Tejun Heo Cc: Andrew Vagin Cc: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: "Eric W.
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 3 +++ fs/proc/array.c | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 12fee132fbe2..a76a26a1db8a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -307,6 +307,9 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7) blkio_ticks time spent waiting for block IO gtime guest time of the task in jiffies cgtime guest time of the task children in jiffies + start_data address above which program data+bss is placed + end_data address below which program data+bss is placed + start_brk address above which program heap can be expanded with brk() .............................................................................. The /proc/PID/maps file containing the currently mapped memory regions and diff --git a/fs/proc/array.c b/fs/proc/array.c index 8c344f037bd0..9252ee3b71e3 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n", pid_nr_ns(pid, ns), tcomm, state, @@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, task->policy, (unsigned long long)delayacct_blkio_ticks(task), cputime_to_clock_t(gtime), - cputime_to_clock_t(cgtime)); + cputime_to_clock_t(cgtime), + (mm && permitted) ? mm->start_data : 0, + (mm && permitted) ? mm->end_data : 0, + (mm && permitted) ? mm->start_brk : 0); if (mm) mmput(mm); return 0; From 028ee4be34a09a6d48bdf30ab991ae933a7bc036 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 12 Jan 2012 17:20:55 -0800 Subject: [PATCH 93/96] c/r: prctl: add PR_SET_MM codes to set up mm_struct entries When we restore a task we need to set up text, data and data heap sizes from userspace to the values a task had at checkpoint time. This patch adds auxilary prctl codes for that. While most of them have a statistical nature (their values are involved into calculation of /proc//statm output) the start_brk and brk values are used to compute an allowed size of program data segment expansion. Which means an arbitrary changes of this values might be dangerous operation. So to restrict access the following requirements applied to prctl calls: - The process has to have CAP_SYS_ADMIN capability granted. - For all opcodes except start_brk/brk members an appropriate VMA area must exist and should fit certain VMA flags, such as: - code segment must be executable but not writable; - data segment must not be executable. start_brk/brk values must not intersect with data segment and must not exceed RLIMIT_DATA resource limit. Still the main guard is CAP_SYS_ADMIN capability check. Note the kernel should be compiled with CONFIG_CHECKPOINT_RESTORE support otherwise these prctl calls will return -EINVAL. 
[akpm@linux-foundation.org: cache current->mm in a local, saving 200 bytes text] Signed-off-by: Cyrill Gorcunov Reviewed-by: Kees Cook Cc: Tejun Heo Cc: Andrew Vagin Cc: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/prctl.h | 12 +++++ kernel/sys.c | 121 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) diff --git a/include/linux/prctl.h b/include/linux/prctl.h index a3baeb2c2161..7ddc7f1b480f 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -102,4 +102,16 @@ #define PR_MCE_KILL_GET 34 +/* + * Tune up process memory map specifics. + */ +#define PR_SET_MM 35 +# define PR_SET_MM_START_CODE 1 +# define PR_SET_MM_END_CODE 2 +# define PR_SET_MM_START_DATA 3 +# define PR_SET_MM_END_DATA 4 +# define PR_SET_MM_START_STACK 5 +# define PR_SET_MM_START_BRK 6 +# define PR_SET_MM_BRK 7 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index ddf8155bf3f8..40701538fbd1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } +#ifdef CONFIG_CHECKPOINT_RESTORE +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + unsigned long rlim = rlimit(RLIMIT_DATA); + unsigned long vm_req_flags; + unsigned long vm_bad_flags; + struct vm_area_struct *vma; + int error = 0; + struct mm_struct *mm = current->mm; + + if (arg4 | arg5) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (addr >= TASK_SIZE) + return -EINVAL; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, addr); + + if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { + /* It must be existing VMA */ + if (!vma || vma->vm_start > addr) + goto out; + } + + error = -EINVAL; + switch (opt) { + case PR_SET_MM_START_CODE: + case PR_SET_MM_END_CODE: + vm_req_flags = VM_READ | VM_EXEC; + vm_bad_flags = VM_WRITE | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_CODE) + mm->start_code = addr; + else + mm->end_code = addr; + break; + + case PR_SET_MM_START_DATA: + case PR_SET_MM_END_DATA: + vm_req_flags = VM_READ | VM_WRITE; + vm_bad_flags = VM_EXEC | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_DATA) + mm->start_data = addr; + else + mm->end_data = addr; + break; + + case PR_SET_MM_START_STACK: + +#ifdef CONFIG_STACK_GROWSUP + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; +#else + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; +#endif + if ((vma->vm_flags & vm_req_flags) != vm_req_flags) + goto out; + + mm->start_stack = addr; + break; + + case PR_SET_MM_START_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (mm->brk - addr) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->start_brk = addr; + break; + + case PR_SET_MM_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (addr - mm->start_brk) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->brk = addr; + break; + + default: + error = -EINVAL; + goto out; + } + + error = 0; + +out: + up_read(&mm->mmap_sem); + + return error; +} +#else /* CONFIG_CHECKPOINT_RESTORE */ +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + return -EINVAL; +} +#endif + 
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else error = PR_MCE_KILL_DEFAULT; break; + case PR_SET_MM: + error = prctl_set_mm(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; From fdb5950754eb3dedb9fea7c8828d3e51d9dbc3f7 Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Thu, 12 Jan 2012 17:20:58 -0800 Subject: [PATCH 94/96] ramoops: fix use of rounddown_pow_of_two() The return value of rounddown_pow_of_two wasn't evaluated, so the operation was a no-op. Signed-off-by: Marco Stornelli Reported-by: Andrew Morton Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/ramoops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/ramoops.c b/drivers/char/ramoops.c index feda90cdac9f..9c152ab2bfa7 100644 --- a/drivers/char/ramoops.c +++ b/drivers/char/ramoops.c @@ -125,8 +125,8 @@ static int __init ramoops_probe(struct platform_device *pdev) goto fail3; } - rounddown_pow_of_two(pdata->mem_size); - rounddown_pow_of_two(pdata->record_size); + pdata->mem_size = rounddown_pow_of_two(pdata->mem_size); + pdata->record_size = rounddown_pow_of_two(pdata->record_size); /* Check for the minimum memory size */ if (pdata->mem_size < MIN_MEM_SIZE && From c755201eb5c1e09f3477d0e83ae1e3aadac0e8d1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 12 Jan 2012 17:20:59 -0800 Subject: [PATCH 95/96] ramoops: update parameters only after successful init If a platform device exists on the system, but ramoops fails to attach to it, the module parameters are overridden before ramoops can fall back and try to use passed module parameters. Move update to end of init routine. Signed-off-by: Kees Cook Cc: Marco Stornelli Cc: Sergiu Iordache Cc: Seiji Aguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/ramoops.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/char/ramoops.c b/drivers/char/ramoops.c index 9c152ab2bfa7..9fec3232b736 100644 --- a/drivers/char/ramoops.c +++ b/drivers/char/ramoops.c @@ -147,14 +147,6 @@ static int __init ramoops_probe(struct platform_device *pdev) cxt->phys_addr = pdata->mem_address; cxt->record_size = pdata->record_size; cxt->dump_oops = pdata->dump_oops; - /* - * Update the module parameter variables as well so they are visible - * through /sys/module/ramoops/parameters/ - */ - mem_size = pdata->mem_size; - mem_address = pdata->mem_address; - record_size = pdata->record_size; - dump_oops = pdata->dump_oops; if (!request_mem_region(cxt->phys_addr, cxt->size, "ramoops")) { pr_err("request mem region failed\n"); @@ -175,6 +167,15 @@ static int __init ramoops_probe(struct platform_device *pdev) goto fail1; } + /* + * Update the module parameter variables as well so they are visible + * through /sys/module/ramoops/parameters/ + */ + mem_size = pdata->mem_size; + mem_address = pdata->mem_address; + record_size = pdata->record_size; + dump_oops = pdata->dump_oops; + return 0; fail1: From 35f1526845a9d804206883e19bd257d3dcef758f Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Thu, 12 Jan 2012 17:21:01 -0800 Subject: [PATCH 96/96] unlzo: fix input buffer free unlzo modifies the pointer to in_buf, so we have to free the original buffer, not the modified pointer. 
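The same pitfall applies to any parser that advances its working pointer over a buffer it allocated. A minimal stand-alone illustration of the pattern, not taken from the unlzo code itself:

#include <stdlib.h>

static int parse_example(void)
{
	unsigned char *in_buf = malloc(4096);
	unsigned char *in_buf_save = in_buf;	/* remember the allocation */

	if (!in_buf)
		return -1;

	/* Parsing advances the working pointer past headers and blocks... */
	in_buf += 16;

	/*
	 * ...so the buffer must be released through the saved pointer;
	 * calling free(in_buf) here is the kind of bug fixed below.
	 */
	free(in_buf_save);
	return 0;
}

int main(void)
{
	return parse_example();
}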
Signed-off-by: Sascha Hauer Cc: Lasse Collin Cc: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/decompress_unlzo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/decompress_unlzo.c b/lib/decompress_unlzo.c index 5a7a2adf4c4c..4531294fa62f 100644 --- a/lib/decompress_unlzo.c +++ b/lib/decompress_unlzo.c @@ -279,7 +279,7 @@ STATIC inline int INIT unlzo(u8 *input, int in_len, ret = 0; exit_2: if (!input) - free(in_buf); + free(in_buf_save); exit_1: if (!output) free(out_buf);
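As a closing note on the /proc/$pid/stat change in patch 92 above: the three new values are simply appended to the end of the stat line, so user space can pick them up as the last three fields. The sketch below is illustrative only, not part of the series, and assumes a kernel with that patch applied.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char line[4096], *tok, *fields[64];
	unsigned long start_data, end_data, start_brk;
	int n = 0;
	FILE *f = fopen("/proc/self/stat", "r");

	if (!f || !fgets(line, sizeof(line), f))
		return 1;
	fclose(f);

	/* Split the line; the new fields are the last three tokens. */
	for (tok = strtok(line, " \n"); tok && n < 64; tok = strtok(NULL, " \n"))
		fields[n++] = tok;
	if (n < 3)
		return 1;

	start_data = strtoul(fields[n - 3], NULL, 10);
	end_data   = strtoul(fields[n - 2], NULL, 10);
	start_brk  = strtoul(fields[n - 1], NULL, 10);

	printf("start_data=%#lx end_data=%#lx start_brk=%#lx\n",
	       start_data, end_data, start_brk);
	return 0;
}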