From 6e28b761c5020c4bc943c7e38da86bea8af77d5c Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Mon, 8 Oct 2012 16:28:05 -0700
Subject: [PATCH 001/160] MAINTAINERS: add Konrad as the SWIOTLB maintainer

Now that I've an IA64 box on top of the other boxes (IBM with Calgary-X,
Intel VT-d, AMD Vi, and AMD GART - that can use SWIOTLB as fallback) I can
reliably do regression testing.

Signed-off-by: Konrad Rzeszutek Wilk
Cc: FUJITA Tomonori
Cc: Ingo Molnar
Acked-by: "H. Peter Anvin"
Cc: Thomas Gleixner
Acked-by: Tony Luck
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 MAINTAINERS | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index b4e5ecf8d3d2..3f1131b7a09c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7039,6 +7039,14 @@ S:	Maintained
 F:	Documentation/svga.txt
 F:	arch/x86/boot/video*
 
+SWIOTLB SUBSYSTEM
+M:	Konrad Rzeszutek Wilk
+L:	linux-kernel@vger.kernel.org
+S:	Supported
+F:	lib/swiotlb.c
+F:	arch/*/kernel/pci-swiotlb.c
+F:	include/linux/swiotlb.h
+
 SYSV FILESYSTEM
 M:	Christoph Hellwig
 S:	Maintained

From af1839eb4bd4fe079a125eb199205fceb6ae19e6 Mon Sep 17 00:00:00 2001
From: Catalin Marinas
Date: Mon, 8 Oct 2012 16:28:08 -0700
Subject: [PATCH 002/160] Kconfig: clean up the long arch list for the UID16 config option

Introduce HAVE_UID16 config option and select it in corresponding
architecture Kconfig files.  UID16 now only depends on HAVE_UID16.

Signed-off-by: Catalin Marinas
Acked-by: Geert Uytterhoeven
Cc: Russell King
Cc: Mike Frysinger
Cc: Mikael Starvik
Cc: Jesper Nilsson
Cc: David Howells
Cc: Yoshinori Sato
Cc: Martin Schwidefsky
Cc: Heiko Carstens
Cc: Paul Mundt
Cc: "David S. Miller"
Cc: Jeff Dike
Cc: Richard Weinberger
Cc: Thomas Gleixner
Cc: Ingo Molnar
Cc: "H. Peter Anvin"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/arm/Kconfig       | 1 +
 arch/arm64/Kconfig     | 1 +
 arch/blackfin/Kconfig  | 1 +
 arch/cris/Kconfig      | 1 +
 arch/frv/Kconfig       | 1 +
 arch/h8300/Kconfig     | 1 +
 arch/m68k/Kconfig      | 1 +
 arch/s390/Kconfig      | 1 +
 arch/sh/Kconfig        | 1 +
 arch/sparc/Kconfig     | 2 ++
 arch/um/Kconfig.common | 1 +
 arch/x86/Kconfig       | 2 ++
 init/Kconfig           | 6 ++++--
 13 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 6d2f7f5c0036..5f5439672932 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -39,6 +39,7 @@ config ARM
 	select HARDIRQS_SW_RESEND
 	select GENERIC_IRQ_PROBE
 	select GENERIC_IRQ_SHOW
+	select HAVE_UID16
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select HARDIRQS_SW_RESEND
 	select CPU_PM if (SUSPEND || CPU_IDLE)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 767ba5685454..e61acae0d891 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -193,6 +193,7 @@ config COMPAT
 	bool "Kernel support for 32-bit EL0"
 	depends on !ARM64_64K_PAGES
 	select COMPAT_BINFMT_ELF
+	select HAVE_UID16
 	help
 	  This option enables support for a 32-bit EL0 running under a 64-bit
 	  kernel at EL1.
AArch32-specific components such as system calls, diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index c7092e6057c5..2169889c73ec 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -33,6 +33,7 @@ config BLACKFIN select HAVE_PERF_EVENTS select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_WANT_OPTIONAL_GPIOLIB + select HAVE_UID16 select ARCH_WANT_IPC_PARSE_VERSION select HAVE_GENERIC_HARDIRQS select GENERIC_ATOMIC64 diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 72bd5ae50a89..a118163b04ee 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -42,6 +42,7 @@ config CRIS select HAVE_IDE select GENERIC_ATOMIC64 select HAVE_GENERIC_HARDIRQS + select HAVE_UID16 select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_IRQ_SHOW select GENERIC_IOMAP diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index 971c0a19facb..cc5709d18350 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -5,6 +5,7 @@ config FRV select HAVE_ARCH_TRACEHOOK select HAVE_IRQ_WORK select HAVE_PERF_EVENTS + select HAVE_UID16 select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 5e8a0d9a09ce..90462eb23d02 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -3,6 +3,7 @@ config H8300 default y select HAVE_IDE select HAVE_GENERIC_HARDIRQS + select HAVE_UID16 select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index b22df9410dce..3e2b2f66db60 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -6,6 +6,7 @@ config M68K select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW select GENERIC_ATOMIC64 + select HAVE_UID16 select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS select GENERIC_CPU_DEVICES select GENERIC_STRNCPY_FROM_USER if MMU diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c8af429991d9..baba37cfcf84 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -126,6 +126,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select HAVE_UID16 if 32BIT select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 36f5141e8041..f0c85e424777 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -21,6 +21,7 @@ config SUPERH select HAVE_KERNEL_LZMA select HAVE_KERNEL_XZ select HAVE_KERNEL_LZO + select HAVE_UID16 select ARCH_WANT_IPC_PARSE_VERSION select HAVE_SYSCALL_TRACEPOINTS select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 67f1f6f5f4e1..e66481015d3b 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -42,6 +42,7 @@ config SPARC32 def_bool !64BIT select GENERIC_ATOMIC64 select CLZ_TAB + select HAVE_UID16 config SPARC64 def_bool 64BIT @@ -571,6 +572,7 @@ config COMPAT depends on SPARC64 default y select COMPAT_BINFMT_ELF + select HAVE_UID16 select ARCH_WANT_OLD_COMPAT_IPC config SYSVIPC_COMPAT diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common index cb837c223922..648121b037d5 100644 --- a/arch/um/Kconfig.common +++ b/arch/um/Kconfig.common @@ -7,6 +7,7 @@ config UML bool default y select HAVE_GENERIC_HARDIRQS + select HAVE_UID16 select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES select GENERIC_IO diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b72777ff32a9..fd5d7c2c2daa 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -10,6 +10,7 @@ config X86_32 def_bool y depends on !64BIT 
select CLKSRC_I8253 + select HAVE_UID16 config X86_64 def_bool y @@ -2168,6 +2169,7 @@ config IA32_EMULATION bool "IA32 Emulation" depends on X86_64 select COMPAT_BINFMT_ELF + select HAVE_UID16 ---help--- Include code to run legacy 32-bit programs under a 64-bit kernel. You should likely turn this on, unless you're diff --git a/init/Kconfig b/init/Kconfig index ed6334dd5e71..38bab420bd9b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1125,10 +1125,12 @@ menuconfig EXPERT environments which can tolerate a "non-standard" kernel. Only use this if you really know what you are doing. +config HAVE_UID16 + bool + config UID16 bool "Enable 16-bit UID system calls" if EXPERT - depends on ARM || BLACKFIN || CRIS || FRV || H8300 || X86_32 || M68K || (S390 && !64BIT) || SUPERH || SPARC32 || (SPARC64 && COMPAT) || UML || (X86_64 && IA32_EMULATION) \ - || AARCH32_EMULATION + depends on HAVE_UID16 default y help This enables the legacy 16-bit UID syscall wrappers. From b69ec42b1b194cc88f04b3fbcda8d3f93182d6c3 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:28:11 -0700 Subject: [PATCH 003/160] Kconfig: clean up the long arch list for the DEBUG_KMEMLEAK config option Introduce HAVE_DEBUG_KMEMLEAK config option and select it in corresponding architecture Kconfig files. DEBUG_KMEMLEAK now only depends on HAVE_DEBUG_KMEMLEAK. Signed-off-by: Catalin Marinas Cc: Russell King Cc: Michal Simek Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 1 + arch/arm64/Kconfig | 1 + arch/microblaze/Kconfig | 1 + arch/mips/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sh/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + arch/x86/Kconfig | 1 + lib/Kconfig.debug | 8 ++++---- 11 files changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 5f5439672932..2867a7742306 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -25,6 +25,7 @@ config ARM select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL) select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_GENERIC_DMA_COHERENT + select HAVE_DEBUG_KMEMLEAK select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZO select HAVE_KERNEL_LZMA diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e61acae0d891..5dc9273781d6 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -10,6 +10,7 @@ config ARM64 select GENERIC_TIME_VSYSCALL select HARDIRQS_SW_RESEND select HAVE_ARCH_TRACEHOOK + select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG select HAVE_DMA_ATTRS select HAVE_GENERIC_DMA_COHERENT diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 6133bed2b855..53fd94ab60f0 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -16,6 +16,7 @@ config MICROBLAZE select OF select OF_EARLY_FLATTREE select ARCH_WANT_IPC_PARSE_VERSION + select HAVE_DEBUG_KMEMLEAK select IRQ_DOMAIN select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_PROBE diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index faf65286574e..335115e5bdd9 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -17,6 +17,7 @@ config MIPS select HAVE_FUNCTION_GRAPH_TRACER select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_DEBUG_KMEMLEAK select ARCH_BINFMT_ELF_RANDOMIZE_PIE select RTC_LIB if !MACH_LOONGSON select GENERIC_ATOMIC64 if !64BIT diff --git 
a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 4ce0be32d153..6a798a70a6d1 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -113,6 +113,7 @@ config PPC select HAVE_DMA_API_DEBUG select USE_GENERIC_SMP_HELPERS if SMP select HAVE_OPROFILE + select HAVE_DEBUG_KMEMLEAK select HAVE_SYSCALL_WRAPPERS if PPC64 select GENERIC_ATOMIC64 if PPC32 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index baba37cfcf84..8c6d7986f6d2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -80,6 +80,7 @@ config S390 select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select ARCH_HAVE_NMI_SAFE_CMPXCHG + select HAVE_DEBUG_KMEMLEAK select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index f0c85e424777..cfbf3e3c982b 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -16,6 +16,7 @@ config SUPERH select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) select PERF_USE_VMALLOC + select HAVE_DEBUG_KMEMLEAK select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index e66481015d3b..274d6cf0ada2 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -60,6 +60,7 @@ config SPARC64 select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_SYSCALL_TRACEPOINTS + select HAVE_DEBUG_KMEMLEAK select RTC_DRV_CMOS select RTC_DRV_BQ4802 select RTC_DRV_SUN4V diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index c9a3c1fe7297..9a0d77d3ba14 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -9,6 +9,7 @@ config TILE select GENERIC_FIND_FIRST_BIT select USE_GENERIC_SMP_HELPERS select CC_OPTIMIZE_FOR_SIZE + select HAVE_DEBUG_KMEMLEAK select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fd5d7c2c2daa..3fea1848d955 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -66,6 +66,7 @@ config X86 select HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_DEBUG_KMEMLEAK select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_CMPXCHG_LOCAL if !M386 diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7fba3a98967f..736db3990506 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -450,12 +450,12 @@ config SLUB_STATS out which slabs are relevant to a particular load. Try running: slabinfo -DA +config HAVE_DEBUG_KMEMLEAK + bool + config DEBUG_KMEMLEAK bool "Kernel memory leak detector" - depends on DEBUG_KERNEL && EXPERIMENTAL && \ - (X86 || ARM || PPC || MIPS || S390 || SPARC64 || SUPERH || \ - MICROBLAZE || TILE || ARM64) - + depends on DEBUG_KERNEL && EXPERIMENTAL && HAVE_DEBUG_KMEMLEAK select DEBUG_FS select STACKTRACE if STACKTRACE_SUPPORT select KALLSYMS From 9b2a60c484715e2d2f07d8192fd9f18435cbc77c Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:28:13 -0700 Subject: [PATCH 004/160] Kconfig: clean up the long arch list for the DEBUG_BUGVERBOSE config option Introduce HAVE_DEBUG_BUGVERBOSE config option and select it in corresponding architecture Kconfig files. Architectures that already select GENERIC_BUG don't need to select HAVE_DEBUG_BUGVERBOSE. Signed-off-by: Catalin Marinas Acked-by: Geert Uytterhoeven Cc: David Howells Cc: Hirokazu Takata Cc: Paul Mundt Cc: "David S. 
Miller" Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 1 + arch/frv/Kconfig | 1 + arch/m32r/Kconfig | 1 + arch/m68k/Kconfig | 1 + arch/sh/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + lib/Kconfig.debug | 8 ++++---- 8 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5dc9273781d6..a30856058742 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -10,6 +10,7 @@ config ARM64 select GENERIC_TIME_VSYSCALL select HARDIRQS_SW_RESEND select HAVE_ARCH_TRACEHOOK + select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG select HAVE_DMA_ATTRS diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index cc5709d18350..9d262645f667 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -8,6 +8,7 @@ config FRV select HAVE_UID16 select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW + select HAVE_DEBUG_BUGVERBOSE select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CPU_DEVICES select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 49498bbb9616..e875fc3ce9cb 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -8,6 +8,7 @@ config M32R select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA select ARCH_WANT_IPC_PARSE_VERSION + select HAVE_DEBUG_BUGVERBOSE select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 3e2b2f66db60..dae1e7e16a37 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -3,6 +3,7 @@ config M68K default y select HAVE_IDE select HAVE_AOUT if MMU + select HAVE_DEBUG_BUGVERBOSE select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW select GENERIC_ATOMIC64 diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index cfbf3e3c982b..3b3e27a3ff2c 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -13,6 +13,7 @@ config SUPERH select HAVE_DMA_ATTRS select HAVE_IRQ_WORK select HAVE_PERF_EVENTS + select HAVE_DEBUG_BUGVERBOSE select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) select PERF_USE_VMALLOC diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 274d6cf0ada2..700a01adec3a 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -32,6 +32,7 @@ config SPARC select GENERIC_PCI_IOMAP select HAVE_NMI_WATCHDOG if SPARC64 select HAVE_BPF_JIT + select HAVE_DEBUG_BUGVERBOSE select GENERIC_SMP_IDLE_THREAD select GENERIC_CMOS_UPDATE select GENERIC_CLOCKEVENTS diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 9a0d77d3ba14..df69d4296b4b 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -14,6 +14,7 @@ config TILE select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW + select HAVE_DEBUG_BUGVERBOSE select HAVE_SYSCALL_WRAPPERS if TILEGX select SYS_HYPERVISOR select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 736db3990506..b7281e4d1473 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -751,12 +751,12 @@ config DEBUG_HIGHMEM This options enables addition error checking for high memory systems. Disable for production systems. 
+config HAVE_DEBUG_BUGVERBOSE + bool + config DEBUG_BUGVERBOSE bool "Verbose BUG() reporting (adds 70K)" if DEBUG_KERNEL && EXPERT - depends on BUG - depends on ARM || AVR32 || M32R || M68K || SPARC32 || SPARC64 || \ - FRV || SUPERH || GENERIC_BUG || BLACKFIN || MN10300 || \ - TILE || ARM64 + depends on BUG && (GENERIC_BUG || HAVE_DEBUG_BUGVERBOSE) default y help Say Y here to make BUG() panics output the file name and line number From 7ac57a89de958fbb5271dc504d0c25e34dbeec32 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:28:16 -0700 Subject: [PATCH 005/160] Kconfig: clean up the "#if defined(arch)" list for exception-trace sysctl entry Introduce SYSCTL_EXCEPTION_TRACE config option and selec it in the architectures requiring support for the "exception-trace" debug_table entry in kernel/sysctl.c. Signed-off-by: Catalin Marinas Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + arch/x86/Kconfig | 1 + init/Kconfig | 5 +++++ kernel/sysctl.c | 3 +-- 8 files changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a30856058742..7ff68c946073 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -28,6 +28,7 @@ config ARM64 select PERF_USE_VMALLOC select RTC_LIB select SPARSE_IRQ + select SYSCTL_EXCEPTION_TRACE help ARM 64-bit (AArch64) Linux support. diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 6a798a70a6d1..df7edb887a04 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -99,6 +99,7 @@ config PPC select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER + select SYSCTL_EXCEPTION_TRACE select ARCH_WANT_OPTIONAL_GPIOLIB select HAVE_IDE select HAVE_IOREMAP_PROT diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8c6d7986f6d2..ceff7aef2477 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -68,6 +68,7 @@ config S390 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_C_RECORDMCOUNT select HAVE_SYSCALL_TRACEPOINTS + select SYSCTL_EXCEPTION_TRACE select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_GRAPH_TRACER select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 700a01adec3a..e184075877d7 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -18,6 +18,7 @@ config SPARC select HAVE_OPROFILE select HAVE_ARCH_KGDB if !SMP || SPARC64 select HAVE_ARCH_TRACEHOOK + select SYSCTL_EXCEPTION_TRACE select ARCH_WANT_OPTIONAL_GPIOLIB select RTC_CLASS select RTC_DRV_M48T59 diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index df69d4296b4b..dc46490adca0 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -7,6 +7,7 @@ config TILE select HAVE_DMA_API_DEBUG select HAVE_KVM if !TILEGX select GENERIC_FIND_FIRST_BIT + select SYSCTL_EXCEPTION_TRACE select USE_GENERIC_SMP_HELPERS select CC_OPTIMIZE_FOR_SIZE select HAVE_DEBUG_KMEMLEAK diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3fea1848d955..6119d6c7002e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -47,6 +47,7 @@ config X86 select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_SYSCALL_TRACEPOINTS + select SYSCTL_EXCEPTION_TRACE select HAVE_KVM select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK diff --git 
a/init/Kconfig b/init/Kconfig index 38bab420bd9b..4c93533da42c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1152,6 +1152,11 @@ config SYSCTL_SYSCALL If unsure say N here. +config SYSCTL_EXCEPTION_TRACE + bool + help + Enable support for /proc/sys/debug/exception-trace. + config KALLSYMS bool "Load all symbols for debugging/ksymoops" if EXPERT default y diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c2a2f8084bad..26f65eaa01f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1549,8 +1549,7 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ - defined(CONFIG_S390) || defined(CONFIG_TILE) || defined(CONFIG_ARM64) +#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { .procname = "exception-trace", .data = &show_unhandled_signals, From 075663d19885eb3738fd2d7dbdb8947e12563b68 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Mon, 8 Oct 2012 16:28:20 -0700 Subject: [PATCH 006/160] CPU hotplug, debug: detect imbalance between get_online_cpus() and put_online_cpus() The synchronization between CPU hotplug readers and writers is achieved by means of refcounting, safeguarded by the cpu_hotplug.lock. get_online_cpus() increments the refcount, whereas put_online_cpus() decrements it. If we ever hit an imbalance between the two, we end up compromising the guarantees of the hotplug synchronization i.e, for example, an extra call to put_online_cpus() can end up allowing a hotplug reader to execute concurrently with a hotplug writer. So, add a WARN_ON() in put_online_cpus() to detect such cases where the refcount can go negative, and also attempt to fix it up, so that we can continue to run. Signed-off-by: Srivatsa S. Bhat Reviewed-by: Yasuaki Ishimatsu Cc: Jiri Kosina Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index f560598807c1..42bd331ee0ab 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -80,6 +80,10 @@ void put_online_cpus(void) if (cpu_hotplug.active_writer == current) return; mutex_lock(&cpu_hotplug.lock); + + if (WARN_ON(!cpu_hotplug.refcount)) + cpu_hotplug.refcount++; /* try to fix things up */ + if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); mutex_unlock(&cpu_hotplug.lock); From c654345924f7cce87bb221b89db91cba890421ba Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 8 Oct 2012 16:28:21 -0700 Subject: [PATCH 007/160] mm: remove __GFP_NO_KSWAPD When transparent huge pages were introduced, memory compaction and swap storms were an issue, and the kernel had to be careful to not make THP allocations cause pageout or compaction. Now that we have working compaction deferral, kswapd is smart enough to invoke compaction and the quadratic behaviour around isolate_free_pages has been fixed, it should be safe to remove __GFP_NO_KSWAPD. 
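
(Editorial aside, not part of the patch: a minimal sketch of what stands in
for the removed flag.  Once __GFP_NO_KSWAPD is gone, the allocator slow path
recognises a GFP_TRANSHUGE-style request purely by its flag combination --
movable, but not willing to retry hard -- as the page_alloc.c hunk below
shows.  The helper name is invented for illustration; the usual mm includes
are assumed.)

	/* Illustrative only: GFP_TRANSHUGE sets __GFP_MOVABLE (via
	 * GFP_HIGHUSER_MOVABLE) but not __GFP_REPEAT, so this combination
	 * identifies a THP-style allocation in the slow path. */
	static bool looks_like_thp_request(gfp_t gfp_mask)
	{
		return (gfp_mask & (__GFP_MOVABLE | __GFP_REPEAT)) == __GFP_MOVABLE;
	}
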
[minchan@kernel.org: Comment fix] [mgorman@suse.de: Avoid direct reclaim for deferred compaction] Cc: Andrea Arcangeli Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/mtd/mtdcore.c | 6 ++---- include/linux/gfp.h | 5 +---- include/trace/events/gfpflags.h | 1 - mm/page_alloc.c | 7 +++---- 4 files changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 575730744fdb..b9adff543f5f 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -1056,8 +1056,7 @@ EXPORT_SYMBOL_GPL(mtd_writev); * until the request succeeds or until the allocation size falls below * the system page size. This attempts to make sure it does not adversely * impact system performance, so when allocating more than one page, we - * ask the memory allocator to avoid re-trying, swapping, writing back - * or performing I/O. + * ask the memory allocator to avoid re-trying. * * Note, this function also makes sure that the allocated buffer is aligned to * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value. @@ -1071,8 +1070,7 @@ EXPORT_SYMBOL_GPL(mtd_writev); */ void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size) { - gfp_t flags = __GFP_NOWARN | __GFP_WAIT | - __GFP_NORETRY | __GFP_NO_KSWAPD; + gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY; size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE); void *kbuf; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4883f393f50a..f9bc873ce7d6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -35,7 +35,6 @@ struct vm_area_struct; #else #define ___GFP_NOTRACK 0 #endif -#define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u #define ___GFP_WRITE 0x1000000u @@ -90,7 +89,6 @@ struct vm_area_struct; #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ -#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ @@ -120,8 +118,7 @@ struct vm_area_struct; __GFP_MOVABLE) #define GFP_IOFS (__GFP_IO | __GFP_FS) #define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ - __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ - __GFP_NO_KSWAPD) + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index d6fd8e5b14b7..9391706e9254 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -36,7 +36,6 @@ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ - {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ ) : "GFP_NOWAIT" diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c13ea7538891..5e92698e5395 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2362,9 +2362,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're 
below the kswapd watermark and have kicked background @@ -2441,7 +2440,7 @@ rebalance: * system then fail the allocation instead of entering direct reclaim. */ if ((deferred_compaction || contended_compaction) && - (gfp_mask & __GFP_NO_KSWAPD)) + (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) goto nopage; /* Try direct reclaim and then allocating */ From b1a86e15dc0304366f50ba1720834bc419c801b1 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 8 Oct 2012 16:28:23 -0700 Subject: [PATCH 008/160] x86, pat: remove the dependency on 'vm_pgoff' in track/untrack pfn vma routines 'pfn' argument for track_pfn_vma_new() can be used for reserving the attribute for the pfn range. No need to depend on 'vm_pgoff' Similarly, untrack_pfn_vma() can depend on the 'pfn' argument if it is non-zero or can use follow_phys() to get the starting value of the pfn range. Also the non zero 'size' argument can be used instead of recomputing it from vma. This cleanup also prepares the ground for the track/untrack pfn vma routines to take over the ownership of setting PAT specific vm_flag in the 'vma'. [khlebnikov@openvz.org: Clear pfn to paddr conversion] Signed-off-by: Suresh Siddha Signed-off-by: Konstantin Khlebnikov Cc: Venkatesh Pallipadi Cc: H. Peter Anvin Cc: Nick Piggin Cc: Ingo Molnar Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Tetsuo Handa Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 3d68ef6d2266..de36c886cd38 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -704,21 +704,18 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long size) { + resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; unsigned long flags; - resource_size_t paddr; - unsigned long vma_size = vma->vm_end - vma->vm_start; - if (is_linear_pfn_mapping(vma)) { - /* reserve the whole chunk starting from vm_pgoff */ - paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - return reserve_pfn_range(paddr, vma_size, prot, 0); - } + /* reserve the whole chunk starting from paddr */ + if (is_linear_pfn_mapping(vma)) + return reserve_pfn_range(paddr, size, prot, 0); if (!pat_enabled) return 0; /* for vm_insert_pfn and friends, we set prot based on lookup */ - flags = lookup_memtype(pfn << PAGE_SHIFT); + flags = lookup_memtype(paddr); *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | flags); @@ -728,20 +725,28 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, /* * untrack_pfn_vma is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case size can be zero). + * can be for the entire vma (in which case pfn, size are zero). 
*/ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, unsigned long size) { resource_size_t paddr; - unsigned long vma_size = vma->vm_end - vma->vm_start; + unsigned long prot; - if (is_linear_pfn_mapping(vma)) { - /* free the whole chunk starting from vm_pgoff */ - paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - free_pfn_range(paddr, vma_size); + if (!is_linear_pfn_mapping(vma)) return; + + /* free the chunk starting from pfn or the whole chunk */ + paddr = (resource_size_t)pfn << PAGE_SHIFT; + if (!paddr && !size) { + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return; + } + + size = vma->vm_end - vma->vm_start; } + free_pfn_range(paddr, size); } pgprot_t pgprot_writecombine(pgprot_t prot) From 5180da410db6369d1f95c9014da1c9bc33fb043e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 8 Oct 2012 16:28:29 -0700 Subject: [PATCH 009/160] x86, pat: separate the pfn attribute tracking for remap_pfn_range and vm_insert_pfn With PAT enabled, vm_insert_pfn() looks up the existing pfn memory attribute and uses it. Expectation is that the driver reserves the memory attributes for the pfn before calling vm_insert_pfn(). remap_pfn_range() (when called for the whole vma) will setup a new attribute (based on the prot argument) for the specified pfn range. This addresses the legacy usage which typically calls remap_pfn_range() with a desired memory attribute. For ranges smaller than the vma size (which is typically not the case), remap_pfn_range() will use the existing memory attribute for the pfn range. Expose two different API's for these different behaviors. track_pfn_insert() for tracking the pfn attribute set by vm_insert_pfn() and track_pfn_remap() for the remap_pfn_range(). This cleanup also prepares the ground for the track/untrack pfn vma routines to take over the ownership of setting PAT specific vm_flag in the 'vma'. [khlebnikov@openvz.org: Clear checks in track_pfn_remap()] [akpm@linux-foundation.org: tweak a few comments] Signed-off-by: Suresh Siddha Signed-off-by: Konstantin Khlebnikov Cc: Venkatesh Pallipadi Cc: H. Peter Anvin Cc: Nick Piggin Cc: Ingo Molnar Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Hugh Dickins Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Konstantin Khlebnikov Cc: Matt Helsley Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat.c | 47 +++++++++++++++++++++++------- include/asm-generic/pgtable.h | 55 ++++++++++++++++++++--------------- mm/memory.c | 13 ++++----- 3 files changed, 73 insertions(+), 42 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index de36c886cd38..74a702674e86 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -664,13 +664,13 @@ static void free_pfn_range(u64 paddr, unsigned long size) } /* - * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). * * If the vma has a linear pfn mapping for the entire range, we get the prot * from pte and reserve the entire vma range with single reserve_pfn_range call. 
*/ -int track_pfn_vma_copy(struct vm_area_struct *vma) +int track_pfn_copy(struct vm_area_struct *vma) { resource_size_t paddr; unsigned long prot; @@ -694,15 +694,12 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) } /* - * track_pfn_vma_new is called when a _new_ pfn mapping is being established - * for physical range indicated by pfn and size. - * * prot is passed in as a parameter for the new mapping. If the vma has a * linear pfn mapping for the entire range reserve the entire vma range with * single reserve_pfn_range call. */ -int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size) +int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long size) { resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; unsigned long flags; @@ -714,8 +711,36 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, if (!pat_enabled) return 0; - /* for vm_insert_pfn and friends, we set prot based on lookup */ + /* + * For anything smaller than the vma size we set prot based on the + * lookup. + */ flags = lookup_memtype(paddr); + + /* Check memtype for the remaining pages */ + while (size > PAGE_SIZE) { + size -= PAGE_SIZE; + paddr += PAGE_SIZE; + if (flags != lookup_memtype(paddr)) + return -EINVAL; + } + + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + + return 0; +} + +int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn) +{ + unsigned long flags; + + if (!pat_enabled) + return 0; + + /* Set prot based on lookup */ + flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | flags); @@ -723,12 +748,12 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, } /* - * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or * can be for the entire vma (in which case pfn, size are zero). */ -void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size) +void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) { resource_size_t paddr; unsigned long prot; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index ff4947b7a976..d4d4592c97fc 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -381,48 +381,57 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #ifndef __HAVE_PFNMAP_TRACKING /* - * Interface that can be used by architecture code to keep track of - * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) - * - * track_pfn_vma_new is called when a _new_ pfn mapping is being established - * for physical range indicated by pfn and size. + * Interfaces that can be used by architecture code to keep track of + * memory type of pfn mappings specified by the remap_pfn_range, + * vm_insert_pfn. */ -static inline int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size) + +/* + * track_pfn_remap is called when a _new_ pfn mapping is being established + * by remap_pfn_range() for physical range indicated by pfn and size. 
+ */ +static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long size) { return 0; } /* - * Interface that can be used by architecture code to keep track of - * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) - * - * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * track_pfn_insert is called when a _new_ single pfn is established + * by vm_insert_pfn(). + */ +static inline int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn) +{ + return 0; +} + +/* + * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). */ -static inline int track_pfn_vma_copy(struct vm_area_struct *vma) +static inline int track_pfn_copy(struct vm_area_struct *vma) { return 0; } /* - * Interface that can be used by architecture code to keep track of - * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) - * * untrack_pfn_vma is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case size can be zero). + * can be for the entire vma (in which case pfn, size are zero). */ -static inline void untrack_pfn_vma(struct vm_area_struct *vma, - unsigned long pfn, unsigned long size) +static inline void untrack_pfn(struct vm_area_struct *vma, + unsigned long pfn, unsigned long size) { } #else -extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size); -extern int track_pfn_vma_copy(struct vm_area_struct *vma); -extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size); +extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long size); +extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn); +extern int track_pfn_copy(struct vm_area_struct *vma); +extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size); #endif #ifdef CONFIG_MMU diff --git a/mm/memory.c b/mm/memory.c index 57361708d1a5..6bef278ad303 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1060,7 +1060,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * We do not free on error cases below as remove_vma * gets called on error from higher level routine */ - ret = track_pfn_vma_copy(vma); + ret = track_pfn_copy(vma); if (ret) return ret; } @@ -1328,7 +1328,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, uprobe_munmap(vma, start, end); if (unlikely(is_pfn_mapping(vma))) - untrack_pfn_vma(vma, 0, 0); + untrack_pfn(vma, 0, 0); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { @@ -2162,14 +2162,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) + if (track_pfn_insert(vma, &pgprot, pfn)) return -EINVAL; ret = insert_pfn(vma, addr, pfn, pgprot); - if (ret) - untrack_pfn_vma(vma, pfn, PAGE_SIZE); - return ret; } EXPORT_SYMBOL(vm_insert_pfn); @@ -2311,7 +2308,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; - err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); + err = track_pfn_remap(vma, &prot, pfn, PAGE_ALIGN(size)); if (err) { /* * To indicate that track_pfn related cleanup is not @@ -2335,7 +2332,7 @@ int 
remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } while (pgd++, addr = next, addr != end); if (err) - untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); + untrack_pfn(vma, pfn, PAGE_ALIGN(size)); return err; } From b3b9c2932c32e0692018ed5f12f3fd8c70eea8ce Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:34 -0700 Subject: [PATCH 010/160] mm, x86, pat: rework linear pfn-mmap tracking Replace the generic vma-flag VM_PFN_AT_MMAP with x86-only VM_PAT. We can toss mapping address from remap_pfn_range() into track_pfn_vma_new(), and collect all PAT-related logic together in arch/x86/. This patch also restores orignal frustration-free is_cow_mapping() check in remap_pfn_range(), as it was before commit v2.6.28-rc8-88-g3c8bb73 ("x86: PAT: store vm_pgoff for all linear_over_vma_region mappings - v3") is_linear_pfn_mapping() checks can be removed from mm/huge_memory.c, because it already handled by VM_PFNMAP in VM_NO_THP bit-mask. [suresh.b.siddha@intel.com: Reset the VM_PAT flag as part of untrack_pfn_vma()] Signed-off-by: Konstantin Khlebnikov Signed-off-by: Suresh Siddha Cc: Venkatesh Pallipadi Cc: H. Peter Anvin Cc: Nick Piggin Cc: Ingo Molnar Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Hugh Dickins Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat.c | 17 ++++++++++++----- include/asm-generic/pgtable.h | 6 ++++-- include/linux/mm.h | 20 +------------------- mm/huge_memory.c | 19 +++---------------- mm/memory.c | 26 ++++++++++---------------- 5 files changed, 30 insertions(+), 58 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 74a702674e86..0eb572eda406 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -677,7 +677,7 @@ int track_pfn_copy(struct vm_area_struct *vma) unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - if (is_linear_pfn_mapping(vma)) { + if (vma->vm_flags & VM_PAT) { /* * reserve the whole chunk covered by vma. We need the * starting address and protection from pte. @@ -699,14 +699,20 @@ int track_pfn_copy(struct vm_area_struct *vma) * single reserve_pfn_range call. 
*/ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size) + unsigned long pfn, unsigned long addr, unsigned long size) { resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; unsigned long flags; /* reserve the whole chunk starting from paddr */ - if (is_linear_pfn_mapping(vma)) - return reserve_pfn_range(paddr, size, prot, 0); + if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + int ret; + + ret = reserve_pfn_range(paddr, size, prot, 0); + if (!ret) + vma->vm_flags |= VM_PAT; + return ret; + } if (!pat_enabled) return 0; @@ -758,7 +764,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long prot; - if (!is_linear_pfn_mapping(vma)) + if (!(vma->vm_flags & VM_PAT)) return; /* free the chunk starting from pfn or the whole chunk */ @@ -772,6 +778,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); + vma->vm_flags &= ~VM_PAT; } pgprot_t pgprot_writecombine(pgprot_t prot) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index d4d4592c97fc..c9a612069c8e 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -391,7 +391,8 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, * by remap_pfn_range() for physical range indicated by pfn and size. */ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size) + unsigned long pfn, unsigned long addr, + unsigned long size) { return 0; } @@ -426,7 +427,8 @@ static inline void untrack_pfn(struct vm_area_struct *vma, } #else extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size); + unsigned long pfn, unsigned long addr, + unsigned long size); extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn); extern int track_pfn_copy(struct vm_area_struct *vma); diff --git a/include/linux/mm.h b/include/linux/mm.h index 311be906b57d..75d1632d3477 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -117,7 +117,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ -#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ +#define VM_PAT 0x40000000 /* PAT reserves whole VMA at once (x86) */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ /* Bits set in the VMA until the stack is in its final location */ @@ -158,24 +158,6 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ -/* - * This interface is used by x86 PAT code to identify a pfn mapping that is - * linear over entire vma. This is to optimize PAT code that deals with - * marking the physical region with a particular prot. This is not for generic - * mm use. Note also that this check will not work if the pfn mapping is - * linear for a vma starting at physical address 0. In which case PAT code - * falls back to slow path of reserving physical range page by page. 
- */ -static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) -{ - return !!(vma->vm_flags & VM_PFN_AT_MMAP); -} - -static inline int is_pfn_mapping(struct vm_area_struct *vma) -{ - return !!(vma->vm_flags & VM_PFNMAP); -} - /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 141dbb695097..73cb22ee9665 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1655,11 +1655,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) if (vma->vm_ops) /* khugepaged not yet working on file or special mappings */ return 0; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() must be - * true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -1912,11 +1908,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out; if (is_vma_temporary_stack(vma)) goto out; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() must be - * true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -2154,12 +2146,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, goto skip; if (is_vma_temporary_stack(vma)) goto skip; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() - * must be true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || - vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; diff --git a/mm/memory.c b/mm/memory.c index 6bef278ad303..655e1429388a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1055,7 +1055,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); - if (unlikely(is_pfn_mapping(vma))) { + if (unlikely(vma->vm_flags & VM_PFNMAP)) { /* * We do not free on error cases below as remove_vma * gets called on error from higher level routine @@ -1327,7 +1327,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) uprobe_munmap(vma, start, end); - if (unlikely(is_pfn_mapping(vma))) + if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn(vma, 0, 0); if (start != end) { @@ -2299,26 +2299,20 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". + * See vm_normal_page() for details. 
*/ - if (addr == vma->vm_start && end == vma->vm_end) { + if (is_cow_mapping(vma->vm_flags)) { + if (addr != vma->vm_start || end != vma->vm_end) + return -EINVAL; vma->vm_pgoff = pfn; - vma->vm_flags |= VM_PFN_AT_MMAP; - } else if (is_cow_mapping(vma->vm_flags)) + } + + err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); + if (err) return -EINVAL; vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; - err = track_pfn_remap(vma, &prot, pfn, PAGE_ALIGN(size)); - if (err) { - /* - * To indicate that track_pfn related cleanup is not - * needed from higher level routine calling unmap_vmas - */ - vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); - vma->vm_flags &= ~VM_PFN_AT_MMAP; - return -EINVAL; - } - BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); From cc2383ec06be093789469852e1fe96e1148e9a2c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:37 -0700 Subject: [PATCH 011/160] mm: introduce arch-specific vma flag VM_ARCH_1 Combine several arch-specific vma flags into one. before patch: 0x00000200 0x01000000 0x20000000 0x40000000 x86 VM_NOHUGEPAGE VM_HUGEPAGE - VM_PAT powerpc - - VM_SAO - parisc VM_GROWSUP - - - ia64 VM_GROWSUP - - - nommu - VM_MAPPED_COPY - - others - - - - after patch: 0x00000200 0x01000000 0x20000000 0x40000000 x86 - VM_PAT VM_HUGEPAGE VM_NOHUGEPAGE powerpc - VM_SAO - - parisc - VM_GROWSUP - - ia64 - VM_GROWSUP - - nommu - VM_MAPPED_COPY - - others - VM_ARCH_1 - - And voila! One completely free bit. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 34 +++++++++++++++++++++------------- mm/huge_memory.c | 2 +- mm/ksm.c | 7 ++++++- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 75d1632d3477..9c039f84b63c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -70,6 +70,8 @@ extern unsigned int kobjsize(const void *objp); /* * vm_flags in vm_area_struct, see mm_types.h. */ +#define VM_NONE 0x00000000 + #define VM_READ 0x00000001 /* currently active flags */ #define VM_WRITE 0x00000002 #define VM_EXEC 0x00000004 @@ -82,12 +84,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ -#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) -#define VM_GROWSUP 0x00000200 -#else -#define VM_GROWSUP 0x00000000 -#define VM_NOHUGEPAGE 0x00000200 /* MADV_NOHUGEPAGE marked this vma */ -#endif #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. 
*/ @@ -106,20 +102,32 @@ extern unsigned int kobjsize(const void *objp); #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ -#ifndef CONFIG_TRANSPARENT_HUGEPAGE -#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ -#else -#define VM_HUGEPAGE 0x01000000 /* MADV_HUGEPAGE marked this vma */ -#endif +#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_NODUMP 0x04000000 /* Do not include in the core dump */ #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ -#define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ -#define VM_PAT 0x40000000 /* PAT reserves whole VMA at once (x86) */ +#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ +#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ +#if defined(CONFIG_X86) +# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ +#elif defined(CONFIG_PPC) +# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ +#elif defined(CONFIG_PARISC) +# define VM_GROWSUP VM_ARCH_1 +#elif defined(CONFIG_IA64) +# define VM_GROWSUP VM_ARCH_1 +#elif !defined(CONFIG_MMU) +# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ +#endif + +#ifndef VM_GROWSUP +# define VM_GROWSUP VM_NONE +#endif + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 73cb22ee9665..47206692cf88 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1491,7 +1491,7 @@ out: return ret; } -#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ +#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP| \ VM_HUGETLB|VM_SHARED|VM_MAYSHARE) int hugepage_madvise(struct vm_area_struct *vma, diff --git a/mm/ksm.c b/mm/ksm.c index 47c885368890..d1cbe2aa6b3a 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1470,9 +1470,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) + VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ +#ifdef VM_SAO + if (*vm_flags & VM_SAO) + return 0; +#endif + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) From 4b6e1e37026ec7dae9b23d78ffcebdd5ddb1bfa1 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:40 -0700 Subject: [PATCH 012/160] mm: kill vma flag VM_INSERTPAGE Merge VM_INSERTPAGE into VM_MIXEDMAP. VM_MIXEDMAP VMA can mix pure-pfn ptes, special ptes and normal ptes. Now copy_page_range() always copies VM_MIXEDMAP VMA on fork like VM_PFNMAP. If driver populates whole VMA at mmap() it probably not expects page-faults. This patch removes special check from vma_wants_writenotify() which disables pages write tracking for VMA populated via vm_instert_page(). BDI below mapped file should not use dirty-accounting, moreover do_wp_page() can handle this. vm_insert_page() still marks vma after first usage. 
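
(Editorial aside, not part of the patch: the calling convention described in
this and the following sentences amounts to the hypothetical driver sketch
below -- the my_drv_* names and my_page are invented, and the usual
<linux/mm.h>/<linux/fs.h> includes are assumed.)

	static struct page *my_page;	/* assume allocated elsewhere */

	static int my_drv_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		/* allowed here only because VM_MIXEDMAP was set at mmap time */
		if (vm_insert_page(vma, (unsigned long)vmf->virtual_address, my_page))
			return VM_FAULT_SIGBUS;
		return VM_FAULT_NOPAGE;
	}

	static const struct vm_operations_struct my_drv_vm_ops = {
		.fault	= my_drv_fault,
	};

	static int my_drv_mmap(struct file *file, struct vm_area_struct *vma)
	{
		/* ->mmap() runs with mm->mmap_sem held for write, so changing
		 * vma->vm_flags is safe; a fault handler must not do this. */
		vma->vm_flags |= VM_MIXEDMAP;
		vma->vm_ops = &my_drv_vm_ops;
		return 0;
	}
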
Usually it is called from f_op->mmap() handler under mm->mmap_sem write-lock, so it able to change vma->vm_flags. Caller must set VM_MIXEDMAP at mmap time if it wants to call this function from other places, for example from page-fault handler. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/huge_memory.c | 3 +-- mm/ksm.c | 2 +- mm/memory.c | 14 ++++++++++++-- mm/mmap.c | 2 +- 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c039f84b63c..fb0685b17914 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -103,7 +103,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_NODUMP 0x04000000 /* Do not include in the core dump */ #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 47206692cf88..9b72d1270519 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1491,8 +1491,7 @@ out: return ret; } -#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP| \ - VM_HUGETLB|VM_SHARED|VM_MAYSHARE) +#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) diff --git a/mm/ksm.c b/mm/ksm.c index d1cbe2aa6b3a..f9ccb16559ee 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1469,7 +1469,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_RESERVED | VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ diff --git a/mm/memory.c b/mm/memory.c index 655e1429388a..7b1e4feaec06 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1047,7 +1047,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { + if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | + VM_PFNMAP | VM_MIXEDMAP))) { if (!vma->anon_vma) return 0; } @@ -2085,6 +2086,11 @@ out: * ask for a shared writable mapping! * * The page does not need to be reserved. + * + * Usually this function is called from f_op->mmap() handler + * under mm->mmap_sem write-lock, so it can change vma->vm_flags. + * Caller must set VM_MIXEDMAP on vma if it wants to call this + * function from other places, for example from page-fault handler. 
*/ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) @@ -2093,7 +2099,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, return -EFAULT; if (!page_count(page)) return -EINVAL; - vma->vm_flags |= VM_INSERTPAGE; + if (!(vma->vm_flags & VM_MIXEDMAP)) { + BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); + BUG_ON(vma->vm_flags & VM_PFNMAP); + vma->vm_flags |= VM_MIXEDMAP; + } return insert_page(vma, addr, page, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_page); diff --git a/mm/mmap.c b/mm/mmap.c index 872441e81914..b0989f4d4f09 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1190,7 +1190,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma) return 0; /* Specialty mapping? */ - if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) + if (vm_flags & VM_PFNMAP) return 0; /* Can the mapping track the dirty pages? */ From 0b173bc4daa8f8ec03a85abf5e47b23502ff80af Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:46 -0700 Subject: [PATCH 013/160] mm: kill vma flag VM_CAN_NONLINEAR Move actual pte filling for non-linear file mappings into the new special vma operation: ->remap_pages(). Filesystems must implement this method to get non-linear mapping support, if it uses filemap_fault() then generic_file_remap_pages() can be used. Now device drivers can implement this method and obtain nonlinear vma support. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf #arch/tile Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/staging/android/ashmem.c | 1 - fs/9p/vfs_file.c | 1 + fs/btrfs/file.c | 2 +- fs/ceph/addr.c | 2 +- fs/cifs/file.c | 1 + fs/ext4/file.c | 2 +- fs/fuse/file.c | 1 + fs/gfs2/file.c | 2 +- fs/nfs/file.c | 1 + fs/nilfs2/file.c | 2 +- fs/ocfs2/mmap.c | 2 +- fs/ubifs/file.c | 1 + fs/xfs/xfs_file.c | 2 +- include/linux/fs.h | 2 ++ include/linux/mm.h | 7 ++++--- mm/filemap.c | 2 +- mm/filemap_xip.c | 3 ++- mm/fremap.c | 14 ++++++++------ mm/mmap.c | 3 +-- mm/nommu.c | 8 ++++++++ mm/shmem.c | 3 +-- 21 files changed, 39 insertions(+), 23 deletions(-) diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index 94a740d2883d..634b9ae713e0 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -332,7 +332,6 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); vma->vm_file = asma->file; - vma->vm_flags |= VM_CAN_NONLINEAR; out: mutex_unlock(&ashmem_mutex); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index dd6f7ee1e312..c2483e97beee 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -738,6 +738,7 @@ v9fs_cached_file_write(struct file *filp, const char __user * data, static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = v9fs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5caf285c6e4d..f6b40e86121b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1599,6 +1599,7 @@ out: static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = btrfs_page_mkwrite, + .remap_pages = 
generic_file_remap_pages, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) @@ -1610,7 +1611,6 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) file_accessed(filp); vma->vm_ops = &btrfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 22b6e4583faa..6690269f5dde 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1224,6 +1224,7 @@ out: static struct vm_operations_struct ceph_vmops = { .fault = filemap_fault, .page_mkwrite = ceph_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int ceph_mmap(struct file *file, struct vm_area_struct *vma) @@ -1234,6 +1235,5 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &ceph_vmops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7d7bbdc4c8e7..edb25b4bbb95 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3003,6 +3003,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = cifs_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ca6f07afe601..bf3966bccd34 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -207,6 +207,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = ext4_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) @@ -217,7 +218,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &ext4_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index aba15f1b7ad2..78d2837bc940 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1379,6 +1379,7 @@ static const struct vm_operations_struct fuse_file_vm_ops = { .close = fuse_vma_close, .fault = filemap_fault, .page_mkwrite = fuse_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 30e21997a1a1..0def0504afc1 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -492,6 +492,7 @@ out: static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, .page_mkwrite = gfs2_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; /** @@ -526,7 +527,6 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) return error; } vma->vm_ops = &gfs2_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6a7fcab7ecb3..f692be97676d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -578,6 +578,7 @@ out: static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = nfs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int nfs_need_sync_write(struct file *filp, struct inode *inode) diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 5b387a4c293e..16f35f7423c5 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -135,13 +135,13 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct 
vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = nilfs_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &nilfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index d150372fd81d..47a87dda54ce 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -173,6 +173,7 @@ out: static const struct vm_operations_struct ocfs2_file_vm_ops = { .fault = ocfs2_fault, .page_mkwrite = ocfs2_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) @@ -188,7 +189,6 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level); out: vma->vm_ops = &ocfs2_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index ff48c5a85309..5bc77817f382 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1536,6 +1536,7 @@ out_unlock: static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = ubifs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1eaeb8be3aae..aa473fa640a2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -940,7 +940,6 @@ xfs_file_mmap( struct vm_area_struct *vma) { vma->vm_ops = &xfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; file_accessed(filp); return 0; @@ -1443,4 +1442,5 @@ const struct file_operations xfs_dir_file_operations = { static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = xfs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index ca6d8c806f47..5a8a273d5b2f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2552,6 +2552,8 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); +extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, + unsigned long size, pgoff_t pgoff); extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); diff --git a/include/linux/mm.h b/include/linux/mm.h index fb0685b17914..44d3fc25f556 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -105,7 +105,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_NODUMP 0x04000000 /* Do not include in the core dump */ -#define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ @@ -171,8 +170,7 @@ extern pgprot_t protection_map[16]; * of VM_FAULT_xxx flags that give details about how the fault was handled. * * pgoff should be used in favour of virtual_address, if possible. 
If pgoff - * is used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get nonlinear - * mapping support. + * is used, one may implement ->remap_pages to get nonlinear mapping support. */ struct vm_fault { unsigned int flags; /* FAULT_FLAG_xxx flags */ @@ -230,6 +228,9 @@ struct vm_operations_struct { int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from, const nodemask_t *to, unsigned long flags); #endif + /* called by sys_remap_file_pages() to populate non-linear mapping */ + int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr, + unsigned long size, pgoff_t pgoff); }; struct mmu_gather; diff --git a/mm/filemap.c b/mm/filemap.c index 384344575c37..a9827b42556e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1737,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite); const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = filemap_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; /* This is used for a general mmap of a disk file */ @@ -1749,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 13e013b1270c..91750227a191 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -305,6 +305,7 @@ out: static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, .page_mkwrite = filemap_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int xip_file_mmap(struct file * file, struct vm_area_struct * vma) @@ -313,7 +314,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma) file_accessed(file); vma->vm_ops = &xip_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; + vma->vm_flags |= VM_MIXEDMAP; return 0; } EXPORT_SYMBOL_GPL(xip_file_mmap); diff --git a/mm/fremap.c b/mm/fremap.c index 048659c0c03d..3d731a498788 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -5,6 +5,7 @@ * * started by Ingo Molnar, Copyright (C) 2002, 2003 */ +#include #include #include #include @@ -80,9 +81,10 @@ out: return err; } -static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long size, pgoff_t pgoff) +int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, + unsigned long size, pgoff_t pgoff) { + struct mm_struct *mm = vma->vm_mm; int err; do { @@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, pgoff++; } while (size); - return 0; - + return 0; } +EXPORT_SYMBOL(generic_file_remap_pages); /** * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma @@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) goto out; - if (!(vma->vm_flags & VM_CAN_NONLINEAR)) + if (!vma->vm_ops->remap_pages) goto out; if (start < vma->vm_start || start + size > vma->vm_end) @@ -228,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } mmu_notifier_invalidate_range_start(mm, start, start + size); - err = populate_range(mm, vma, start, size, pgoff); + err = vma->vm_ops->remap_pages(vma, start, size, pgoff); mmu_notifier_invalidate_range_end(mm, start, start + size); if (!err && !(flags & MAP_NONBLOCK)) { if (vma->vm_flags & VM_LOCKED) { diff --git a/mm/mmap.c b/mm/mmap.c index b0989f4d4f09..d0686d355113 100644 --- a/mm/mmap.c +++ 
b/mm/mmap.c @@ -669,8 +669,7 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ - if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR) + if (vma->vm_flags ^ vm_flags) return 0; if (vma->vm_file != file) return 0; diff --git a/mm/nommu.c b/mm/nommu.c index dee2ff89fd58..98318dcff742 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1961,6 +1961,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); +int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, + unsigned long size, pgoff_t pgoff) +{ + BUG(); + return 0; +} +EXPORT_SYMBOL(generic_file_remap_pages); + static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, int write) { diff --git a/mm/shmem.c b/mm/shmem.c index d3752110c8c7..cc12072f8787 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1339,7 +1339,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &shmem_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } @@ -2643,6 +2642,7 @@ static const struct vm_operations_struct shmem_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif + .remap_pages = generic_file_remap_pages, }; static struct dentry *shmem_mount(struct file_system_type *fs_type, @@ -2836,7 +2836,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } From 2dd8ad81e31d0d36a5d448329c646ab43eb17788 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:51 -0700 Subject: [PATCH 014/160] mm: use mm->exe_file instead of first VM_EXECUTABLE vma->vm_file Some security modules and oprofile still uses VM_EXECUTABLE for retrieving a task's executable file. After this patch they will use mm->exe_file directly. mm->exe_file is protected with mm->mmap_sem, so locking stays the same. Signed-off-by: Konstantin Khlebnikov Acked-by: Chris Metcalf [arch/tile] Acked-by: Tetsuo Handa [tomoyo] Cc: Alexander Viro Cc: Carsten Otte Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Acked-by: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/oprofile/cell/spu_task_sync.c | 15 ++++----------- arch/tile/mm/elf.c | 19 +++++++------------ drivers/oprofile/buffer_sync.c | 17 +++-------------- kernel/auditsc.c | 13 ++----------- kernel/fork.c | 3 +-- security/tomoyo/util.c | 9 ++------- 6 files changed, 19 insertions(+), 57 deletions(-) diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c index 642fca137ccb..28f1af2db1f5 100644 --- a/arch/powerpc/oprofile/cell/spu_task_sync.c +++ b/arch/powerpc/oprofile/cell/spu_task_sync.c @@ -304,7 +304,7 @@ static inline unsigned long fast_get_dcookie(struct path *path) return cookie; } -/* Look up the dcookie for the task's first VM_EXECUTABLE mapping, +/* Look up the dcookie for the task's mm->exe_file, * which corresponds loosely to "application name". Also, determine * the offset for the SPU ELF object. 
If computed offset is * non-zero, it implies an embedded SPU object; otherwise, it's a @@ -321,7 +321,6 @@ get_exec_dcookie_and_offset(struct spu *spu, unsigned int *offsetp, { unsigned long app_cookie = 0; unsigned int my_offset = 0; - struct file *app = NULL; struct vm_area_struct *vma; struct mm_struct *mm = spu->mm; @@ -330,16 +329,10 @@ get_exec_dcookie_and_offset(struct spu *spu, unsigned int *offsetp, down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (!vma->vm_file) - continue; - if (!(vma->vm_flags & VM_EXECUTABLE)) - continue; - app_cookie = fast_get_dcookie(&vma->vm_file->f_path); + if (mm->exe_file) { + app_cookie = fast_get_dcookie(&mm->exe_file->f_path); pr_debug("got dcookie for %s\n", - vma->vm_file->f_dentry->d_name.name); - app = vma->vm_file; - break; + mm->exe_file->f_dentry->d_name.name); } for (vma = mm->mmap; vma; vma = vma->vm_next) { diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c index 758b6038c2b7..3cfa98bf9125 100644 --- a/arch/tile/mm/elf.c +++ b/arch/tile/mm/elf.c @@ -36,19 +36,14 @@ static void sim_notify_exec(const char *binary_name) } while (c); } -static int notify_exec(void) +static int notify_exec(struct mm_struct *mm) { int retval = 0; /* failure */ - struct vm_area_struct *vma = current->mm->mmap; - while (vma) { - if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) - break; - vma = vma->vm_next; - } - if (vma) { + + if (mm->exe_file) { char *buf = (char *) __get_free_page(GFP_KERNEL); if (buf) { - char *path = d_path(&vma->vm_file->f_path, + char *path = d_path(&mm->exe_file->f_path, buf, PAGE_SIZE); if (!IS_ERR(path)) { sim_notify_exec(path); @@ -106,16 +101,16 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, unsigned long vdso_base; int retval = 0; + down_write(&mm->mmap_sem); + /* * Notify the simulator that an exec just occurred. * If we can't find the filename of the mapping, just use * whatever was passed as the linux_binprm filename. */ - if (!notify_exec()) + if (!notify_exec(mm)) sim_notify_exec(bprm->filename); - down_write(&mm->mmap_sem); - /* * MAYWRITE to allow gdb to COW and set breakpoints */ diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index f34b5b29fb95..d93b2b6b1f7a 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -216,7 +216,7 @@ static inline unsigned long fast_get_dcookie(struct path *path) } -/* Look up the dcookie for the task's first VM_EXECUTABLE mapping, +/* Look up the dcookie for the task's mm->exe_file, * which corresponds loosely to "application name". 
This is * not strictly necessary but allows oprofile to associate * shared-library samples with particular applications @@ -224,21 +224,10 @@ static inline unsigned long fast_get_dcookie(struct path *path) static unsigned long get_exec_dcookie(struct mm_struct *mm) { unsigned long cookie = NO_COOKIE; - struct vm_area_struct *vma; - if (!mm) - goto out; + if (mm && mm->exe_file) + cookie = fast_get_dcookie(&mm->exe_file->f_path); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (!vma->vm_file) - continue; - if (!(vma->vm_flags & VM_EXECUTABLE)) - continue; - cookie = fast_get_dcookie(&vma->vm_file->f_path); - break; - } - -out: return cookie; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 29e090cc0e46..f4a7756f999c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1151,7 +1151,6 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) const struct cred *cred; char name[sizeof(tsk->comm)]; struct mm_struct *mm = tsk->mm; - struct vm_area_struct *vma; char *tty; if (!ab) @@ -1191,16 +1190,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) if (mm) { down_read(&mm->mmap_sem); - vma = mm->mmap; - while (vma) { - if ((vma->vm_flags & VM_EXECUTABLE) && - vma->vm_file) { - audit_log_d_path(ab, " exe=", - &vma->vm_file->f_path); - break; - } - vma = vma->vm_next; - } + if (mm->exe_file) + audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); up_read(&mm->mmap_sem); } audit_log_task_context(ab); diff --git a/kernel/fork.c b/kernel/fork.c index a2b1efc20928..a57a993681ed 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -656,8 +656,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm) { struct file *exe_file; - /* We need mmap_sem to protect against races with removal of - * VM_EXECUTABLE vmas */ + /* We need mmap_sem to protect against races with removal of exe_file */ down_read(&mm->mmap_sem); exe_file = mm->exe_file; if (exe_file) diff --git a/security/tomoyo/util.c b/security/tomoyo/util.c index 867558c98334..2952ba576fb9 100644 --- a/security/tomoyo/util.c +++ b/security/tomoyo/util.c @@ -949,18 +949,13 @@ bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename, const char *tomoyo_get_exe(void) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; const char *cp = NULL; if (!mm) return NULL; down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { - cp = tomoyo_realpath_from_path(&vma->vm_file->f_path); - break; - } - } + if (mm->exe_file) + cp = tomoyo_realpath_from_path(&mm->exe_file->f_path); up_read(&mm->mmap_sem); return cp; } From e9714acf8c439688884234dcac2bfc38bb607d38 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:54 -0700 Subject: [PATCH 015/160] mm: kill vma flag VM_EXECUTABLE and mm->num_exe_file_vmas Currently the kernel sets mm->exe_file during sys_execve() and then tracks number of vmas with VM_EXECUTABLE flag in mm->num_exe_file_vmas, as soon as this counter drops to zero kernel resets mm->exe_file to NULL. Plus it resets mm->exe_file at last mmput() when mm->mm_users drops to zero. VMA with VM_EXECUTABLE flag appears after mapping file with flag MAP_EXECUTABLE, such vmas can appears only at sys_execve() or after vma splitting, because sys_mmap ignores this flag. Usually binfmt module sets mm->exe_file and mmaps executable vmas with this file, they hold mm->exe_file while task is running. 
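Before the historical note quoted next, a minimal sketch (not from the series) of the call pattern that patches 014 and 015 converge on: read mm->exe_file instead of scanning for a VM_EXECUTABLE vma. The helper name task_exe_path() and its buffer handling are invented; get_mm_exe_file(), d_path() and fput() are the existing interfaces used by the converted call sites.

	#include <linux/dcache.h>
	#include <linux/fs.h>
	#include <linux/mm.h>
	#include <linux/mm_types.h>

	/*
	 * Hypothetical helper: resolve the path of a task's executable via
	 * mm->exe_file. get_mm_exe_file() takes mmap_sem internally and
	 * returns the file with an extra reference, or NULL.
	 */
	static char *task_exe_path(struct mm_struct *mm, char *buf, int buflen)
	{
		struct file *exe_file;
		char *path = NULL;

		exe_file = get_mm_exe_file(mm);
		if (exe_file) {
			path = d_path(&exe_file->f_path, buf, buflen); /* may be ERR_PTR */
			fput(exe_file);
		}
		return path;
	}

This mirrors the tomoyo and audit hunks above, which open-code the same access to mm->exe_file under down_read(&mm->mmap_sem).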
comment from v2.6.25-6245-g925d1c4 ("procfs task exe symlink"), where all this stuff was introduced: > The kernel implements readlink of /proc/pid/exe by getting the file from > the first executable VMA. Then the path to the file is reconstructed and > reported as the result. > > Because of the VMA walk the code is slightly different on nommu systems. > This patch avoids separate /proc/pid/exe code on nommu systems. Instead of > walking the VMAs to find the first executable file-backed VMA we store a > reference to the exec'd file in the mm_struct. > > That reference would prevent the filesystem holding the executable file > from being unmounted even after unmapping the VMAs. So we track the number > of VM_EXECUTABLE VMAs and drop the new reference when the last one is > unmapped. This avoids pinning the mounted filesystem. exe_file's vma accounting is hooked into every file mmap/unmmap and vma split/merge just to fix some hypothetical pinning fs from umounting by mm, which already unmapped all its executable files, but still alive. Seems like currently nobody depends on this behaviour. We can try to remove this logic and keep mm->exe_file until final mmput(). mm->exe_file is still protected with mm->mmap_sem, because we want to change it via new sys_prctl(PR_SET_MM_EXE_FILE). Also via this syscall task can change its mm->exe_file and unpin mountpoint explicitly. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ---- include/linux/mm_types.h | 1 - include/linux/mman.h | 1 - kernel/fork.c | 21 --------------------- mm/mmap.c | 25 ++++--------------------- mm/nommu.c | 11 +---------- 6 files changed, 5 insertions(+), 58 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 44d3fc25f556..25ef49c1f2bd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -87,7 +87,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ -#define VM_EXECUTABLE 0x00001000 #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ @@ -1396,9 +1395,6 @@ extern void exit_mmap(struct mm_struct *); extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); -/* From fs/proc/base.c. 
callers must _not_ hold the mm's exe_file_lock */ -extern void added_exe_file_vma(struct mm_struct *mm); -extern void removed_exe_file_vma(struct mm_struct *mm); extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bf7867200b95..58d3173eb365 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -394,7 +394,6 @@ struct mm_struct { /* store ref to file /proc//exe symlink points to */ struct file *exe_file; - unsigned long num_exe_file_vmas; #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; #endif diff --git a/include/linux/mman.h b/include/linux/mman.h index 8b74e9b1d0ad..77cec2f45cb7 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -86,7 +86,6 @@ calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | - _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); } #endif /* __KERNEL__ */ diff --git a/kernel/fork.c b/kernel/fork.c index a57a993681ed..ec667f797af3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -622,26 +622,6 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -/* - * We added or removed a vma mapping the executable. The vmas are only mapped - * during exec and are not mapped with the mmap system call. - * Callers must hold down_write() on the mm's mmap_sem for these - */ -void added_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas++; -} - -void removed_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas--; - if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { - fput(mm->exe_file); - mm->exe_file = NULL; - } - -} - void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { if (new_exe_file) @@ -649,7 +629,6 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) if (mm->exe_file) fput(mm->exe_file); mm->exe_file = new_exe_file; - mm->num_exe_file_vmas = 0; } struct file *get_mm_exe_file(struct mm_struct *mm) diff --git a/mm/mmap.c b/mm/mmap.c index d0686d355113..c1ad2e78ea58 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -231,11 +231,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) { + if (vma->vm_file) fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); - } mpol_put(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); return next; @@ -636,8 +633,6 @@ again: remove_next = 1 + (end > next->vm_end); if (file) { uprobe_munmap(next, next->vm_start, next->vm_end); fput(file); - if (next->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); } if (next->anon_vma) anon_vma_merge(vma, next); @@ -1304,8 +1299,6 @@ munmap_back: error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; - if (vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); /* Can addr have changed?? 
* @@ -1987,11 +1980,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, if (anon_vma_clone(new, vma)) goto out_free_mpol; - if (new->vm_file) { + if (new->vm_file) get_file(new->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); - } if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); @@ -2009,11 +1999,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, /* Clean everything up if vma_adjust failed. */ if (new->vm_ops && new->vm_ops->close) new->vm_ops->close(new); - if (new->vm_file) { - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); + if (new->vm_file) fput(new->vm_file); - } unlink_anon_vmas(new); out_free_mpol: mpol_put(pol); @@ -2408,12 +2395,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma->vm_start = addr; new_vma->vm_end = addr + len; new_vma->vm_pgoff = pgoff; - if (new_vma->vm_file) { + if (new_vma->vm_file) get_file(new_vma->vm_file); - - if (vma->vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); - } if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); diff --git a/mm/nommu.c b/mm/nommu.c index 98318dcff742..9c4a7b63a4df 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) kenter("%p", vma); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) { + if (vma->vm_file) fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); - } put_nommu_region(vma->vm_region); kmem_cache_free(vm_area_cachep, vma); } @@ -1284,10 +1281,6 @@ unsigned long do_mmap_pgoff(struct file *file, if (file) { region->vm_file = get_file(file); vma->vm_file = get_file(file); - if (vm_flags & VM_EXECUTABLE) { - added_exe_file_vma(current->mm); - vma->vm_mm = current->mm; - } } down_write(&nommu_region_sem); @@ -1440,8 +1433,6 @@ error: kmem_cache_free(vm_region_jar, region); if (vma->vm_file) fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); kmem_cache_free(vm_area_cachep, vma); kleave(" = %d", ret); return ret; From 0103bd16fb90bc741c7a03fd1ea4e8a505abad23 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:59 -0700 Subject: [PATCH 016/160] mm: prepare VM_DONTDUMP for using in drivers Rename VM_NODUMP into VM_DONTDUMP: this name matches other negative flags: VM_DONTEXPAND, VM_DONTCOPY. Currently this flag used only for sys_madvise. The next patch will use it for replacing the outdated flag VM_RESERVED. Also forbid madvise(MADV_DODUMP) for special kernel mappings VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. 
Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 2 +- include/linux/mm.h | 2 +- mm/madvise.c | 8 ++++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 28a64e769527..2b72d26e2e4b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1123,7 +1123,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, if (always_dump_vma(vma)) goto whole; - if (vma->vm_flags & VM_NODUMP) + if (vma->vm_flags & VM_DONTDUMP) return 0; /* Hugetlb memory check */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 25ef49c1f2bd..dc08d558e058 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -102,7 +102,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_NODUMP 0x04000000 /* Do not include in the core dump */ +#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ diff --git a/mm/madvise.c b/mm/madvise.c index 14d260fa0d17..03dfa5c7adb3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma, new_flags &= ~VM_DONTCOPY; break; case MADV_DONTDUMP: - new_flags |= VM_NODUMP; + new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: - new_flags &= ~VM_NODUMP; + if (new_flags & VM_SPECIAL) { + error = -EINVAL; + goto out; + } + new_flags &= ~VM_DONTDUMP; break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: From 314e51b9851b4f4e8ab302243ff5a6fc6147f379 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:29:02 -0700 Subject: [PATCH 017/160] mm: kill vma flag VM_RESERVED and mm->reserved_vm counter A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA, currently it lost original meaning but still has some effects: | effect | alternative flags -+------------------------+--------------------------------------------- 1| account as reserved_vm | VM_IO 2| skip in core dump | VM_IO, VM_DONTDUMP 3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP 4| do not mlock | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP This patch removes reserved_vm counter from mm_struct. Seems like nobody cares about it, it does not exported into userspace directly, it only reduces total_vm showed in proc. Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP. remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP. remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP. [akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup] Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. 
Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/unevictable-lru.txt | 4 ++-- arch/alpha/kernel/pci-sysfs.c | 2 +- arch/ia64/kernel/perfmon.c | 2 +- arch/ia64/mm/init.c | 3 ++- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/sparc/kernel/pci.c | 2 +- arch/unicore32/kernel/process.c | 2 +- arch/x86/xen/mmu.c | 3 +-- drivers/char/mbcs.c | 2 +- drivers/char/mem.c | 2 +- drivers/char/mspec.c | 2 +- drivers/gpu/drm/drm_gem.c | 2 +- drivers/gpu/drm/drm_vm.c | 10 ++-------- drivers/gpu/drm/exynos/exynos_drm_gem.c | 2 +- drivers/gpu/drm/gma500/framebuffer.c | 3 +-- drivers/gpu/drm/ttm/ttm_bo_vm.c | 4 ++-- drivers/gpu/drm/udl/udl_fb.c | 2 +- drivers/infiniband/hw/ehca/ehca_uverbs.c | 4 ++-- drivers/infiniband/hw/ipath/ipath_file_ops.c | 2 +- drivers/infiniband/hw/qib/qib_file_ops.c | 2 +- drivers/media/pci/meye/meye.c | 2 +- drivers/media/platform/omap/omap_vout.c | 2 +- drivers/media/platform/vino.c | 2 +- drivers/media/usb/sn9c102/sn9c102_core.c | 3 +-- drivers/media/usb/usbvision/usbvision-video.c | 3 +-- drivers/media/v4l2-core/videobuf-dma-sg.c | 2 +- drivers/media/v4l2-core/videobuf-vmalloc.c | 2 +- drivers/media/v4l2-core/videobuf2-memops.c | 2 +- drivers/misc/carma/carma-fpga.c | 2 -- drivers/misc/sgi-gru/grufile.c | 5 ++--- drivers/mtd/mtdchar.c | 2 +- drivers/scsi/sg.c | 2 +- drivers/staging/omapdrm/omap_gem_dmabuf.c | 2 +- drivers/staging/tidspbridge/rmgr/drv_interface.c | 2 +- drivers/uio/uio.c | 4 +--- drivers/usb/mon/mon_bin.c | 2 +- drivers/video/68328fb.c | 2 +- drivers/video/aty/atyfb_base.c | 3 +-- drivers/video/fb-puv3.c | 3 +-- drivers/video/fb_defio.c | 2 +- drivers/video/fbmem.c | 3 +-- drivers/video/gbefb.c | 2 +- drivers/video/omap2/omapfb/omapfb-main.c | 2 +- drivers/video/sbuslib.c | 5 ++--- drivers/video/smscufx.c | 1 - drivers/video/udlfb.c | 1 - drivers/video/vermilion/vermilion.c | 1 - drivers/video/vfb.c | 1 - drivers/xen/gntalloc.c | 2 +- drivers/xen/gntdev.c | 2 +- drivers/xen/privcmd.c | 3 ++- fs/binfmt_elf.c | 2 +- fs/binfmt_elf_fdpic.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/proc/task_mmu.c | 2 +- include/linux/mempolicy.h | 2 +- include/linux/mm.h | 3 +-- include/linux/mm_types.h | 1 - kernel/events/core.c | 2 +- mm/ksm.c | 3 +-- mm/memory.c | 11 +++++------ mm/mlock.c | 2 +- mm/mmap.c | 2 -- mm/nommu.c | 2 +- mm/vmalloc.c | 3 +-- security/selinux/selinuxfs.c | 2 +- sound/core/pcm_native.c | 6 +++--- sound/usb/usx2y/us122l.c | 2 +- sound/usb/usx2y/usX2Yhwdep.c | 2 +- sound/usb/usx2y/usx2yhwdeppcm.c | 2 +- 70 files changed, 77 insertions(+), 105 deletions(-) diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index fa206cccf89f..323ff5dba1cc 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt @@ -371,8 +371,8 @@ mlock_fixup() filters several classes of "special" VMAs: mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to allocate the huge pages and populate the ptes. -3) VMAs with VM_DONTEXPAND or VM_RESERVED are generally userspace mappings of - kernel pages, such as the VDSO page, relay channel pages, etc. These pages +3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages, + such as the VDSO page, relay channel pages, etc. 
These pages are inherently unevictable and are not managed on the LRU lists. mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls make_pages_present() to populate the ptes. diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c index 53649c7d0068..b51f7b4818cd 100644 --- a/arch/alpha/kernel/pci-sysfs.c +++ b/arch/alpha/kernel/pci-sysfs.c @@ -26,7 +26,7 @@ static int hose_mmap_page_range(struct pci_controller *hose, base = sparse ? hose->sparse_io_base : hose->dense_io_base; vma->vm_pgoff += base >> PAGE_SHIFT; - vma->vm_flags |= (VM_IO | VM_RESERVED); + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, vma->vm_end - vma->vm_start, diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index f388b4e18a37..ea39eba61ef5 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2307,7 +2307,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t */ vma->vm_mm = mm; vma->vm_file = get_file(filp); - vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED; + vma->vm_flags = VM_READ|VM_MAYREAD|VM_DONTEXPAND|VM_DONTDUMP; vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ /* diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 0eab454867a2..082e383c1b6f 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -138,7 +138,8 @@ ia64_init_addr_space (void) vma->vm_mm = current->mm; vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); - vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED; + vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | + VM_DONTEXPAND | VM_DONTDUMP; down_write(¤t->mm->mmap_sem); if (insert_vm_struct(current->mm, vma)) { up_write(¤t->mm->mmap_sem); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 83e929e66f9d..721d4603a235 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1183,7 +1183,7 @@ static const struct vm_operations_struct kvm_rma_vm_ops = { static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) { - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &kvm_rma_vm_ops; return 0; } diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index acc8c838ff72..75b31bcdeadf 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -779,7 +779,7 @@ static int __pci_mmap_make_offset(struct pci_dev *pdev, static void __pci_mmap_set_flags(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state) { - vma->vm_flags |= (VM_IO | VM_RESERVED); + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; } /* Set vm_page_prot of VMA, as appropriate for this architecture, for a pci diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index b6f0458c3143..b008586dad75 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -380,7 +380,7 @@ int vectors_user_mapping(void) return install_special_mapping(mm, 0xffff0000, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | - VM_RESERVED, + VM_DONTEXPAND | VM_DONTDUMP, NULL); } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5a16824cc2b3..fd28d86fe3d2 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2451,8 +2451,7 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); - BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) == - 
(VM_PFNMAP | VM_RESERVED | VM_IO))); + BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); rmd.mfn = mfn; rmd.prot = prot; diff --git a/drivers/char/mbcs.c b/drivers/char/mbcs.c index 0c7d340b9ab9..f74e892711dd 100644 --- a/drivers/char/mbcs.c +++ b/drivers/char/mbcs.c @@ -507,7 +507,7 @@ static int mbcs_gscr_mmap(struct file *fp, struct vm_area_struct *vma) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */ + /* Remap-pfn-range will mark the range VM_IO */ if (remap_pfn_range(vma, vma->vm_start, __pa(soft->gscr_addr) >> PAGE_SHIFT, diff --git a/drivers/char/mem.c b/drivers/char/mem.c index e5eedfa24c91..0537903c985b 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -322,7 +322,7 @@ static int mmap_mem(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &mmap_mem_ops; - /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */ + /* Remap-pfn-range will mark the range VM_IO */ if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index 845f97fd1832..e1f60f968fdd 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -286,7 +286,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, atomic_set(&vdata->refcnt, 1); vma->vm_private_data = vdata; - vma->vm_flags |= (VM_IO | VM_RESERVED | VM_PFNMAP | VM_DONTEXPAND); + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; if (vdata->type == MSPEC_FETCHOP || vdata->type == MSPEC_UNCACHED) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &mspec_vm_ops; diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 92177d5aedee..24efae464e2c 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -706,7 +706,7 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma) goto out_unlock; } - vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = obj->dev->driver->gem_vm_ops; vma->vm_private_data = map->handle; vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index 23a824e6a22a..db7bd292410b 100644 --- a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c @@ -514,8 +514,7 @@ static int drm_mmap_dma(struct file *filp, struct vm_area_struct *vma) vma->vm_ops = &drm_vm_dma_ops; - vma->vm_flags |= VM_RESERVED; /* Don't swap */ - vma->vm_flags |= VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; drm_vm_open_locked(dev, vma); return 0; @@ -643,21 +642,16 @@ int drm_mmap_locked(struct file *filp, struct vm_area_struct *vma) case _DRM_SHM: vma->vm_ops = &drm_vm_shm_ops; vma->vm_private_data = (void *)map; - /* Don't let this area swap. Change when - DRM_KERNEL advisory is supported. */ - vma->vm_flags |= VM_RESERVED; break; case _DRM_SCATTER_GATHER: vma->vm_ops = &drm_vm_sg_ops; vma->vm_private_data = (void *)map; - vma->vm_flags |= VM_RESERVED; vma->vm_page_prot = drm_dma_prot(map->type, vma); break; default: return -EINVAL; /* This should never happen. 
*/ } - vma->vm_flags |= VM_RESERVED; /* Don't swap */ - vma->vm_flags |= VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; drm_vm_open_locked(dev, vma); return 0; diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index fcdbe46914f7..d2545560664f 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -500,7 +500,7 @@ static int exynos_drm_gem_mmap_buffer(struct file *filp, DRM_DEBUG_KMS("%s\n", __FILE__); - vma->vm_flags |= (VM_IO | VM_RESERVED); + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; update_vm_cache_attr(exynos_gem_obj, vma); diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index 884ba73ac6ce..afded54dbb10 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c @@ -178,8 +178,7 @@ static int psbfb_mmap(struct fb_info *info, struct vm_area_struct *vma) */ vma->vm_ops = &psbfb_vm_ops; vma->vm_private_data = (void *)psbfb; - vma->vm_flags |= VM_RESERVED | VM_IO | - VM_MIXEDMAP | VM_DONTEXPAND; + vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP; return 0; } diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index a877813571a4..3ba72dbdc4bd 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -285,7 +285,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma, */ vma->vm_private_data = bo; - vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; + vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP; return 0; out_unref: ttm_bo_unref(&bo); @@ -300,7 +300,7 @@ int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo) vma->vm_ops = &ttm_bo_vm_ops; vma->vm_private_data = ttm_bo_reference(bo); - vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; + vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; return 0; } EXPORT_SYMBOL(ttm_fbdev_mmap); diff --git a/drivers/gpu/drm/udl/udl_fb.c b/drivers/gpu/drm/udl/udl_fb.c index 67df842fbb33..69a2b16f42a6 100644 --- a/drivers/gpu/drm/udl/udl_fb.c +++ b/drivers/gpu/drm/udl/udl_fb.c @@ -243,7 +243,7 @@ static int udl_fb_mmap(struct fb_info *info, struct vm_area_struct *vma) size = 0; } - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ return 0; } diff --git a/drivers/infiniband/hw/ehca/ehca_uverbs.c b/drivers/infiniband/hw/ehca/ehca_uverbs.c index 45ee89b65c23..1a1d5d99fcf9 100644 --- a/drivers/infiniband/hw/ehca/ehca_uverbs.c +++ b/drivers/infiniband/hw/ehca/ehca_uverbs.c @@ -117,7 +117,7 @@ static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas, physical = galpas->user.fw_handle; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical); - /* VM_IO | VM_RESERVED are set by remap_pfn_range() */ + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT, vma->vm_page_prot); if (unlikely(ret)) { @@ -139,7 +139,7 @@ static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue, u64 start, ofs; struct page *page; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; start = vma->vm_start; for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) { u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs); diff --git 
a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 736d9edbdbe7..3eb7e454849b 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -1225,7 +1225,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; vma->vm_ops = &ipath_file_vm_ops; - vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; ret = 1; bail: diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index faa44cb08071..959a5c4ff812 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -971,7 +971,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; vma->vm_ops = &qib_file_vm_ops; - vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; ret = 1; bail: diff --git a/drivers/media/pci/meye/meye.c b/drivers/media/pci/meye/meye.c index 7bc775219f97..e5a76da86081 100644 --- a/drivers/media/pci/meye/meye.c +++ b/drivers/media/pci/meye/meye.c @@ -1647,7 +1647,7 @@ static int meye_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &meye_vm_ops; vma->vm_flags &= ~VM_IO; /* not I/O memory */ - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = (void *) (offset / gbufsize); meye_vm_open(vma); diff --git a/drivers/media/platform/omap/omap_vout.c b/drivers/media/platform/omap/omap_vout.c index 66ac21d466af..134016f0e660 100644 --- a/drivers/media/platform/omap/omap_vout.c +++ b/drivers/media/platform/omap/omap_vout.c @@ -911,7 +911,7 @@ static int omap_vout_mmap(struct file *file, struct vm_area_struct *vma) q->bufs[i]->baddr = vma->vm_start; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); vma->vm_ops = &omap_vout_vm_ops; vma->vm_private_data = (void *) vout; diff --git a/drivers/media/platform/vino.c b/drivers/media/platform/vino.c index 790d96cffeea..70b0bf4b2900 100644 --- a/drivers/media/platform/vino.c +++ b/drivers/media/platform/vino.c @@ -3950,7 +3950,7 @@ found: fb->map_count = 1; - vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_flags &= ~VM_IO; vma->vm_private_data = fb; vma->vm_file = file; diff --git a/drivers/media/usb/sn9c102/sn9c102_core.c b/drivers/media/usb/sn9c102/sn9c102_core.c index 19ea780b16ff..5bfc8e2f018f 100644 --- a/drivers/media/usb/sn9c102/sn9c102_core.c +++ b/drivers/media/usb/sn9c102/sn9c102_core.c @@ -2126,8 +2126,7 @@ static int sn9c102_mmap(struct file* filp, struct vm_area_struct *vma) return -EINVAL; } - vma->vm_flags |= VM_IO; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; pos = cam->frame[i].bufmem; while (size > 0) { /* size is page-aligned */ diff --git a/drivers/media/usb/usbvision/usbvision-video.c b/drivers/media/usb/usbvision/usbvision-video.c index f67018ed3795..5c36a57e6590 100644 --- a/drivers/media/usb/usbvision/usbvision-video.c +++ b/drivers/media/usb/usbvision/usbvision-video.c @@ -1108,8 +1108,7 @@ static int usbvision_mmap(struct file *file, struct vm_area_struct *vma) } /* VM_IO is eventually going to replace PageReserved altogether */ - vma->vm_flags |= VM_IO; - vma->vm_flags |= VM_RESERVED; /* avoid to swap 
out this VMA */ + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; pos = usbvision->frame[i].data; while (size > 0) { diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index f300deafd268..828e7c10bd70 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -582,7 +582,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, map->count = 1; map->q = q; vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_flags &= ~VM_IO; /* using shared anonymous pages */ vma->vm_private_data = map; dprintk(1, "mmap %p: q=%p %08lx-%08lx pgoff %08lx bufs %d-%d\n", diff --git a/drivers/media/v4l2-core/videobuf-vmalloc.c b/drivers/media/v4l2-core/videobuf-vmalloc.c index df142580e44c..2ff7fcc77b11 100644 --- a/drivers/media/v4l2-core/videobuf-vmalloc.c +++ b/drivers/media/v4l2-core/videobuf-vmalloc.c @@ -270,7 +270,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, } vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = map; dprintk(1, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n", diff --git a/drivers/media/v4l2-core/videobuf2-memops.c b/drivers/media/v4l2-core/videobuf2-memops.c index 504cd4cbe29e..051ea3571b20 100644 --- a/drivers/media/v4l2-core/videobuf2-memops.c +++ b/drivers/media/v4l2-core/videobuf2-memops.c @@ -163,7 +163,7 @@ int vb2_mmap_pfn_range(struct vm_area_struct *vma, unsigned long paddr, return ret; } - vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = priv; vma->vm_ops = vm_ops; diff --git a/drivers/misc/carma/carma-fpga.c b/drivers/misc/carma/carma-fpga.c index 0c43297ed9ac..8835eabb3b87 100644 --- a/drivers/misc/carma/carma-fpga.c +++ b/drivers/misc/carma/carma-fpga.c @@ -1243,8 +1243,6 @@ static int data_mmap(struct file *filp, struct vm_area_struct *vma) return -EINVAL; } - /* IO memory (stop cacheing) */ - vma->vm_flags |= VM_IO | VM_RESERVED; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); return io_remap_pfn_range(vma, vma->vm_start, addr, vsize, diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index ecafa4ba238b..492c8cac69ac 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -108,9 +108,8 @@ static int gru_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_end & (GRU_GSEG_PAGESIZE - 1)) return -EINVAL; - vma->vm_flags |= - (VM_IO | VM_DONTCOPY | VM_LOCKED | VM_DONTEXPAND | VM_PFNMAP | - VM_RESERVED); + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_LOCKED | + VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_page_prot = PAGE_SHARED; vma->vm_ops = &gru_vm_ops; diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index a6e74514e662..73ae81a629f2 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -1182,7 +1182,7 @@ static int mtdchar_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; if (set_vm_offset(vma, off) < 0) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED; + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; #ifdef pgprot_noncached if (file->f_flags & O_DSYNC || off >= __pa(high_memory)) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 9c5c5f2b3962..be2c9a6561ff 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1257,7 +1257,7 @@ sg_mmap(struct file *filp, struct 
vm_area_struct *vma) } sfp->mmap_called = 1; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = sfp; vma->vm_ops = &sg_mmap_vm_ops; return 0; diff --git a/drivers/staging/omapdrm/omap_gem_dmabuf.c b/drivers/staging/omapdrm/omap_gem_dmabuf.c index 42728e0cc194..c6f3ef6f57b9 100644 --- a/drivers/staging/omapdrm/omap_gem_dmabuf.c +++ b/drivers/staging/omapdrm/omap_gem_dmabuf.c @@ -160,7 +160,7 @@ static int omap_gem_dmabuf_mmap(struct dma_buf *buffer, goto out_unlock; } - vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = obj->dev->driver->gem_vm_ops; vma->vm_private_data = obj; vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); diff --git a/drivers/staging/tidspbridge/rmgr/drv_interface.c b/drivers/staging/tidspbridge/rmgr/drv_interface.c index bddea1d3b2c3..701a11ac676d 100644 --- a/drivers/staging/tidspbridge/rmgr/drv_interface.c +++ b/drivers/staging/tidspbridge/rmgr/drv_interface.c @@ -261,7 +261,7 @@ static int bridge_mmap(struct file *filp, struct vm_area_struct *vma) { u32 status; - vma->vm_flags |= VM_RESERVED | VM_IO; + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); dev_dbg(bridge, "%s: vm filp %p start %lx end %lx page_prot %ulx " diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index a783d533a1a6..5110f367f1f1 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -653,8 +653,6 @@ static int uio_mmap_physical(struct vm_area_struct *vma) if (mi < 0) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); return remap_pfn_range(vma, @@ -666,7 +664,7 @@ static int uio_mmap_physical(struct vm_area_struct *vma) static int uio_mmap_logical(struct vm_area_struct *vma) { - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &uio_vm_ops; uio_vma_open(vma); return 0; diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index 91cd85076a44..9a62e89d6dc0 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -1247,7 +1247,7 @@ static int mon_bin_mmap(struct file *filp, struct vm_area_struct *vma) { /* don't do anything here: "fault" will set up page table entries */ vma->vm_ops = &mon_bin_vm_ops; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = filp->private_data; mon_bin_vma_open(vma); return 0; diff --git a/drivers/video/68328fb.c b/drivers/video/68328fb.c index a425d65d5ba2..fa44fbed397d 100644 --- a/drivers/video/68328fb.c +++ b/drivers/video/68328fb.c @@ -400,7 +400,7 @@ static int mc68x328fb_mmap(struct fb_info *info, struct vm_area_struct *vma) #ifndef MMU /* this is uClinux (no MMU) specific code */ - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_start = videomemory; return 0; diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c index 3f2e8c13f1ca..868932f904ef 100644 --- a/drivers/video/aty/atyfb_base.c +++ b/drivers/video/aty/atyfb_base.c @@ -1942,8 +1942,7 @@ static int atyfb_mmap(struct fb_info *info, struct vm_area_struct *vma) off = vma->vm_pgoff << PAGE_SHIFT; size = vma->vm_end - vma->vm_start; - /* To stop the swapper from even considering these pages. 
*/ - vma->vm_flags |= (VM_IO | VM_RESERVED); + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ if (((vma->vm_pgoff == 0) && (size == info->fix.smem_len)) || ((off == info->fix.smem_len) && (size == PAGE_SIZE))) diff --git a/drivers/video/fb-puv3.c b/drivers/video/fb-puv3.c index 60a787fa32cf..7d106f1f4906 100644 --- a/drivers/video/fb-puv3.c +++ b/drivers/video/fb-puv3.c @@ -653,9 +653,8 @@ int unifb_mmap(struct fb_info *info, vma->vm_page_prot)) return -EAGAIN; - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ return 0; - } static struct fb_ops unifb_ops = { diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c index 64cda560c488..88cad6b8b479 100644 --- a/drivers/video/fb_defio.c +++ b/drivers/video/fb_defio.c @@ -166,7 +166,7 @@ static const struct address_space_operations fb_deferred_io_aops = { static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) { vma->vm_ops = &fb_deferred_io_vm_ops; - vma->vm_flags |= ( VM_RESERVED | VM_DONTEXPAND ); + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; if (!(info->flags & FBINFO_VIRTFB)) vma->vm_flags |= VM_IO; vma->vm_private_data = info; diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index 0dff12a1daef..3ff0105a496a 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -1410,8 +1410,7 @@ fb_mmap(struct file *file, struct vm_area_struct * vma) return -EINVAL; off += start; vma->vm_pgoff = off >> PAGE_SHIFT; - /* This is an IO map - tell maydump to skip this VMA */ - vma->vm_flags |= VM_IO | VM_RESERVED; + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by io_remap_pfn_range()*/ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); fb_pgprotect(file, vma, off); if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT, diff --git a/drivers/video/gbefb.c b/drivers/video/gbefb.c index 7e7b7a9ba274..05e2a8a99d8f 100644 --- a/drivers/video/gbefb.c +++ b/drivers/video/gbefb.c @@ -1024,7 +1024,7 @@ static int gbefb_mmap(struct fb_info *info, pgprot_val(vma->vm_page_prot) = pgprot_fb(pgprot_val(vma->vm_page_prot)); - vma->vm_flags |= VM_IO | VM_RESERVED; + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ /* look for the starting tile */ tile = &gbe_tiles.cpu[offset >> TILE_SHIFT]; diff --git a/drivers/video/omap2/omapfb/omapfb-main.c b/drivers/video/omap2/omapfb/omapfb-main.c index 3c39aa8de928..15373f4aee19 100644 --- a/drivers/video/omap2/omapfb/omapfb-main.c +++ b/drivers/video/omap2/omapfb/omapfb-main.c @@ -1128,7 +1128,7 @@ static int omapfb_mmap(struct fb_info *fbi, struct vm_area_struct *vma) DBG("user mmap region start %lx, len %d, off %lx\n", start, len, off); vma->vm_pgoff = off >> PAGE_SHIFT; - vma->vm_flags |= VM_IO | VM_RESERVED; + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); vma->vm_ops = &mmap_user_ops; vma->vm_private_data = rg; diff --git a/drivers/video/sbuslib.c b/drivers/video/sbuslib.c index 3c1de981a18c..296afae442f4 100644 --- a/drivers/video/sbuslib.c +++ b/drivers/video/sbuslib.c @@ -57,9 +57,8 @@ int sbusfb_mmap_helper(struct sbus_mmap_map *map, off = vma->vm_pgoff << PAGE_SHIFT; - /* To stop the swapper from even considering these pages */ - vma->vm_flags |= (VM_IO | VM_RESERVED); - + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); /* Each page, see which map applies */ diff 
--git a/drivers/video/smscufx.c b/drivers/video/smscufx.c index 5533a32c6ca1..97bd6620c364 100644 --- a/drivers/video/smscufx.c +++ b/drivers/video/smscufx.c @@ -803,7 +803,6 @@ static int ufx_ops_mmap(struct fb_info *info, struct vm_area_struct *vma) size = 0; } - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ return 0; } diff --git a/drivers/video/udlfb.c b/drivers/video/udlfb.c index 8af64148294b..f45eba3d6150 100644 --- a/drivers/video/udlfb.c +++ b/drivers/video/udlfb.c @@ -345,7 +345,6 @@ static int dlfb_ops_mmap(struct fb_info *info, struct vm_area_struct *vma) size = 0; } - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ return 0; } diff --git a/drivers/video/vermilion/vermilion.c b/drivers/video/vermilion/vermilion.c index 970e43d13f52..89aef343e295 100644 --- a/drivers/video/vermilion/vermilion.c +++ b/drivers/video/vermilion/vermilion.c @@ -1018,7 +1018,6 @@ static int vmlfb_mmap(struct fb_info *info, struct vm_area_struct *vma) offset += vinfo->vram_start; pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; pgprot_val(vma->vm_page_prot) &= ~_PAGE_PWT; - vma->vm_flags |= VM_RESERVED | VM_IO; if (remap_pfn_range(vma, vma->vm_start, offset >> PAGE_SHIFT, size, vma->vm_page_prot)) return -EAGAIN; diff --git a/drivers/video/vfb.c b/drivers/video/vfb.c index 501a922aa9dc..c7f692525b88 100644 --- a/drivers/video/vfb.c +++ b/drivers/video/vfb.c @@ -439,7 +439,6 @@ static int vfb_mmap(struct fb_info *info, size = 0; } - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ return 0; } diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 934985d14c24..4097987b330e 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -535,7 +535,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_private_data = vm_priv; - vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &gntalloc_vmops; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 5df9fd847b2e..610bfc6be177 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -720,7 +720,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_ops = &gntdev_vmops; - vma->vm_flags |= VM_RESERVED|VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; if (use_ptemod) vma->vm_flags |= VM_DONTCOPY; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index ef6389580b8c..8adb9cc267f9 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -455,7 +455,8 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) { /* DONTCOPY is essential for Xen because copy_page_range doesn't know * how to recreate these mappings */ - vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY | + VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &privcmd_vm_ops; vma->vm_private_data = NULL; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2b72d26e2e4b..e800dec958c3 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1135,7 +1135,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, } /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & (VM_IO | VM_RESERVED)) + if (vma->vm_flags & VM_IO) return 0; /* By default, dump shared memory if mapped from an anonymous file. 
*/ diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 08d812b32282..262db114ff01 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1205,7 +1205,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) int dump_ok; /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & (VM_IO | VM_RESERVED)) { + if (vma->vm_flags & VM_IO) { kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); return 0; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9460120a5170..0a0ab8e21b19 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap_pgoff unwinds (may be important on powerpc * and ia64). */ - vma->vm_flags |= VM_HUGETLB | VM_RESERVED; + vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &hugetlb_vm_ops; if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4540b8f76f16..79827ce03e3b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -54,7 +54,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPTE:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), - (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + total_vm << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 95b738c7abff..ba7a0ff19d39 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -239,7 +239,7 @@ extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, /* Check if a vma is migratable */ static inline int vma_migratable(struct vm_area_struct *vma) { - if (vma->vm_flags & (VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED)) + if (vma->vm_flags & (VM_IO | VM_HUGETLB | VM_PFNMAP)) return 0; /* * Migration allocates pages in the highest zone. If we cannot diff --git a/include/linux/mm.h b/include/linux/mm.h index dc08d558e058..0514fe9d3c84 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -96,7 +96,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_RESERVED 0x00080000 /* Count as reserved_vm like IO */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ @@ -148,7 +147,7 @@ extern unsigned int kobjsize(const void *objp); * Special vmas that are non-mergable, non-mlock()able. * Note: mm/huge_memory.c VM_NO_THP depends on this definition. 
*/ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP) /* * mapping from the currently active vm_flags protection bits (the diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 58d3173eb365..a57a43f5ca7c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -349,7 +349,6 @@ struct mm_struct { unsigned long shared_vm; /* Shared pages (files) */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ unsigned long stack_vm; /* VM_GROWSUP/DOWN */ - unsigned long reserved_vm; /* VM_RESERVED|VM_IO pages */ unsigned long def_flags; unsigned long nr_ptes; /* Page table pages */ unsigned long start_code, end_code, start_data, end_data; diff --git a/kernel/events/core.c b/kernel/events/core.c index f16f3c58f11a..cda3ebd49e86 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3671,7 +3671,7 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; return ret; diff --git a/mm/ksm.c b/mm/ksm.c index f9ccb16559ee..9638620a7530 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1469,8 +1469,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | - VM_NONLINEAR | VM_MIXEDMAP)) + VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ #ifdef VM_SAO diff --git a/mm/memory.c b/mm/memory.c index 7b1e4feaec06..e09c04813186 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2297,14 +2297,13 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). - * VM_RESERVED is specified all over the place, because - * in 2.4 it kept swapout's vma scan off this vma; but - * in 2.6 the LRU scan won't even find its pages, so this - * flag means no more than count its pages in reserved_vm, - * and omit it from core dump, even when VM_IO turned off. * VM_PFNMAP tells the core MM that the base pages are just * raw PFN mappings, and do not have a "struct page" associated * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). + * VM_DONTDUMP + * Omit vma from core dump, even when VM_IO turned off. * * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. 
We mark the "original" @@ -2321,7 +2320,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (err) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; diff --git a/mm/mlock.c b/mm/mlock.c index ef726e8aa8e9..a948be4b7ba7 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & (VM_IO | VM_PFNMAP)) goto no_mlock; - if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + if (!((vma->vm_flags & VM_DONTEXPAND) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))) { diff --git a/mm/mmap.c b/mm/mmap.c index c1ad2e78ea58..a76042dc806d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -945,8 +945,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, mm->exec_vm += pages; } else if (flags & stack_flags) mm->stack_vm += pages; - if (flags & (VM_RESERVED|VM_IO)) - mm->reserved_vm += pages; } #endif /* CONFIG_PROC_FS */ diff --git a/mm/nommu.c b/mm/nommu.c index 9c4a7b63a4df..12e84e69dd06 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1811,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (addr != (pfn << PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; return 0; } EXPORT_SYMBOL(remap_pfn_range); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2bb90b1d241c..8de704679bfc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, usize -= PAGE_SIZE; } while (usize > 0); - /* Prevent "things" like memory migration? VM_flags need a cleanup... */ - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 55af8c5b57e6..3a6e8731646c 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -485,7 +485,7 @@ static int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma) return -EACCES; } - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &sel_mmap_policy_ops; return 0; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 20554eff5a21..5e12e5bacbba 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3039,7 +3039,7 @@ static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_status; area->vm_private_data = substream; - area->vm_flags |= VM_RESERVED; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } @@ -3076,7 +3076,7 @@ static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_control; area->vm_private_data = substream; - area->vm_flags |= VM_RESERVED; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } #else /* ! 
coherent mmap */ @@ -3170,7 +3170,7 @@ static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = { int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream, struct vm_area_struct *area) { - area->vm_flags |= VM_RESERVED; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; #ifdef ARCH_HAS_DMA_MMAP_COHERENT if (!substream->ops->page && substream->dma_buffer.dev.type == SNDRV_DMA_TYPE_DEV) diff --git a/sound/usb/usx2y/us122l.c b/sound/usb/usx2y/us122l.c index c4fd3b1d9592..d0323a693ba2 100644 --- a/sound/usb/usx2y/us122l.c +++ b/sound/usb/usx2y/us122l.c @@ -262,7 +262,7 @@ static int usb_stream_hwdep_mmap(struct snd_hwdep *hw, } area->vm_ops = &usb_stream_hwdep_vm_ops; - area->vm_flags |= VM_RESERVED; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; area->vm_private_data = us122l; atomic_inc(&us122l->mmap_count); out: diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c index 04aafb43a13c..0b34dbc8f302 100644 --- a/sound/usb/usx2y/usX2Yhwdep.c +++ b/sound/usb/usx2y/usX2Yhwdep.c @@ -82,7 +82,7 @@ static int snd_us428ctls_mmap(struct snd_hwdep * hw, struct file *filp, struct v us428->us428ctls_sharedmem->CtlSnapShotLast = -2; } area->vm_ops = &us428ctls_vm_ops; - area->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; area->vm_private_data = hw->private_data; return 0; } diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c index 8e40b6e67e9e..cc56007791e0 100644 --- a/sound/usb/usx2y/usx2yhwdeppcm.c +++ b/sound/usb/usx2y/usx2yhwdeppcm.c @@ -723,7 +723,7 @@ static int snd_usX2Y_hwdep_pcm_mmap(struct snd_hwdep * hw, struct file *filp, st return -ENODEV; } area->vm_ops = &snd_usX2Y_hwdep_pcm_vm_ops; - area->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; area->vm_private_data = hw->private_data; return 0; } From d741c9cdeee6a569dae0dbbaf028065402955b59 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Mon, 8 Oct 2012 16:29:05 -0700 Subject: [PATCH 018/160] mm: fix nonuniform page status when writing new file with small buffer When writing a new file with 2048 bytes buffer, such as write(fd, buffer, 2048), it will call generic_perform_write() twice for every page: write_begin mark_page_accessed(page) write_end write_begin mark_page_accessed(page) write_end Pages 1-13 will be added to lru-pvecs in write_begin() and will *NOT* be added to active_list even they have be accessed twice because they are not PageLRU(page). But when page 14th comes, all pages in lru-pvecs will be moved to inactive_list (by __lru_cache_add() ) in first write_begin(), now page 14th *is* PageLRU(page). And after second write_end() only page 14th will be in active_list. In Hadoop environment, we do comes to this situation: after writing a file, we find out that only 14th, 28th, 42th... page are in active_list and others in inactive_list. Now kswapd works, shrinks the inactive_list, the file only have 14th, 28th...pages in memory, the readahead request size will be broken to only 52k (13*4k), system's performance falls dramatically. This problem can also replay by below steps (the machine has 8G memory): 1. dd if=/dev/zero of=/test/file.out bs=1024 count=1048576 2. cat another 7.5G file to /dev/null 3. vmtouch -m 1G -v /test/file.out, it will show: /test/file.out [oooooooooooooooooooOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO] 187847/262144 the 'o' means same pages are in memory but same are not. 
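For illustration, a minimal user-space sketch of the write pattern described above (the file name and loop bound are assumptions chosen to mirror the dd reproduction step): each 4KB page is dirtied by two 2048-byte write() calls, so mark_page_accessed() sees it twice before the page ever reaches the LRU.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[2048];			/* half of a 4KB page */
	long i;
	int fd = open("/test/file.out", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	memset(buf, 0, sizeof(buf));
	/* 524288 x 2048 bytes = 1GB, matching "dd bs=1024 count=1048576" */
	for (i = 0; i < 524288; i++)
		if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf))
			break;
	close(fd);
	return 0;
}
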
The solution for this problem is simple: the 14th page should be added to lru_add_pvecs before mark_page_accessed() just as other pages. [akpm@linux-foundation.org: tweak comment] [akpm@linux-foundation.org: grab better comment from the v3 patch] Signed-off-by: Robin Dong Reviewed-by: Minchan Kim Cc: KOSAKI Motohiro Reviewed-by: Johannes Weiner Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 77825883298f..f76c76c7501b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); +/* + * Order of operations is important: flush the pagevec when it's already + * full, not when adding the last page, to make sure that last page is + * not added to the LRU directly when passed to this function. Because + * mark_page_accessed() (called after this when writing) only activates + * pages that are on the LRU, linear writes in subpage chunks would see + * every PAGEVEC_SIZE page activated, which is unexpected. + */ void __lru_cache_add(struct page *page, enum lru_list lru) { struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; page_cache_get(page); - if (!pagevec_add(pvec, page)) + if (!pagevec_space(pvec)) __pagevec_lru_add(pvec, lru); + pagevec_add(pvec, page); put_cpu_var(lru_add_pvecs); } EXPORT_SYMBOL(__lru_cache_add); From 6597d783397aebb793fb529474cce5089aa4c67f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:29:07 -0700 Subject: [PATCH 019/160] mm/mmap.c: replace find_vma_prepare() with clearer find_vma_links() People get confused by find_vma_prepare(), because it doesn't care about what it returns in its output args, when its callers won't be interested. Clarify by passing in end-of-range address too, and returning failure if any existing vma overlaps the new range: instead of returning an ambiguous vma which most callers then must check. find_vma_links() is a clearer name. This does revert 2.6.27's dfe195fb79e88 ("mm: fix uninitialized variables for find_vma_prepare callers"), but it looks like gcc 4.3.0 was one of those releases too eager to shout about uninitialized variables: only copy_vma() warns with 4.5.1 and 4.7.1, which a BUG on error silences. 
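For readers skimming the diff, a hedged sketch of the new calling convention (it mirrors the mmap_region()/do_brk() hunks below and adds nothing beyond them): instead of returning a vma that the caller must test for overlap, find_vma_links() returns non-zero when the requested range collides with an existing vma and otherwise fills in the rbtree link points.

	struct vm_area_struct *prev;
	struct rb_node **rb_link, *rb_parent;

munmap_back:
	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
		/* [addr, addr + len) overlaps an existing vma: unmap and retry */
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
		goto munmap_back;
	}
	/* success: prev, rb_link and rb_parent say where to link the new vma */
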
[hughd@google.com: fix warning, remove BUG()] Signed-off-by: Hugh Dickins Cc: Benny Halevy Acked-by: Hillf Danton Signed-off-by: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index a76042dc806d..2ba27434e843 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -353,17 +353,14 @@ void validate_mm(struct mm_struct *mm) #define validate_mm(mm) do { } while (0) #endif -static struct vm_area_struct * -find_vma_prepare(struct mm_struct *mm, unsigned long addr, - struct vm_area_struct **pprev, struct rb_node ***rb_link, - struct rb_node ** rb_parent) +static int find_vma_links(struct mm_struct *mm, unsigned long addr, + unsigned long end, struct vm_area_struct **pprev, + struct rb_node ***rb_link, struct rb_node **rb_parent) { - struct vm_area_struct * vma; - struct rb_node ** __rb_link, * __rb_parent, * rb_prev; + struct rb_node **__rb_link, *__rb_parent, *rb_prev; __rb_link = &mm->mm_rb.rb_node; rb_prev = __rb_parent = NULL; - vma = NULL; while (*__rb_link) { struct vm_area_struct *vma_tmp; @@ -372,9 +369,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; + /* Fail if an existing vma overlaps the area */ + if (vma_tmp->vm_start < end) + return -ENOMEM; __rb_link = &__rb_parent->rb_left; } else { rb_prev = __rb_parent; @@ -387,7 +384,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); *rb_link = __rb_link; *rb_parent = __rb_parent; - return vma; + return 0; } void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, @@ -456,11 +453,12 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, */ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *__vma, *prev; + struct vm_area_struct *prev; struct rb_node **rb_link, *rb_parent; - __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); - BUG_ON(__vma && __vma->vm_start < vma->vm_end); + if (find_vma_links(mm, vma->vm_start, vma->vm_end, + &prev, &rb_link, &rb_parent)) + BUG(); __vma_link(mm, vma, prev, rb_link, rb_parent); mm->map_count++; } @@ -1221,8 +1219,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Clear old maps */ error = -ENOMEM; munmap_back: - vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - if (vma && vma->vm_start < addr + len) { + if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { if (do_munmap(mm, addr, len)) return -ENOMEM; goto munmap_back; @@ -2183,8 +2180,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) * Clear old maps. this also does some error checking for us */ munmap_back: - vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - if (vma && vma->vm_start < addr + len) { + if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { if (do_munmap(mm, addr, len)) return -ENOMEM; goto munmap_back; @@ -2298,10 +2294,10 @@ void exit_mmap(struct mm_struct *mm) * and into the inode's i_mmap tree. If vm_file is non-NULL * then i_mmap_mutex is taken here. 
*/ -int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct * __vma, * prev; - struct rb_node ** rb_link, * rb_parent; + struct vm_area_struct *prev; + struct rb_node **rb_link, *rb_parent; /* * The vm_pgoff of a purely anonymous vma should be irrelevant @@ -2319,8 +2315,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) BUG_ON(vma->anon_vma); vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); - if (__vma && __vma->vm_start < vma->vm_end) + if (find_vma_links(mm, vma->vm_start, vma->vm_end, + &prev, &rb_link, &rb_parent)) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, vma_pages(vma))) @@ -2354,7 +2350,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } - find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) + return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); if (new_vma) { From 4ffb6335da87b51c17e7ff6495170785f21558dd Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:09 -0700 Subject: [PATCH 020/160] mm: compaction: update comment in try_to_compact_pages Allocation success rates have been far lower since 3.4 due to commit fe2c2a106663 ("vmscan: reclaim at order 0 when compaction is enabled"). This commit was introduced for good reasons and it was known in advance that the success rates would suffer but it was justified on the grounds that the high allocation success rates were achieved by aggressive reclaim. Success rates are expected to suffer even more in 3.6 due to commit 7db8889ab05b ("mm: have order > 0 compaction start off where it left") which testing has shown to severely reduce allocation success rates under load - to 0% in one case. This series aims to improve the allocation success rates without regressing the benefits of commit fe2c2a106663. The series is based on latest mmotm and takes into account the __GFP_NO_KSWAPD flag is going away. Patch 1 updates a stale comment seeing as I was in the general area. Patch 2 updates reclaim/compaction to reclaim pages scaled on the number of recent failures. Patch 3 captures suitable high-order pages freed by compaction to reduce races with parallel allocation requests. Patch 4 fixes the upstream commit [7db8889a: mm: have order > 0 compaction start off where it left] to enable compaction again Patch 5 identifies when compacion is taking too long due to contention and aborts. STRESS-HIGHALLOC 3.6-rc1-akpm full-series Pass 1 36.00 ( 0.00%) 51.00 (15.00%) Pass 2 42.00 ( 0.00%) 63.00 (21.00%) while Rested 86.00 ( 0.00%) 86.00 ( 0.00%) From http://www.csn.ul.ie/~mel/postings/mmtests-20120424/global-dhp__stress-highalloc-performance-ext3/hydra/comparison.html I know that the allocation success rates in 3.3.6 was 78% in comparison to 36% in in the current akpm tree. With the full series applied, the success rates are up to around 51% with some variability in the results. This is not as high a success rate but it does not reclaim excessively which is a key point. MMTests Statistics: vmstat Page Ins 3050912 3078892 Page Outs 8033528 8039096 Swap Ins 0 0 Swap Outs 0 0 Note that swap in/out rates remain at 0. 
In 3.3.6 with 78% success rates there were 71881 pages swapped out. Direct pages scanned 70942 122976 Kswapd pages scanned 1366300 1520122 Kswapd pages reclaimed 1366214 1484629 Direct pages reclaimed 70936 105716 Kswapd efficiency 99% 97% Kswapd velocity 1072.550 1182.615 Direct efficiency 99% 85% Direct velocity 55.690 95.672 The kswapd velocity changes very little as expected. kswapd velocity is around the 1000 pages/sec mark where as in kernel 3.3.6 with the high allocation success rates it was 8140 pages/second. Direct velocity is higher as a result of patch 2 of the series but this is expected and is acceptable. The direct reclaim and kswapd velocities change very little. If these get accepted for merging then there is a difficulty in how they should be handled. 7db8889a ("mm: have order > 0 compaction start off where it left") is broken but it is already in 3.6-rc1 and needs to be fixed. However, if just patch 4 from this series is applied then Jim Schutt's workload is known to break again as his workload also requires patch 5. While it would be preferred to have all these patches in 3.6 to improve compaction in general, it would at least be acceptable if just patches 4 and 5 were merged to 3.6 to fix a known problem without breaking compaction completely. On the face of it, that would force __GFP_NO_KSWAPD patches to be merged at the same time but I can do a version of this series with __GFP_NO_KSWAPD change reverted and then rebase it on top of this series. That might be best overall because I note that the __GFP_NO_KSWAPD patch should have removed deferred_compaction from page_alloc.c but it didn't but fixing that causes collisions with this series. This patch: The comment about order applied when the check was order > PAGE_ALLOC_COSTLY_ORDER which has not been the case since c5a73c3d ("thp: use compaction for all allocation orders"). Fixing the comment while I'm in the general area. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 7fcd3a52e68d..7168edc7592c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -869,11 +869,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zone *zone; int rc = COMPACT_SKIPPED; - /* - * Check whether it is worth even starting compaction. The order check is - * made because an assumption is made that the page allocator can satisfy - * the "cheaper" orders without taking special steps - */ + /* Check if the GFP flags allow compaction */ if (!order || !may_enter_fs || !may_perform_io) return rc; From 83fde0f22872aa8c1d46f775cc7bdfa864499e65 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:11 -0700 Subject: [PATCH 021/160] mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures If allocation fails after compaction then compaction may be deferred for a number of allocation attempts. If there are subsequent failures, compact_defer_shift is increased to defer for longer periods. This patch uses that information to scale the number of pages reclaimed with compact_defer_shift until allocations succeed again. The rationale is that reclaiming the normal number of pages still allowed compaction to fail and its success depends on the number of pages. If it's failing, reclaim more pages until it succeeds again. 
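As a concrete illustration (the numbers are chosen for this example, not taken from the patch): for a THP-sized request of order 9 on a 4KB-page system, the baseline reclaim target is (2UL << 9) = 1024 pages, about 4MB. If that zone has already deferred compaction twice so that compact_defer_shift is 2, the scaled target becomes 1024 << 2 = 4096 pages, about 16MB. As the hunk below shows, the scaling is only applied when zone->compact_order_failed <= sc->order, i.e. when compaction at this order is the thing that has been failing.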
Note that this is not implying that VM reclaim is not reclaiming enough pages or that its logic is broken. try_to_free_pages() always asks for SWAP_CLUSTER_MAX pages to be reclaimed regardless of order and that is what it does. Direct reclaim stops normally with this check. if (sc->nr_reclaimed >= sc->nr_to_reclaim) goto out; should_continue_reclaim delays when that check is made until a minimum number of pages for reclaim/compaction are reclaimed. It is possible that this patch could instead set nr_to_reclaim in try_to_free_pages() and drive it from there but that's behaves differently and not necessarily for the better. If driven from do_try_to_free_pages(), it is also possible that priorities will rise. When they reach DEF_PRIORITY-2, it will also start stalling and setting pages for immediate reclaim which is more disruptive than not desirable in this case. That is a more wide-reaching change that could cause another regression related to THP requests causing interactive jitter. [akpm@linux-foundation.org: fix build] Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 99b434b674c0..f6fdae7a8d4c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1729,6 +1729,28 @@ static bool in_reclaim_compaction(struct scan_control *sc) return false; } +#ifdef CONFIG_COMPACTION +/* + * If compaction is deferred for sc->order then scale the number of pages + * reclaimed based on the number of consecutive allocation failures + */ +static unsigned long scale_for_compaction(unsigned long pages_for_compaction, + struct lruvec *lruvec, struct scan_control *sc) +{ + struct zone *zone = lruvec_zone(lruvec); + + if (zone->compact_order_failed <= sc->order) + pages_for_compaction <<= zone->compact_defer_shift; + return pages_for_compaction; +} +#else +static unsigned long scale_for_compaction(unsigned long pages_for_compaction, + struct lruvec *lruvec, struct scan_control *sc) +{ + return pages_for_compaction; +} +#endif + /* * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns @@ -1776,6 +1798,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); + + pages_for_compaction = scale_for_compaction(pages_for_compaction, + lruvec, sc); inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); if (nr_swap_pages > 0) inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); From 1fb3f8ca0e9222535a39b884cb67a34628411b9f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:12 -0700 Subject: [PATCH 022/160] mm: compaction: capture a suitable high-order page immediately when it is made available While compaction is migrating pages to free up large contiguous blocks for allocation it races with other allocation requests that may steal these blocks or break them up. This patch alters direct compaction to capture a suitable free page as soon as it becomes available to reduce this race. It uses similar logic to split_free_page() to ensure that watermarks are still obeyed. 
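A hedged sketch of the caller-side contract this patch introduces (it condenses the page_alloc.c hunk below; no interfaces beyond those in the diff): the direct compactor passes the address of a page pointer, and a non-NULL result means compaction already isolated a suitably sized page that only needs prepping.

	struct page *page = NULL;

	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
					nodemask, sync_migration,
					contended_compaction, &page);
	if (page) {
		/* captured while the zone lock was held, so a parallel
		 * allocator could not steal or split the block */
		prep_new_page(page, order, gfp_mask);
		/* use 'page' instead of retrying get_page_from_freelist() */
	}
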
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 4 +- include/linux/mm.h | 1 + mm/compaction.c | 88 +++++++++++++++++++++++++++++++++----- mm/internal.h | 1 + mm/page_alloc.c | 63 ++++++++++++++++++++------- 5 files changed, 129 insertions(+), 28 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index ef658147e4e8..0e38a1deeb23 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, - bool sync, bool *contended); + bool sync, bool *contended, struct page **page); extern int compact_pgdat(pg_data_t *pgdat, int order); extern unsigned long compaction_suitable(struct zone *zone, int order); @@ -64,7 +64,7 @@ static inline bool compaction_deferred(struct zone *zone, int order) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended) + bool sync, bool *contended, struct page **page) { return COMPACT_CONTINUE; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 0514fe9d3c84..5ddb11b2b4bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -442,6 +442,7 @@ void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); int split_free_page(struct page *page); +int capture_free_page(struct page *page, int alloc_order, int migratetype); /* * Compound pages have a destructor function. Provide a diff --git a/mm/compaction.c b/mm/compaction.c index 7168edc7592c..0fbc6b73a522 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -91,6 +91,60 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, return compact_checklock_irqsave(lock, flags, false, cc); } +static void compact_capture_page(struct compact_control *cc) +{ + unsigned long flags; + int mtype, mtype_low, mtype_high; + + if (!cc->page || *cc->page) + return; + + /* + * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP + * regardless of the migratetype of the freelist is is captured from. + * This is fine because the order for a high-order MIGRATE_MOVABLE + * allocation is typically at least a pageblock size and overall + * fragmentation is not impaired. Other allocation types must + * capture pages from their own migratelist because otherwise they + * could pollute other pageblocks like MIGRATE_MOVABLE with + * difficult to move pages and making fragmentation worse overall. 
+ */ + if (cc->migratetype == MIGRATE_MOVABLE) { + mtype_low = 0; + mtype_high = MIGRATE_PCPTYPES; + } else { + mtype_low = cc->migratetype; + mtype_high = cc->migratetype + 1; + } + + /* Speculatively examine the free lists without zone lock */ + for (mtype = mtype_low; mtype < mtype_high; mtype++) { + int order; + for (order = cc->order; order < MAX_ORDER; order++) { + struct page *page; + struct free_area *area; + area = &(cc->zone->free_area[order]); + if (list_empty(&area->free_list[mtype])) + continue; + + /* Take the lock and attempt capture of the page */ + if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) + return; + if (!list_empty(&area->free_list[mtype])) { + page = list_entry(area->free_list[mtype].next, + struct page, lru); + if (capture_free_page(page, cc->order, mtype)) { + spin_unlock_irqrestore(&cc->zone->lock, + flags); + *cc->page = page; + return; + } + } + spin_unlock_irqrestore(&cc->zone->lock, flags); + } + } +} + /* * Isolate free pages onto a private freelist. Caller must hold zone->lock. * If @strict is true, will abort returning 0 on any invalid PFNs or non-free @@ -645,7 +699,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, static int compact_finished(struct zone *zone, struct compact_control *cc) { - unsigned int order; unsigned long watermark; if (fatal_signal_pending(current)) @@ -688,14 +741,22 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ - for (order = cc->order; order < MAX_ORDER; order++) { - /* Job done if page is free of the right migratetype */ - if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) + if (cc->page) { + /* Was a suitable page captured? */ + if (*cc->page) return COMPACT_PARTIAL; + } else { + unsigned int order; + for (order = cc->order; order < MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[cc->order]; + /* Job done if page is free of the right migratetype */ + if (!list_empty(&area->free_list[cc->migratetype])) + return COMPACT_PARTIAL; - /* Job done if allocation would set block type */ - if (order >= pageblock_order && zone->free_area[order].nr_free) - return COMPACT_PARTIAL; + /* Job done if allocation would set block type */ + if (cc->order >= pageblock_order && area->nr_free) + return COMPACT_PARTIAL; + } } return COMPACT_CONTINUE; @@ -817,6 +878,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) goto out; } } + + /* Capture a page now if it is a suitable size */ + compact_capture_page(cc); } out: @@ -829,7 +893,8 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync, bool *contended) + bool sync, bool *contended, + struct page **page) { struct compact_control cc = { .nr_freepages = 0, @@ -839,6 +904,7 @@ static unsigned long compact_zone_order(struct zone *zone, .zone = zone, .sync = sync, .contended = contended, + .page = page, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -860,7 +926,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended) + bool sync, bool *contended, struct page **page) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -881,7 +947,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, int status; status = compact_zone_order(zone, order, gfp_mask, sync, - contended); + contended, page); 
rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -936,6 +1002,7 @@ int compact_pgdat(pg_data_t *pgdat, int order) struct compact_control cc = { .order = order, .sync = false, + .page = NULL, }; return __compact_pgdat(pgdat, &cc); @@ -946,6 +1013,7 @@ static int compact_node(int nid) struct compact_control cc = { .order = -1, .sync = true, + .page = NULL, }; return __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/internal.h b/mm/internal.h index b8c91b342e24..e549a7fbc296 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -131,6 +131,7 @@ struct compact_control { int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; bool *contended; /* True if a lock was contended */ + struct page **page; /* Page captured of requested size */ }; unsigned long diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5e92698e5395..cfd565dbe124 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1380,16 +1380,11 @@ void split_page(struct page *page, unsigned int order) } /* - * Similar to split_page except the page is already free. As this is only - * being used for migration, the migratetype of the block also changes. - * As this is called with interrupts disabled, the caller is responsible - * for calling arch_alloc_page() and kernel_map_page() after interrupts - * are enabled. - * - * Note: this is probably too low level an operation for use in drivers. - * Please consult with lkml before using this in your driver. + * Similar to the split_page family of functions except that the page + * required at the given order and being isolated now to prevent races + * with parallel allocators */ -int split_free_page(struct page *page) +int capture_free_page(struct page *page, int alloc_order, int migratetype) { unsigned int order; unsigned long watermark; @@ -1411,10 +1406,11 @@ int split_free_page(struct page *page) rmv_page_order(page); __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); - /* Split into individual pages */ - set_page_refcounted(page); - split_page(page, order); + if (alloc_order != order) + expand(zone, page, alloc_order, order, + &zone->free_area[order], migratetype); + /* Set the pageblock if the captured page is at least a pageblock */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { @@ -1425,7 +1421,35 @@ int split_free_page(struct page *page) } } - return 1 << order; + return 1UL << order; +} + +/* + * Similar to split_page except the page is already free. As this is only + * being used for migration, the migratetype of the block also changes. + * As this is called with interrupts disabled, the caller is responsible + * for calling arch_alloc_page() and kernel_map_page() after interrupts + * are enabled. + * + * Note: this is probably too low level an operation for use in drivers. + * Please consult with lkml before using this in your driver. 
+ */ +int split_free_page(struct page *page) +{ + unsigned int order; + int nr_pages; + + BUG_ON(!PageBuddy(page)); + order = page_order(page); + + nr_pages = capture_free_page(page, order, 0); + if (!nr_pages) + return 0; + + /* Split into individual pages */ + set_page_refcounted(page); + split_page(page, order); + return nr_pages; } /* @@ -2105,7 +2129,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { - struct page *page; + struct page *page = NULL; if (!order) return NULL; @@ -2118,10 +2142,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration, - contended_compaction); + contended_compaction, &page); current->flags &= ~PF_MEMALLOC; - if (*did_some_progress != COMPACT_SKIPPED) { + /* If compaction captured a page, prep and use it */ + if (page) { + prep_new_page(page, order, gfp_mask); + goto got_page; + } + + if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ drain_pages(get_cpu()); put_cpu(); @@ -2131,6 +2161,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, alloc_flags & ~ALLOC_NO_WATERMARKS, preferred_zone, migratetype); if (page) { +got_page: preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; if (order >= preferred_zone->compact_order_failed) From 8d34694c1abf29df1f3c7317936b7e3e2e308d9b Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 8 Oct 2012 16:29:14 -0700 Subject: [PATCH 023/160] revert "mm: mempolicy: Let vma_merge and vma_split handle vma->vm_policy linkages" Commit 05f144a0d5c2 ("mm: mempolicy: Let vma_merge and vma_split handle vma->vm_policy linkages") removed vma->vm_policy updates code but it is the purpose of mbind_range(). Now, mbind_range() is virtually a no-op and while it does not allow memory corruption it is not the right fix. This patch is a revert. [mgorman@suse.de: Edited changelog] Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Cc: Christoph Lameter Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ada3be6e252..92daa267baf2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -607,6 +607,27 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return first; } +/* Apply policy to a single VMA */ +static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) +{ + int err = 0; + struct mempolicy *old = vma->vm_policy; + + pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_ops, vma->vm_file, + vma->vm_ops ? vma->vm_ops->set_policy : NULL); + + if (vma->vm_ops && vma->vm_ops->set_policy) + err = vma->vm_ops->set_policy(vma, new); + if (!err) { + mpol_get(new); + vma->vm_policy = new; + mpol_put(old); + } + return err; +} + /* Step 2: apply policy to a range and do splits. */ static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) @@ -655,23 +676,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (err) goto out; } - - /* - * Apply policy to a single VMA. 
The reference counting of - * policy for vma_policy linkages has already been handled by - * vma_merge and split_vma as necessary. If this is a shared - * policy then ->set_policy will increment the reference count - * for an sp node. - */ - pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", - vma->vm_start, vma->vm_end, vma->vm_pgoff, - vma->vm_ops, vma->vm_file, - vma->vm_ops ? vma->vm_ops->set_policy : NULL); - if (vma->vm_ops && vma->vm_ops->set_policy) { - err = vma->vm_ops->set_policy(vma, new_pol); - if (err) - goto out; - } + err = policy_vma(vma, new_pol); + if (err) + goto out; } out: From 869833f2c5c6e4dd09a5378cfc665ffb4615e5d2 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 8 Oct 2012 16:29:16 -0700 Subject: [PATCH 024/160] mempolicy: remove mempolicy sharing Dave Jones' system call fuzz testing tool "trinity" triggered the following bug error with slab debugging enabled ============================================================================= BUG numa_policy (Not tainted): Poison overwritten ----------------------------------------------------------------------------- INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154 __slab_alloc+0x3d3/0x445 kmem_cache_alloc+0x29d/0x2b0 mpol_new+0xa3/0x140 sys_mbind+0x142/0x620 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154 __slab_free+0x2e/0x1de kmem_cache_free+0x25a/0x260 __mpol_put+0x27/0x30 remove_vma+0x68/0x90 exit_mmap+0x118/0x140 mmput+0x73/0x110 exit_mm+0x108/0x130 do_exit+0x162/0xb90 do_group_exit+0x4f/0xc0 sys_exit_group+0x17/0x20 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x (null) flags=0x20000000004080 INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0 The problem is that the structure is being prematurely freed due to a reference count imbalance. In the following case mbind(addr, len) should replace the memory policies of both vma1 and vma2 and thus they will become to share the same mempolicy and the new mempolicy will have the MPOL_F_SHARED flag. +-------------------+-------------------+ | vma1 | vma2(shmem) | +-------------------+-------------------+ | | addr addr+len alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for maintaining the mempolicy reference count. The current rule is that get_vma_policy() only increments refcount for shmem VMA and mpol_conf_put() only decrements refcount if the policy has MPOL_F_SHARED. In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED! The reference count will be decreased even though was not increased whenever alloc_page_vma() is called. This has been broken since commit [52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008. There is another serious bug with the sharing of memory policies. Currently, mempolicy rebind logic (it is called from cpuset rebinding) ignores a refcount of mempolicy and override it forcibly. Thus, any mempolicy sharing may cause mempolicy corruption. The bug was introduced by commit [68860ec1: cpusets: automatic numa mempolicy rebinding]. Ideally, the shared policy handling would be rewritten to either properly handle COW of the policy structures or at least reference count MPOL_F_SHARED based exclusively on information within the policy. However, this patch takes the easier approach of disabling any policy sharing between VMAs. 
Each new range allocated with sp_alloc will allocate a new policy, set the reference count to 1 and drop the reference count of the old policy. This increases the memory footprint but is not expected to be a major problem as mbind() is unlikely to be used for fine-grained ranges. It is also inefficient because it means we allocate a new policy even in cases where mbind_range() could use the new_policy passed to it. However, it is more straight-forward and the change should be invisible to the user. [mgorman@suse.de: Edited changelog] Reported-by: Dave Jones , Cc: Christoph Lameter , Reviewed-by: Christoph Lameter Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 52 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 92daa267baf2..f0728ae74672 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -607,24 +607,39 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return first; } -/* Apply policy to a single VMA */ -static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) +/* + * Apply policy to a single VMA + * This must be called with the mmap_sem held for writing. + */ +static int vma_replace_policy(struct vm_area_struct *vma, + struct mempolicy *pol) { - int err = 0; - struct mempolicy *old = vma->vm_policy; + int err; + struct mempolicy *old; + struct mempolicy *new; pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", vma->vm_start, vma->vm_end, vma->vm_pgoff, vma->vm_ops, vma->vm_file, vma->vm_ops ? vma->vm_ops->set_policy : NULL); - if (vma->vm_ops && vma->vm_ops->set_policy) + new = mpol_dup(pol); + if (IS_ERR(new)) + return PTR_ERR(new); + + if (vma->vm_ops && vma->vm_ops->set_policy) { err = vma->vm_ops->set_policy(vma, new); - if (!err) { - mpol_get(new); - vma->vm_policy = new; - mpol_put(old); + if (err) + goto err_out; } + + old = vma->vm_policy; + vma->vm_policy = new; /* protected by mmap_sem */ + mpol_put(old); + + return 0; + err_out: + mpol_put(new); return err; } @@ -676,7 +691,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (err) goto out; } - err = policy_vma(vma, new_pol); + err = vma_replace_policy(vma, new_pol); if (err) goto out; } @@ -2153,15 +2168,24 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { - struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); + struct sp_node *n; + struct mempolicy *newpol; + n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; + + newpol = mpol_dup(pol); + if (IS_ERR(newpol)) { + kmem_cache_free(sn_cache, n); + return NULL; + } + newpol->flags |= MPOL_F_SHARED; + n->start = start; n->end = end; - mpol_get(pol); - pol->flags |= MPOL_F_SHARED; /* for unref */ - n->policy = pol; + n->policy = newpol; + return n; } From b22d127a39ddd10d93deee3d96e643657ad53a49 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:17 -0700 Subject: [PATCH 025/160] mempolicy: fix a race in shared_policy_replace() shared_policy_replace() use of sp_alloc() is unsafe. 1) sp_node cannot be dereferenced if sp->lock is not held and 2) another thread can modify sp_node between spin_unlock for allocating a new sp node and next spin_lock. The bug was introduced before 2.6.12-rc2. 
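To make the locking change concrete (an illustrative sketch only; the real hunks follow): sp_alloc() allocates its sp_node and policy with GFP_KERNEL and may sleep, which is why it could not be called under the old sp->lock spinlock and why the old code had to drop the lock, allocate, and restart the lookup. With sp->mutex held across the whole replace, the allocation can happen in place and the range cannot change underneath it.

	mutex_lock(&sp->mutex);
	n = sp_lookup(sp, start, end);
	/* ... walk the overlapping nodes ... */
	new2 = sp_alloc(end, n->end, n->policy);	/* may sleep; safe under a mutex */
	if (!new2) {
		ret = -ENOMEM;
		goto out;
	}
	/* ... */
out:
	mutex_unlock(&sp->mutex);
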
Kosaki's original patch for this problem was to allocate an sp node and policy within shared_policy_replace and initialise it when the lock is reacquired. I was not keen on this approach because it partially duplicates sp_alloc(). As the paths were sp->lock is taken are not that performance critical this patch converts sp->lock to sp->mutex so it can sleep when calling sp_alloc(). [kosaki.motohiro@jp.fujitsu.com: Original patch] Signed-off-by: Mel Gorman Acked-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 +- mm/mempolicy.c | 37 ++++++++++++++++--------------------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index ba7a0ff19d39..cec569325608 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -188,7 +188,7 @@ struct sp_node { struct shared_policy { struct rb_root root; - spinlock_t lock; + struct mutex mutex; }; void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f0728ae74672..b2f12ecc1b34 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2083,7 +2083,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) */ /* lookup first element intersecting start-end */ -/* Caller holds sp->lock */ +/* Caller holds sp->mutex */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { @@ -2147,13 +2147,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) if (!sp->root.rb_node) return NULL; - spin_lock(&sp->lock); + mutex_lock(&sp->mutex); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - spin_unlock(&sp->lock); + mutex_unlock(&sp->mutex); return pol; } @@ -2193,10 +2193,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, static int shared_policy_replace(struct shared_policy *sp, unsigned long start, unsigned long end, struct sp_node *new) { - struct sp_node *n, *new2 = NULL; + struct sp_node *n; + int ret = 0; -restart: - spin_lock(&sp->lock); + mutex_lock(&sp->mutex); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { @@ -2209,16 +2209,14 @@ restart: } else { /* Old policy spanning whole new range. 
*/ if (n->end > end) { + struct sp_node *new2; + new2 = sp_alloc(end, n->end, n->policy); if (!new2) { - spin_unlock(&sp->lock); - new2 = sp_alloc(end, n->end, n->policy); - if (!new2) - return -ENOMEM; - goto restart; + ret = -ENOMEM; + goto out; } n->end = start; sp_insert(sp, new2); - new2 = NULL; break; } else n->end = start; @@ -2229,12 +2227,9 @@ restart: } if (new) sp_insert(sp, new); - spin_unlock(&sp->lock); - if (new2) { - mpol_put(new2->policy); - kmem_cache_free(sn_cache, new2); - } - return 0; +out: + mutex_unlock(&sp->mutex); + return ret; } /** @@ -2252,7 +2247,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ - spin_lock_init(&sp->lock); + mutex_init(&sp->mutex); if (mpol) { struct vm_area_struct pvma; @@ -2318,7 +2313,7 @@ void mpol_free_shared_policy(struct shared_policy *p) if (!p->root.rb_node) return; - spin_lock(&p->lock); + mutex_lock(&p->mutex); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); @@ -2327,7 +2322,7 @@ void mpol_free_shared_policy(struct shared_policy *p) mpol_put(n->policy); kmem_cache_free(sn_cache, n); } - spin_unlock(&p->lock); + mutex_unlock(&p->mutex); } /* assumes fs == KERNEL_DS */ From 63f74ca21f1fad36d075e063f06dcc6d39fe86b2 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 8 Oct 2012 16:29:19 -0700 Subject: [PATCH 026/160] mempolicy: fix refcount leak in mpol_set_shared_policy() When shared_policy_replace() fails to allocate new->policy is not freed correctly by mpol_set_shared_policy(). The problem is that shared mempolicy code directly call kmem_cache_free() in multiple places where it is easy to make a mistake. This patch creates an sp_free wrapper function and uses it. The bug was introduced pre-git age (IOW, before 2.6.12-rc2). 
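The effect on the leaking error path in mpol_set_shared_policy() can be summarised with the two lines below (both taken from the hunks that follow; shown side by side only for clarity):

	/* before: frees the node but leaks the reference sp_alloc() took on new->policy */
	if (err && new)
		kmem_cache_free(sn_cache, new);

	/* after: sp_free() drops the policy reference, then frees the node */
	if (err && new)
		sp_free(new);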
[mgorman@suse.de: Editted changelog] Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b2f12ecc1b34..1763418f1ada 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2157,12 +2157,17 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) return pol; } +static void sp_free(struct sp_node *n) +{ + mpol_put(n->policy); + kmem_cache_free(sn_cache, n); +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); - mpol_put(n->policy); - kmem_cache_free(sn_cache, n); + sp_free(n); } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, @@ -2301,7 +2306,7 @@ int mpol_set_shared_policy(struct shared_policy *info, } err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); if (err && new) - kmem_cache_free(sn_cache, new); + sp_free(new); return err; } @@ -2318,9 +2323,7 @@ void mpol_free_shared_policy(struct shared_policy *p) while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); - rb_erase(&n->nd, &p->root); - mpol_put(n->policy); - kmem_cache_free(sn_cache, n); + sp_delete(p, n); } mutex_unlock(&p->mutex); } From 00442ad04a5eac08a98255697c510e708f6082e2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:20 -0700 Subject: [PATCH 027/160] mempolicy: fix a memory corruption by refcount imbalance in alloc_pages_vma() Commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a potential memory corruption. shmem_alloc_page() uses a pseudo vma and it has one significant unique combination, vma->vm_ops=NULL and vma->policy->flags & MPOL_F_SHARED. get_vma_policy() does NOT increase a policy ref when vma->vm_ops=NULL and mpol_cond_put() DOES decrease a policy ref when a policy has MPOL_F_SHARED. Therefore, when a cpuset update race occurs, alloc_pages_vma() falls in 'goto retry_cpuset' path, decrements the reference count and frees the policy prematurely. Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1763418f1ada..3d64b369180d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1552,8 +1552,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task, addr); if (vpol) pol = vpol; - } else if (vma->vm_policy) + } else if (vma->vm_policy) { pol = vma->vm_policy; + + /* + * shmem_alloc_page() passes MPOL_F_SHARED policy with + * a pseudo vma whose vma->vm_ops=NULL. Take a reference + * count on these policies which will be dropped by + * mpol_cond_put() later + */ + if (mpol_needs_cond_ref(pol)) + mpol_get(pol); + } } if (!pol) pol = &default_policy; From 48af0d7cb3c87fae2ff38af372821dcb0b019c9e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:23 -0700 Subject: [PATCH 028/160] mm: mmu_notifier: fix inconsistent memory between secondary MMU and host There is a bug in set_pte_at_notify() which always sets the pte to the new page before releasing the old page in the secondary MMU. 
At this time, the process will access on the new page, but the secondary MMU still access on the old page, the memory is inconsistent between them The below scenario shows the bug more clearly: at the beginning: *p = 0, and p is write-protected by KSM or shared with parent process CPU 0 CPU 1 write 1 to p to trigger COW, set_pte_at_notify will be called: *pte = new_page + W; /* The W bit of pte is set */ *p = 1; /* pte is valid, so no #PF */ return back to secondary MMU, then the secondary MMU read p, but get: *p == 0; /* * !!!!!! * the host has already set p to 1, but the secondary * MMU still get the old value 0 */ call mmu_notifier_change_pte to release old page in secondary MMU We can fix it by release old page first, then set the pte to the new page. Note, the new page will be firstly used in secondary MMU before it is mapped into the page table of the process, but this is safe because it is protected by the page table lock, there is no race to change the pte [akpm@linux-foundation.org: add comment from Andrea] Signed-off-by: Xiao Guangrong Cc: Avi Kivity Cc: Marcelo Tosatti Cc: Andrea Arcangeli Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 1d1b1e13f79f..6f32b2b1f76b 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -311,14 +311,24 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __young; \ }) +/* + * set_pte_at_notify() sets the pte _after_ running the notifier. + * This is safe to start by updating the secondary MMUs, because the primary MMU + * pte invalidate must have already happened with a ptep_clear_flush() before + * set_pte_at_notify() has been invoked. Updating the secondary MMUs first is + * required when we change both the protection of the mapping from read-only to + * read-write and the pfn (like during copy on write page faults). Otherwise the + * old page would remain mapped readonly in the secondary MMUs after the new + * page is already writable by some CPU through the primary MMU. + */ #define set_pte_at_notify(__mm, __address, __ptep, __pte) \ ({ \ struct mm_struct *___mm = __mm; \ unsigned long ___address = __address; \ pte_t ___pte = __pte; \ \ - set_pte_at(___mm, ___address, __ptep, ___pte); \ mmu_notifier_change_pte(___mm, ___address, ___pte); \ + set_pte_at(___mm, ___address, __ptep, ___pte); \ }) #else /* CONFIG_MMU_NOTIFIER */ From 21a92735f660eaecf69a6f2e777f18463760ec32 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 8 Oct 2012 16:29:24 -0700 Subject: [PATCH 029/160] mm: mmu_notifier: have mmu_notifiers use a global SRCU so they may safely schedule With an RCU based mmu_notifier implementation, any callout to mmu_notifier_invalidate_range_{start,end}() or mmu_notifier_invalidate_page() would not be allowed to call schedule() as that could potentially allow a modification to the mmu_notifier structure while it is currently being used. Since srcu allocs 4 machine words per instance per cpu, we may end up with memory exhaustion if we use srcu per mm. So all mms share a global srcu. Note that during large mmu_notifier activity exit & unregister paths might hang for longer periods, but it is tolerable for current mmu_notifier clients. Signed-off-by: Sagi Grimberg Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Haggai Eran Cc: "Paul E. 
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 1 + mm/mmu_notifier.c | 73 ++++++++++++++++++++++++------------ 2 files changed, 49 insertions(+), 25 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 6f32b2b1f76b..4b7183e98061 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -4,6 +4,7 @@ #include #include #include +#include struct mmu_notifier; struct mmu_notifier_ops; diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 862b60822d9f..35ff447d8d14 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -14,10 +14,14 @@ #include #include #include +#include #include #include #include +/* global SRCU for all MMs */ +struct srcu_struct srcu; + /* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap @@ -25,8 +29,8 @@ * in parallel despite there being no task using this mm any more, * through the vmas outside of the exit_mmap context, such as with * vmtruncate. This serializes against mmu_notifier_unregister with - * the mmu_notifier_mm->lock in addition to RCU and it serializes - * against the other mmu notifiers with RCU. struct mmu_notifier_mm + * the mmu_notifier_mm->lock in addition to SRCU and it serializes + * against the other mmu notifiers with SRCU. struct mmu_notifier_mm * can't go away from under us as exit_mmap holds an mm_count pin * itself. */ @@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm) { struct mmu_notifier *mn; struct hlist_node *n; + int id; /* * RCU here will block mmu_notifier_unregister until * ->release returns. */ - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) /* * if ->release runs before mmu_notifier_unregister it @@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm) */ if (mn->ops->release) mn->ops->release(mn, mm); - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { @@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm) spin_unlock(&mm->mmu_notifier_mm->lock); /* - * synchronize_rcu here prevents mmu_notifier_release to + * synchronize_srcu here prevents mmu_notifier_release to * return to exit_mmap (which would proceed freeing all pages * in the mm) until the ->release method returns, if it was * invoked by mmu_notifier_unregister. @@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm) * The mmu_notifier_mm can't go away from under us because one * mm_count is hold by exit_mmap. 
*/ - synchronize_rcu(); + synchronize_srcu(&srcu); } /* @@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; - int young = 0; + int young = 0, id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->clear_flush_young) young |= mn->ops->clear_flush_young(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); return young; } @@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; - int young = 0; + int young = 0, id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->test_young) { young = mn->ops->test_young(mn, mm, address); @@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm, break; } } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); return young; } @@ -126,8 +131,9 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->change_pte) mn->ops->change_pte(mn, mm, address, pte); @@ -138,7 +144,7 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, else if (mn->ops->invalidate_page) mn->ops->invalidate_page(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_page(struct mm_struct *mm, @@ -146,13 +152,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_page) mn->ops->invalidate_page(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, @@ -160,13 +167,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_start) mn->ops->invalidate_range_start(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, @@ -174,13 +182,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_end) mn->ops->invalidate_range_end(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } static int do_mmu_notifier_register(struct mmu_notifier *mn, @@ -192,6 +201,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, BUG_ON(atomic_read(&mm->mm_users) <= 0); + /* + * Verify that mmu_notifier_init() already run and the global srcu is + * initialized. 
+ */ + BUG_ON(!srcu.per_cpu_ref); + ret = -ENOMEM; mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); if (unlikely(!mmu_notifier_mm)) @@ -274,8 +289,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm) /* * This releases the mm_count pin automatically and frees the mm * structure if it was the last user of it. It serializes against - * running mmu notifiers with RCU and against mmu_notifier_unregister - * with the unregister lock + RCU. All sptes must be dropped before + * running mmu notifiers with SRCU and against mmu_notifier_unregister + * with the unregister lock + SRCU. All sptes must be dropped before * calling mmu_notifier_unregister. ->release or any other notifier * method may be invoked concurrently with mmu_notifier_unregister, * and only after mmu_notifier_unregister returned we're guaranteed @@ -290,8 +305,9 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) * RCU here will force exit_mmap to wait ->release to finish * before freeing the pages. */ - rcu_read_lock(); + int id; + id = srcu_read_lock(&srcu); /* * exit_mmap will block in mmu_notifier_release to * guarantee ->release is called before freeing the @@ -299,7 +315,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) */ if (mn->ops->release) mn->ops->release(mn, mm); - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); spin_lock(&mm->mmu_notifier_mm->lock); hlist_del_rcu(&mn->hlist); @@ -310,10 +326,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) * Wait any running method to finish, of course including * ->release if it was run by mmu_notifier_relase instead of us. */ - synchronize_rcu(); + synchronize_srcu(&srcu); BUG_ON(atomic_read(&mm->mm_count) <= 0); mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister); + +static int __init mmu_notifier_init(void) +{ + return init_srcu_struct(&srcu); +} + +module_init(mmu_notifier_init); From e0f3c3f78da29b114e7c1c68019036559f715948 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 8 Oct 2012 16:29:26 -0700 Subject: [PATCH 030/160] mm/mmu_notifier: init notifier if necessary While registering MMU notifier, new instance of MMU notifier_mm will be allocated and later free'd if currrent mm_struct's MMU notifier_mm has been initialized. That causes some overhead. The patch tries to elominate that. 
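Conceptually, registration moves from "allocate unconditionally, free if unused" to "allocate only when this mm has no notifier list yet". A simplified sketch of the reordered do_mmu_notifier_register(), mirroring the hunk below with unrelated details omitted:

	ret = mm_take_all_locks(mm);
	if (unlikely(ret))
		goto out;

	if (!mm_has_notifiers(mm)) {
		/* first notifier for this mm: allocate the list head now */
		mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm),
					  GFP_KERNEL);
		if (unlikely(!mmu_notifier_mm)) {
			ret = -ENOMEM;
			goto out_of_mem;
		}
		INIT_HLIST_HEAD(&mmu_notifier_mm->list);
		spin_lock_init(&mmu_notifier_mm->lock);
		mm->mmu_notifier_mm = mmu_notifier_mm;
	}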
Signed-off-by: Gavin Shan Signed-off-by: Wanpeng Li Cc: Andrea Arcangeli Cc: Avi Kivity Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Xiao Guangrong Cc: Sagi Grimberg Cc: Haggai Eran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 35ff447d8d14..947df83dccb0 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -207,22 +207,23 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, */ BUG_ON(!srcu.per_cpu_ref); - ret = -ENOMEM; - mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); - if (unlikely(!mmu_notifier_mm)) - goto out; - if (take_mmap_sem) down_write(&mm->mmap_sem); ret = mm_take_all_locks(mm); if (unlikely(ret)) - goto out_cleanup; + goto out; if (!mm_has_notifiers(mm)) { + mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), + GFP_KERNEL); + if (unlikely(!mmu_notifier_mm)) { + ret = -ENOMEM; + goto out_of_mem; + } INIT_HLIST_HEAD(&mmu_notifier_mm->list); spin_lock_init(&mmu_notifier_mm->lock); + mm->mmu_notifier_mm = mmu_notifier_mm; - mmu_notifier_mm = NULL; } atomic_inc(&mm->mm_count); @@ -238,13 +239,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); spin_unlock(&mm->mmu_notifier_mm->lock); +out_of_mem: mm_drop_all_locks(mm); -out_cleanup: +out: if (take_mmap_sem) up_write(&mm->mmap_sem); - /* kfree() does nothing if mmu_notifier_mm is NULL */ - kfree(mmu_notifier_mm); -out: + BUG_ON(atomic_read(&mm->mm_users) <= 0); return ret; } From d5dc0ad928fb9e972001e552597fd0b794863f34 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 8 Oct 2012 16:29:27 -0700 Subject: [PATCH 031/160] mm/vmscan: fix error number for failed kthread Fix the return value while failing to create the kswapd kernel thread. Also, the error message is prioritized as KERN_ERR. Signed-off-by: Gavin Shan Signed-off-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index f6fdae7a8d4c..d16bf5a53266 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3126,9 +3126,9 @@ int kswapd_run(int nid) if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ BUG_ON(system_state == SYSTEM_BOOTING); - printk("Failed to start kswapd on node %d\n",nid); pgdat->kswapd = NULL; - ret = -1; + pr_err("Failed to start kswapd on node %d\n", nid); + ret = PTR_ERR(pgdat->kswapd); } return ret; } From 01dc52ebdf472f77cca623ca693ca24cfc0f1bbe Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Oct 2012 16:29:30 -0700 Subject: [PATCH 032/160] oom: remove deprecated oom_adj The deprecated /proc//oom_adj is scheduled for removal this month. 
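For userspace that still uses the old file, the migration is mechanical: write to /proc/<pid>/oom_score_adj, whose range is -1000 (OOM_SCORE_ADJ_MIN, disables OOM killing for the task) to +1000 (OOM_SCORE_ADJ_MAX), instead of oom_adj's -16..+15 with -17 (OOM_DISABLE) as the disable value. A minimal, hypothetical example of the replacement usage, not part of this patch (lowering the value below its current setting requires CAP_SYS_RESOURCE):

	/* disable the OOM killer for the current process via the new interface */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/self/oom_score_adj", "w");

		if (!f)
			return 1;
		/* -1000 == OOM_SCORE_ADJ_MIN; replaces the old oom_adj value -17 (OOM_DISABLE) */
		fprintf(f, "%d\n", -1000);
		return fclose(f) ? 1 : 0;
	}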
Signed-off-by: Davidlohr Bueso Acked-by: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/obsolete/proc-pid-oom_adj | 22 ---- Documentation/filesystems/proc.txt | 22 +--- fs/proc/base.c | 117 +------------------- include/linux/oom.h | 11 -- include/linux/sched.h | 1 - kernel/fork.c | 1 - mm/oom_kill.c | 4 +- 7 files changed, 7 insertions(+), 171 deletions(-) delete mode 100644 Documentation/ABI/obsolete/proc-pid-oom_adj diff --git a/Documentation/ABI/obsolete/proc-pid-oom_adj b/Documentation/ABI/obsolete/proc-pid-oom_adj deleted file mode 100644 index 9a3cb88ade47..000000000000 --- a/Documentation/ABI/obsolete/proc-pid-oom_adj +++ /dev/null @@ -1,22 +0,0 @@ -What: /proc//oom_adj -When: August 2012 -Why: /proc//oom_adj allows userspace to influence the oom killer's - badness heuristic used to determine which task to kill when the kernel - is out of memory. - - The badness heuristic has since been rewritten since the introduction of - this tunable such that its meaning is deprecated. The value was - implemented as a bitshift on a score generated by the badness() - function that did not have any precise units of measure. With the - rewrite, the score is given as a proportion of available memory to the - task allocating pages, so using a bitshift which grows the score - exponentially is, thus, impossible to tune with fine granularity. - - A much more powerful interface, /proc//oom_score_adj, was - introduced with the oom killer rewrite that allows users to increase or - decrease the badness score linearly. This interface will replace - /proc//oom_adj. - - A warning will be emitted to the kernel log if an application uses this - deprecated interface. After it is printed once, future warnings will be - suppressed until the kernel is rebooted. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fb0a6aeb936c..a1793d670cd0 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -33,7 +33,7 @@ Table of Contents 2 Modifying System Parameters 3 Per-Process Parameters - 3.1 /proc//oom_adj & /proc//oom_score_adj - Adjust the oom-killer + 3.1 /proc//oom_score_adj - Adjust the oom-killer score 3.2 /proc//oom_score - Display current oom-killer score 3.3 /proc//io - Display the IO accounting fields @@ -1320,10 +1320,10 @@ of the kernel. CHAPTER 3: PER-PROCESS PARAMETERS ------------------------------------------------------------------------------ -3.1 /proc//oom_adj & /proc//oom_score_adj- Adjust the oom-killer score +3.1 /proc//oom_score_adj- Adjust the oom-killer score -------------------------------------------------------------------------------- -These file can be used to adjust the badness heuristic used to select which +This file can be used to adjust the badness heuristic used to select which process gets killed in out of memory conditions. The badness heuristic assigns a value to each candidate task ranging from 0 @@ -1361,22 +1361,10 @@ same system, cpuset, mempolicy, or memory controller resources to use at least equivalent to discounting 50% of the task's allowed memory from being considered as scoring against the task. -For backwards compatibility with previous kernels, /proc//oom_adj may also -be used to tune the badness score. Its acceptable values range from -16 -(OOM_ADJUST_MIN) to +15 (OOM_ADJUST_MAX) and a special value of -17 -(OOM_DISABLE) to disable oom killing entirely for that task. Its value is -scaled linearly with /proc//oom_score_adj. 
- -Writing to /proc//oom_score_adj or /proc//oom_adj will change the -other with its scaled value. - The value of /proc//oom_score_adj may be reduced no lower than the last value set by a CAP_SYS_RESOURCE process. To reduce the value any lower requires CAP_SYS_RESOURCE. -NOTICE: /proc//oom_adj is deprecated and will be removed, please see -Documentation/feature-removal-schedule.txt. - Caveat: when a parent task is selected, the oom killer will sacrifice any first generation children with separate address spaces instead, if possible. This avoids servers and important system daemons from being killed and loses the @@ -1387,9 +1375,7 @@ minimal amount of work. ------------------------------------------------------------- This file can be used to check the current score used by the oom-killer is for -any given . Use it together with /proc//oom_adj to tune which -process should be killed in an out-of-memory situation. - +any given . 3.3 /proc//io - Display the IO accounting fields ------------------------------------------------------- diff --git a/fs/proc/base.c b/fs/proc/base.c index d295af993677..ef5c84be66f9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -873,111 +873,6 @@ static const struct file_operations proc_environ_operations = { .release = mem_release, }; -static ssize_t oom_adjust_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); - char buffer[PROC_NUMBUF]; - size_t len; - int oom_adjust = OOM_DISABLE; - unsigned long flags; - - if (!task) - return -ESRCH; - - if (lock_task_sighand(task, &flags)) { - oom_adjust = task->signal->oom_adj; - unlock_task_sighand(task, &flags); - } - - put_task_struct(task); - - len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); - - return simple_read_from_buffer(buf, count, ppos, buffer, len); -} - -static ssize_t oom_adjust_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task; - char buffer[PROC_NUMBUF]; - int oom_adjust; - unsigned long flags; - int err; - - memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) { - err = -EFAULT; - goto out; - } - - err = kstrtoint(strstrip(buffer), 0, &oom_adjust); - if (err) - goto out; - if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && - oom_adjust != OOM_DISABLE) { - err = -EINVAL; - goto out; - } - - task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) { - err = -ESRCH; - goto out; - } - - task_lock(task); - if (!task->mm) { - err = -EINVAL; - goto err_task_lock; - } - - if (!lock_task_sighand(task, &flags)) { - err = -ESRCH; - goto err_task_lock; - } - - if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { - err = -EACCES; - goto err_sighand; - } - - /* - * Warn that /proc/pid/oom_adj is deprecated, see - * Documentation/feature-removal-schedule.txt. - */ - printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", - current->comm, task_pid_nr(current), task_pid_nr(task), - task_pid_nr(task)); - task->signal->oom_adj = oom_adjust; - /* - * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum - * value is always attainable. 
- */ - if (task->signal->oom_adj == OOM_ADJUST_MAX) - task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX; - else - task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / - -OOM_DISABLE; - trace_oom_score_adj_update(task); -err_sighand: - unlock_task_sighand(task, &flags); -err_task_lock: - task_unlock(task); - put_task_struct(task); -out: - return err < 0 ? err : count; -} - -static const struct file_operations proc_oom_adjust_operations = { - .read = oom_adjust_read, - .write = oom_adjust_write, - .llseek = generic_file_llseek, -}; - static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -1051,15 +946,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; trace_oom_score_adj_update(task); - /* - * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is - * always attainable. - */ - if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - task->signal->oom_adj = OOM_DISABLE; - else - task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / - OOM_SCORE_ADJ_MAX; + err_sighand: unlock_task_sighand(task, &flags); err_task_lock: @@ -2710,7 +2597,6 @@ static const struct pid_entry tgid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), @@ -3077,7 +2963,6 @@ static const struct pid_entry tid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), diff --git a/include/linux/oom.h b/include/linux/oom.h index 49a3031fda50..d36a8221f58b 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -1,17 +1,6 @@ #ifndef __INCLUDE_LINUX_OOM_H #define __INCLUDE_LINUX_OOM_H -/* - * /proc//oom_adj is deprecated, see - * Documentation/feature-removal-schedule.txt. - * - * /proc//oom_adj set to -17 protects from the oom-killer - */ -#define OOM_DISABLE (-17) -/* inclusive */ -#define OOM_ADJUST_MIN (-16) -#define OOM_ADJUST_MAX 15 - /* * /proc//oom_score_adj set to OOM_SCORE_ADJ_MIN disables oom killing for * pid. diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c5612f0374b..c2070e92a9d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -671,7 +671,6 @@ struct signal_struct { struct rw_semaphore group_rwsem; #endif - int oom_adj; /* OOM kill score adjustment (bit shift) */ int oom_score_adj; /* OOM kill score adjustment */ int oom_score_adj_min; /* OOM kill score adjustment minimum value. * Only settable by CAP_SYS_RESOURCE. 
*/ diff --git a/kernel/fork.c b/kernel/fork.c index ec667f797af3..972762e01024 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1056,7 +1056,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_rwsem(&sig->group_rwsem); #endif - sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 198600861638..79e0f3e24831 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_adj=%d, oom_score_adj=%d\n", - current->comm, gfp_mask, order, current->signal->oom_adj, + "oom_score_adj=%d\n", + current->comm, gfp_mask, order, current->signal->oom_score_adj); cpuset_print_task_mems_allowed(current); task_unlock(current); From 5d3a551c28c6669dc43be40d8fafafbc2ec8f42b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 8 Oct 2012 16:29:32 -0700 Subject: [PATCH 033/160] mm: hugetlb: add arch hook for clearing page flags before entering pool The core page allocator ensures that page flags are zeroed when freeing pages via free_pages_check. A number of architectures (ARM, PPC, MIPS) rely on this property to treat new pages as dirty with respect to the data cache and perform the appropriate flushing before mapping the pages into userspace. This can lead to cache synchronisation problems when using hugepages, since the allocator keeps its own pool of pages above the usual page allocator and does not reset the page flags when freeing a page into the pool. This patch adds a new architecture hook, arch_clear_hugepage_flags, so that architectures which rely on the page flags being in a particular state for fresh allocations can adjust the flags accordingly when a page is freed into the pool. 
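The hook's contract is simple: most architectures provide an empty inline, while an architecture that tracks D-cache state in the page flags clears its bit when the page returns to the pool. Both variants appear in the diff below; they are repeated here side by side only to make the contrast explicit:

	/* default: nothing to clear (ia64, mips, powerpc, sparc, tile, x86) */
	static inline void arch_clear_hugepage_flags(struct page *page)
	{
	}

	/* sh: force the page to be treated as D-cache dirty when it is reused */
	static inline void arch_clear_hugepage_flags(struct page *page)
	{
		clear_bit(PG_dcache_clean, &page->flags);
	}

free_huge_page() then calls the hook just before enqueue_huge_page() puts the page back into the pool.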
Signed-off-by: Will Deacon Cc: Michal Hocko Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/hugetlb.h | 4 ++++ arch/mips/include/asm/hugetlb.h | 4 ++++ arch/powerpc/include/asm/hugetlb.h | 4 ++++ arch/s390/include/asm/hugetlb.h | 1 + arch/sh/include/asm/hugetlb.h | 6 ++++++ arch/sparc/include/asm/hugetlb.h | 4 ++++ arch/tile/include/asm/hugetlb.h | 4 ++++ arch/x86/include/asm/hugetlb.h | 4 ++++ mm/hugetlb.c | 1 + 9 files changed, 32 insertions(+) diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index da55c63728e0..94eaa5bd5d0c 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -77,4 +77,8 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #endif /* _ASM_IA64_HUGETLB_H */ diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 58d36889f09b..bd94946a18f3 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -112,4 +112,8 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index dfdb95bc59a5..62e11a32c4c2 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -151,6 +151,10 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #else /* ! CONFIG_HUGETLB_PAGE */ static inline void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 2d6e6e380564..fc322421b1cc 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -33,6 +33,7 @@ static inline int prepare_hugepage_range(struct file *file, } #define hugetlb_prefault_arch_hook(mm) do { } while (0) +#define arch_clear_hugepage_flags(page) do { } while (0) int arch_prepare_hugepage(struct page *page); void arch_release_hugepage(struct page *page); diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 967068fb79ac..b3808c7d67b2 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -1,6 +1,7 @@ #ifndef _ASM_SH_HUGETLB_H #define _ASM_SH_HUGETLB_H +#include #include @@ -89,4 +90,9 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ + clear_bit(PG_dcache_clean, &page->flags); +} + #endif /* _ASM_SH_HUGETLB_H */ diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 177061064ee6..e7927c9758a1 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -82,4 +82,8 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #endif /* _ASM_SPARC64_HUGETLB_H */ diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h index b2042380a5aa..0f885af2b621 100644 --- a/arch/tile/include/asm/hugetlb.h +++ b/arch/tile/include/asm/hugetlb.h @@ -106,6 +106,10 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #ifdef CONFIG_HUGETLB_SUPER_PAGES static inline pte_t 
arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable) diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 439a9acc132d..bdd35dbd0605 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -90,4 +90,8 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #endif /* _ASM_X86_HUGETLB_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc727122dd44..f1bb534254f6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -637,6 +637,7 @@ static void free_huge_page(struct page *page) h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; } else { + arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); From db971418824381d3583c73751181fec76c743bf6 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:29:34 -0700 Subject: [PATCH 034/160] mm: adjust final #endif position in mm/internal.h Make sure the #endif that terminates the standard #ifndef / #define / #endif construct gets labeled, and gets positioned at the end of the file as is normally the case. Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index e549a7fbc296..bbd7b34db4ea 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -341,7 +341,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #define ZONE_RECLAIM_FULL -1 #define ZONE_RECLAIM_SOME 0 #define ZONE_RECLAIM_SUCCESS 1 -#endif extern int hwpoison_filter(struct page *p); @@ -357,3 +356,5 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); + +#endif /* __MM_INTERNAL_H */ From 65b3c07b43f7f8a5cbf8923011bd4e6650e3d1dc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:35 -0700 Subject: [PATCH 035/160] thp: fix the count of THP_COLLAPSE_ALLOC THP_COLLAPSE_ALLOC is double counted if NUMA is disabled since it has already been calculated in khugepaged_alloc_hugepage Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9b72d1270519..860ea9127851 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1874,9 +1874,9 @@ static void collapse_huge_page(struct mm_struct *mm, *hpage = ERR_PTR(-ENOMEM); return; } + count_vm_event(THP_COLLAPSE_ALLOC); #endif - count_vm_event(THP_COLLAPSE_ALLOC); if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { #ifdef CONFIG_NUMA put_page(new_page); From 637e3a27ec2c84f7ecd083fa6943da2f19eb5e9f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:38 -0700 Subject: [PATCH 036/160] thp: remove unnecessary check in start_khugepaged The check is unnecessary since if mm_slot_cache or mm_slots_hash initialize failed, no sysfs interface will be created Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 860ea9127851..9833d8ecf38f 100644 --- a/mm/huge_memory.c +++ 
b/mm/huge_memory.c @@ -140,10 +140,7 @@ static int start_khugepaged(void) int err = 0; if (khugepaged_enabled()) { int wakeup; - if (unlikely(!mm_slot_cache || !mm_slots_hash)) { - err = -ENOMEM; - goto out; - } + mutex_lock(&khugepaged_mutex); if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, @@ -163,7 +160,7 @@ static int start_khugepaged(void) } else /* wakeup to exit */ wake_up_interruptible(&khugepaged_wait); -out: + return err; } From 911891afe1c3104adf0f802189909868239ebbfd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:41 -0700 Subject: [PATCH 037/160] thp: move khugepaged_mutex out of khugepaged Currently, hugepaged_mutex is used really complexly and hard to understand, actually, it is just used to serialize start_khugepaged and khugepaged for these reasons: - khugepaged_thread is shared between them - the thp disable path (echo never > transparent_hugepage/enabled) is nonblocking, so we need to protect khugepaged_thread to get a stable running state These can be avoided by: - use the lock to serialize the thread creation and cancel - thp disable path can not finised until the thread exits Then khugepaged_thread is fully controlled by start_khugepaged, khugepaged will be happy without the lock Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9833d8ecf38f..0931b2b19c52 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -139,9 +139,6 @@ static int start_khugepaged(void) { int err = 0; if (khugepaged_enabled()) { - int wakeup; - - mutex_lock(&khugepaged_mutex); if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); @@ -151,15 +148,17 @@ static int start_khugepaged(void) err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; } - wakeup = !list_empty(&khugepaged_scan.mm_head); - mutex_unlock(&khugepaged_mutex); - if (wakeup) + + if (!list_empty(&khugepaged_scan.mm_head)) wake_up_interruptible(&khugepaged_wait); set_recommended_min_free_kbytes(); - } else + } else if (khugepaged_thread) { /* wakeup to exit */ wake_up_interruptible(&khugepaged_wait); + kthread_stop(khugepaged_thread); + khugepaged_thread = NULL; + } return err; } @@ -221,7 +220,12 @@ static ssize_t enabled_store(struct kobject *kobj, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); if (ret > 0) { - int err = start_khugepaged(); + int err; + + mutex_lock(&khugepaged_mutex); + err = start_khugepaged(); + mutex_unlock(&khugepaged_mutex); + if (err) ret = err; } @@ -2329,20 +2333,10 @@ static int khugepaged(void *none) set_freezable(); set_user_nice(current, 19); - /* serialize with start_khugepaged() */ - mutex_lock(&khugepaged_mutex); - - for (;;) { - mutex_unlock(&khugepaged_mutex); + while (!kthread_should_stop()) { VM_BUG_ON(khugepaged_thread != current); khugepaged_loop(); VM_BUG_ON(khugepaged_thread != current); - - mutex_lock(&khugepaged_mutex); - if (!khugepaged_enabled()) - break; - if (unlikely(kthread_should_stop())) - break; } spin_lock(&khugepaged_mm_lock); @@ -2351,10 +2345,6 @@ static int khugepaged(void *none) if (mm_slot) collect_mm_slot(mm_slot); spin_unlock(&khugepaged_mm_lock); - - khugepaged_thread = NULL; - mutex_unlock(&khugepaged_mutex); - return 0; } From e060f0e0139b83f05bb90fa05563d14179b9a7ff Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 
2012 16:29:42 -0700 Subject: [PATCH 038/160] thp: remove unnecessary khugepaged_thread check Now, khugepaged creation and cancel are completely serial under the protection of khugepaged_mutex, it is impossible that many khugepaged entities are running Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0931b2b19c52..46e3f150a6ca 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2333,11 +2333,8 @@ static int khugepaged(void *none) set_freezable(); set_user_nice(current, 19); - while (!kthread_should_stop()) { - VM_BUG_ON(khugepaged_thread != current); + while (!kthread_should_stop()) khugepaged_loop(); - VM_BUG_ON(khugepaged_thread != current); - } spin_lock(&khugepaged_mm_lock); mm_slot = khugepaged_scan.mm_slot; From 2017c0bff8ba79ea527361adbe19471e174775d6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:44 -0700 Subject: [PATCH 039/160] thp: remove wake_up_interruptible in the exit path Add the check of kthread_should_stop() to the conditions which are used to wakeup on khugepaged_wait, then kthread_stop is enough to let the thread exit Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 46e3f150a6ca..a94c07a1b3c5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -154,8 +154,6 @@ static int start_khugepaged(void) set_recommended_min_free_kbytes(); } else if (khugepaged_thread) { - /* wakeup to exit */ - wake_up_interruptible(&khugepaged_wait); kthread_stop(khugepaged_thread); khugepaged_thread = NULL; } @@ -2221,7 +2219,7 @@ static int khugepaged_has_work(void) static int khugepaged_wait_event(void) { return !list_empty(&khugepaged_scan.mm_head) || - !khugepaged_enabled(); + kthread_should_stop(); } static void khugepaged_do_scan(struct page **hpage) @@ -2288,6 +2286,24 @@ static struct page *khugepaged_alloc_hugepage(void) } #endif +static void khugepaged_wait_work(void) +{ + try_to_freeze(); + + if (khugepaged_has_work()) { + if (!khugepaged_scan_sleep_millisecs) + return; + + wait_event_freezable_timeout(khugepaged_wait, + kthread_should_stop(), + msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); + return; + } + + if (khugepaged_enabled()) + wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); +} + static void khugepaged_loop(void) { struct page *hpage; @@ -2312,17 +2328,8 @@ static void khugepaged_loop(void) if (hpage) put_page(hpage); #endif - try_to_freeze(); - if (unlikely(kthread_should_stop())) - break; - if (khugepaged_has_work()) { - if (!khugepaged_scan_sleep_millisecs) - continue; - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); - } else if (khugepaged_enabled()) - wait_event_freezable(khugepaged_wait, - khugepaged_wait_event()); + + khugepaged_wait_work(); } } From 9817626e722a5e5699cf38f5d3a4c9851e054436 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:46 -0700 Subject: [PATCH 040/160] thp: remove some code depend on CONFIG_NUMA If NUMA is disabled, hpage is used as page pre-alloc, so there are two cases for hpage: - it is !NULL, means the page is not consumed 
otherwise, - the page has been consumed If NUMA is enabled, hpage is just used as alloc-fail indicator which is not a real page, NULL means not fail triggered. So, we can release the page only if !IS_ERR_OR_NULL Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a94c07a1b3c5..1e21b4cf4c75 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2306,11 +2306,8 @@ static void khugepaged_wait_work(void) static void khugepaged_loop(void) { - struct page *hpage; + struct page *hpage = NULL; -#ifdef CONFIG_NUMA - hpage = NULL; -#endif while (likely(khugepaged_enabled())) { #ifndef CONFIG_NUMA hpage = khugepaged_alloc_hugepage(); @@ -2324,10 +2321,9 @@ static void khugepaged_loop(void) #endif khugepaged_do_scan(&hpage); -#ifndef CONFIG_NUMA - if (hpage) + + if (!IS_ERR_OR_NULL(hpage)) put_page(hpage); -#endif khugepaged_wait_work(); } From d516904bd239fe2c9f1bd46cf146bb4b8831321c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:48 -0700 Subject: [PATCH 041/160] thp: merge page pre-alloc in khugepaged_loop into khugepaged_do_scan There are two pre-alloc operations in these two function, the different is: - it allows to sleep if page alloc fail in khugepaged_loop - it exits immediately if page alloc fail in khugepaged_do_scan Actually, in khugepaged_do_scan, we can allow the pre-alloc to sleep on the first failure, then the operation in khugepaged_loop can be removed Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 99 ++++++++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 53 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1e21b4cf4c75..d5b5fcc73c44 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2222,10 +2222,40 @@ static int khugepaged_wait_event(void) kthread_should_stop(); } -static void khugepaged_do_scan(struct page **hpage) +static void khugepaged_alloc_sleep(void) { + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); +} + +#ifndef CONFIG_NUMA +static struct page *khugepaged_alloc_hugepage(bool *wait) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + if (!*wait) + return NULL; + + *wait = false; + khugepaged_alloc_sleep(); + } else + count_vm_event(THP_COLLAPSE_ALLOC); + } while (unlikely(!hpage) && likely(khugepaged_enabled())); + + return hpage; +} +#endif + +static void khugepaged_do_scan(void) +{ + struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; unsigned int pages = khugepaged_pages_to_scan; + bool wait = true; barrier(); /* write khugepaged_pages_to_scan to local stack */ @@ -2233,17 +2263,18 @@ static void khugepaged_do_scan(struct page **hpage) cond_resched(); #ifndef CONFIG_NUMA - if (!*hpage) { - *hpage = alloc_hugepage(khugepaged_defrag()); - if (unlikely(!*hpage)) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - break; - } - count_vm_event(THP_COLLAPSE_ALLOC); - } -#else - if (IS_ERR(*hpage)) + if (!hpage) + hpage = khugepaged_alloc_hugepage(&wait); + + if (unlikely(!hpage)) break; +#else + if (IS_ERR(hpage)) { + if (!wait) + break; + wait = false; + 
khugepaged_alloc_sleep(); + } #endif if (unlikely(kthread_should_stop() || freezing(current))) @@ -2255,37 +2286,16 @@ static void khugepaged_do_scan(struct page **hpage) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - hpage); + &hpage); else progress = pages; spin_unlock(&khugepaged_mm_lock); } -} -static void khugepaged_alloc_sleep(void) -{ - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + if (!IS_ERR_OR_NULL(hpage)) + put_page(hpage); } -#ifndef CONFIG_NUMA -static struct page *khugepaged_alloc_hugepage(void) -{ - struct page *hpage; - - do { - hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && - likely(khugepaged_enabled())); - return hpage; -} -#endif - static void khugepaged_wait_work(void) { try_to_freeze(); @@ -2306,25 +2316,8 @@ static void khugepaged_wait_work(void) static void khugepaged_loop(void) { - struct page *hpage = NULL; - while (likely(khugepaged_enabled())) { -#ifndef CONFIG_NUMA - hpage = khugepaged_alloc_hugepage(); - if (unlikely(!hpage)) - break; -#else - if (IS_ERR(hpage)) { - khugepaged_alloc_sleep(); - hpage = NULL; - } -#endif - - khugepaged_do_scan(&hpage); - - if (!IS_ERR_OR_NULL(hpage)) - put_page(hpage); - + khugepaged_do_scan(); khugepaged_wait_work(); } } From 420256ef02660af0acf28c12fe4b7d514ca88a4d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:49 -0700 Subject: [PATCH 042/160] thp: release page in page pre-alloc path If NUMA is enabled, we can release the page in the page pre-alloc operation, then the CONFIG_NUMA dependent code can be reduced Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d5b5fcc73c44..9c4390f60c3e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1873,15 +1873,12 @@ static void collapse_huge_page(struct mm_struct *mm, *hpage = ERR_PTR(-ENOMEM); return; } + *hpage = new_page; count_vm_event(THP_COLLAPSE_ALLOC); #endif - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { -#ifdef CONFIG_NUMA - put_page(new_page); -#endif + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) return; - } /* * Prevent all access to pagetables with the exception of @@ -1982,9 +1979,8 @@ static void collapse_huge_page(struct mm_struct *mm, prepare_pmd_huge_pte(pgtable, mm); spin_unlock(&mm->page_table_lock); -#ifndef CONFIG_NUMA *hpage = NULL; -#endif + khugepaged_pages_collapsed++; out_up_write: up_write(&mm->mmap_sem); @@ -1992,9 +1988,6 @@ out_up_write: out: mem_cgroup_uncharge_page(new_page); -#ifdef CONFIG_NUMA - put_page(new_page); -#endif goto out_up_write; } @@ -2260,8 +2253,6 @@ static void khugepaged_do_scan(void) barrier(); /* write khugepaged_pages_to_scan to local stack */ while (progress < pages) { - cond_resched(); - #ifndef CONFIG_NUMA if (!hpage) hpage = khugepaged_alloc_hugepage(&wait); @@ -2274,8 +2265,12 @@ static void khugepaged_do_scan(void) break; wait = false; khugepaged_alloc_sleep(); + } else if (hpage) { + put_page(hpage); + hpage = NULL; } #endif + cond_resched(); if (unlikely(kthread_should_stop() || freezing(current))) break; From 
26234f36ef3ec7efcfa9acb181427849c1f9db7c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:51 -0700 Subject: [PATCH 043/160] thp: introduce khugepaged_prealloc_page and khugepaged_alloc_page They are used to abstract the difference between NUMA enabled and NUMA disabled to make the code more readable Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 186 +++++++++++++++++++++++++++-------------------- 1 file changed, 108 insertions(+), 78 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9c4390f60c3e..f0e999379dd7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1827,11 +1827,110 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } } +static void khugepaged_alloc_sleep(void) +{ + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); +} + +#ifdef CONFIG_NUMA +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (IS_ERR(*hpage)) { + if (!*wait) + return false; + + *wait = false; + khugepaged_alloc_sleep(); + } else if (*hpage) { + put_page(*hpage); + *hpage = NULL; + } + + return true; +} + +static struct page +*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + int node) +{ + VM_BUG_ON(*hpage); + /* + * Allocate the page while the vma is still valid and under + * the mmap_sem read mode so there is no memory allocation + * later when we take the mmap_sem in write mode. This is more + * friendly behavior (OTOH it may actually hide bugs) to + * filesystems in userland with daemons allocating memory in + * the userland I/O paths. Allocating memory with the + * mmap_sem in read mode is good idea also to allow greater + * scalability. + */ + *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, + node, __GFP_OTHER_NODE); + + /* + * After allocating the hugepage, release the mmap_sem read lock in + * preparation for taking it in write mode. 
+ */ + up_read(&mm->mmap_sem); + if (unlikely(!*hpage)) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + *hpage = ERR_PTR(-ENOMEM); + return NULL; + } + + count_vm_event(THP_COLLAPSE_ALLOC); + return *hpage; +} +#else +static struct page *khugepaged_alloc_hugepage(bool *wait) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + if (!*wait) + return NULL; + + *wait = false; + khugepaged_alloc_sleep(); + } else + count_vm_event(THP_COLLAPSE_ALLOC); + } while (unlikely(!hpage) && likely(khugepaged_enabled())); + + return hpage; +} + +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (!*hpage) + *hpage = khugepaged_alloc_hugepage(wait); + + if (unlikely(!*hpage)) + return false; + + return true; +} + +static struct page +*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + int node) +{ + up_read(&mm->mmap_sem); + VM_BUG_ON(!*hpage); + return *hpage; +} +#endif + static void collapse_huge_page(struct mm_struct *mm, - unsigned long address, - struct page **hpage, - struct vm_area_struct *vma, - int node) + unsigned long address, + struct page **hpage, + struct vm_area_struct *vma, + int node) { pgd_t *pgd; pud_t *pud; @@ -1844,38 +1943,11 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long hstart, hend; VM_BUG_ON(address & ~HPAGE_PMD_MASK); -#ifndef CONFIG_NUMA - up_read(&mm->mmap_sem); - VM_BUG_ON(!*hpage); - new_page = *hpage; -#else - VM_BUG_ON(*hpage); - /* - * Allocate the page while the vma is still valid and under - * the mmap_sem read mode so there is no memory allocation - * later when we take the mmap_sem in write mode. This is more - * friendly behavior (OTOH it may actually hide bugs) to - * filesystems in userland with daemons allocating memory in - * the userland I/O paths. Allocating memory with the - * mmap_sem in read mode is good idea also to allow greater - * scalability. - */ - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, - node, __GFP_OTHER_NODE); - /* - * After allocating the hugepage, release the mmap_sem read lock in - * preparation for taking it in write mode. - */ - up_read(&mm->mmap_sem); - if (unlikely(!new_page)) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - *hpage = ERR_PTR(-ENOMEM); + /* release the mmap_sem read lock. 
*/ + new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); + if (!new_page) return; - } - *hpage = new_page; - count_vm_event(THP_COLLAPSE_ALLOC); -#endif if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) return; @@ -2215,34 +2287,6 @@ static int khugepaged_wait_event(void) kthread_should_stop(); } -static void khugepaged_alloc_sleep(void) -{ - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); -} - -#ifndef CONFIG_NUMA -static struct page *khugepaged_alloc_hugepage(bool *wait) -{ - struct page *hpage; - - do { - hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - if (!*wait) - return NULL; - - *wait = false; - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && likely(khugepaged_enabled())); - - return hpage; -} -#endif - static void khugepaged_do_scan(void) { struct page *hpage = NULL; @@ -2253,23 +2297,9 @@ static void khugepaged_do_scan(void) barrier(); /* write khugepaged_pages_to_scan to local stack */ while (progress < pages) { -#ifndef CONFIG_NUMA - if (!hpage) - hpage = khugepaged_alloc_hugepage(&wait); - - if (unlikely(!hpage)) + if (!khugepaged_prealloc_page(&hpage, &wait)) break; -#else - if (IS_ERR(hpage)) { - if (!wait) - break; - wait = false; - khugepaged_alloc_sleep(); - } else if (hpage) { - put_page(hpage); - hpage = NULL; - } -#endif + cond_resched(); if (unlikely(kthread_should_stop() || freezing(current))) From b7231789b0224e73af4efc7973f8bcf17fc16edd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:54 -0700 Subject: [PATCH 044/160] thp: remove khugepaged_loop Merge khugepaged_loop into khugepaged Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f0e999379dd7..77b470b7e3c6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2339,14 +2339,6 @@ static void khugepaged_wait_work(void) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } -static void khugepaged_loop(void) -{ - while (likely(khugepaged_enabled())) { - khugepaged_do_scan(); - khugepaged_wait_work(); - } -} - static int khugepaged(void *none) { struct mm_slot *mm_slot; @@ -2354,8 +2346,10 @@ static int khugepaged(void *none) set_freezable(); set_user_nice(current, 19); - while (!kthread_should_stop()) - khugepaged_loop(); + while (!kthread_should_stop()) { + khugepaged_do_scan(); + khugepaged_wait_work(); + } spin_lock(&khugepaged_mm_lock); mm_slot = khugepaged_scan.mm_slot; From 17c230afa58a6d013a4949d5c04b823a281d40fa Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:56 -0700 Subject: [PATCH 045/160] thp: use khugepaged_enabled to remove duplicate code Use khugepaged_enabled to see whether thp is enabled Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 77b470b7e3c6..a169c4dbcddc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -102,10 +102,7 @@ static int set_recommended_min_free_kbytes(void) unsigned long recommended_min; extern int min_free_kbytes; - if 
(!test_bit(TRANSPARENT_HUGEPAGE_FLAG, - &transparent_hugepage_flags) && - !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - &transparent_hugepage_flags)) + if (!khugepaged_enabled()) return 0; for_each_populated_zone(zone) @@ -228,11 +225,7 @@ static ssize_t enabled_store(struct kobject *kobj, ret = err; } - if (ret > 0 && - (test_bit(TRANSPARENT_HUGEPAGE_FLAG, - &transparent_hugepage_flags) || - test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - &transparent_hugepage_flags))) + if (ret > 0 && khugepaged_enabled()) set_recommended_min_free_kbytes(); return ret; From 227e4047488d3ee0173a914275a7fd207ad51e5b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:59 -0700 Subject: [PATCH 046/160] thp: remove unnecessary set_recommended_min_free_kbytes Since it is called in start_khugepaged Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a169c4dbcddc..e19cc426c522 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -225,9 +225,6 @@ static ssize_t enabled_store(struct kobject *kobj, ret = err; } - if (ret > 0 && khugepaged_enabled()) - set_recommended_min_free_kbytes(); - return ret; } static struct kobj_attribute enabled_attr = @@ -562,8 +559,6 @@ static int __init hugepage_init(void) start_khugepaged(); - set_recommended_min_free_kbytes(); - return 0; out: hugepage_exit_sysfs(hugepage_kobj); From ca42b26ab285edc5ee3f9faa48379d258db53c35 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:01 -0700 Subject: [PATCH 047/160] mm: fix potential anon_vma locking issue in mprotect() Fix an anon_vma locking issue in the following situation: - vma has no anon_vma - next has an anon_vma - vma is being shrunk / next is being expanded, due to an mprotect call We need to take next's anon_vma lock to avoid races with rmap users (such as page migration) while next is being expanded. Signed-off-by: Michel Lespinasse Reviewed-by: Andrea Arcangeli Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 2ba27434e843..e3c365ff1b6a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -573,8 +573,12 @@ again: remove_next = 1 + (end > next->vm_end); */ if (vma->anon_vma && (importer || start != vma->vm_start)) { anon_vma = vma->anon_vma; + VM_BUG_ON(adjust_next && next->anon_vma && + anon_vma != next->anon_vma); + } else if (adjust_next && next->anon_vma) + anon_vma = next->anon_vma; + if (anon_vma) anon_vma_lock(anon_vma); - } if (root) { flush_dcache_mmap_lock(mapping); From 15626062f4a98279c59a2a5208c496cf65cbf8c0 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:04 -0700 Subject: [PATCH 048/160] thp, x86: introduce HAVE_ARCH_TRANSPARENT_HUGEPAGE Cleanup patch in preparation for transparent hugepage support on s390. Adding new architectures to the TRANSPARENT_HUGEPAGE config option can make the "depends" line rather ugly, like "depends on (X86 || (S390 && 64BIT)) && MMU". This patch adds a HAVE_ARCH_TRANSPARENT_HUGEPAGE instead. x86 already has MMU "def_bool y", so the MMU check is superfluous there and HAVE_ARCH_TRANSPARENT_HUGEPAGE can be selected in arch/x86/Kconfig. Signed-off-by: Gerald Schaefer Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 1 + mm/Kconfig | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/Kconfig b/arch/Kconfig index a62965d057f6..550cce4dd648 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -313,4 +313,7 @@ config HAVE_IRQ_TIME_ACCOUNTING Archs need to ensure they use a high enough resolution clock to support irq time accounting and then call enable_sched_clock_irqtime(). +config HAVE_ARCH_TRANSPARENT_HUGEPAGE + bool + source "kernel/gcov/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6119d6c7002e..1ae94bcae5d9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -88,6 +88,7 @@ config X86 select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP select HAVE_BPF_JIT if X86_64 + select HAVE_ARCH_TRANSPARENT_HUGEPAGE select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP diff --git a/mm/Kconfig b/mm/Kconfig index d5c8019c6627..3322342a1ffb 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -318,7 +318,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" - depends on X86 && MMU + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE select COMPACTION help Transparent Hugepages allows the kernel to use huge pages and From e3ebcf64381188a2744a9829a4eb5c2b60f1974c Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:07 -0700 Subject: [PATCH 049/160] thp: remove assumptions on pgtable_t type The thp page table pre-allocation code currently assumes that pgtable_t is of type "struct page *". This may not be true for all architectures, so this patch removes that assumption by replacing the functions prepare_pmd_huge_pte() and get_pmd_huge_pte() with two new functions that can be defined architecture-specific. It also removes two VM_BUG_ON checks for page_count() and page_mapcount() operating on a pgtable_t. Apart from the VM_BUG_ON removal, there will be no functional change introduced by this patch. 
Signed-off-by: Gerald Schaefer Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 8 ++++++ include/linux/huge_mm.h | 1 - mm/huge_memory.c | 50 ++++++----------------------------- mm/pgtable-generic.c | 39 +++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 43 deletions(-) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index c9a612069c8e..044939f21c16 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -162,6 +162,14 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif +#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT +extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); +#endif + +#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW +extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); +#endif + #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 4c59b1131187..6ab47af5a849 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -11,7 +11,6 @@ extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd); -extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm); extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, unsigned long addr, pmd_t *pmd, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e19cc426c522..9ea6d1953765 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -598,19 +598,6 @@ out: } __setup("transparent_hugepage=", setup_transparent_hugepage); -static void prepare_pmd_huge_pte(pgtable_t pgtable, - struct mm_struct *mm) -{ - assert_spin_locked(&mm->page_table_lock); - - /* FIFO */ - if (!mm->pmd_huge_pte) - INIT_LIST_HEAD(&pgtable->lru); - else - list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); - mm->pmd_huge_pte = pgtable; -} - static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) @@ -652,7 +639,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, */ page_add_new_anon_rmap(page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - prepare_pmd_huge_pte(pgtable, mm); + pgtable_trans_huge_deposit(mm, pgtable); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); mm->nr_ptes++; spin_unlock(&mm->page_table_lock); @@ -778,7 +765,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - prepare_pmd_huge_pte(pgtable, dst_mm); + pgtable_trans_huge_deposit(dst_mm, pgtable); dst_mm->nr_ptes++; ret = 0; @@ -789,25 +776,6 @@ out: return ret; } -/* no "address" argument so destroys page coloring of some arch */ -pgtable_t get_pmd_huge_pte(struct mm_struct *mm) -{ - pgtable_t pgtable; - - assert_spin_locked(&mm->page_table_lock); - - /* FIFO */ - pgtable = mm->pmd_huge_pte; - if (list_empty(&pgtable->lru)) - mm->pmd_huge_pte = NULL; - else { - mm->pmd_huge_pte = list_entry(pgtable->lru.next, - struct page, lru); - list_del(&pgtable->lru); - } - return pgtable; -} - static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -863,7 +831,7 @@ static int 
do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pmdp_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = get_pmd_huge_pte(mm); + pgtable = pgtable_trans_huge_withdraw(mm); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1028,7 +996,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (__pmd_trans_huge_lock(pmd, vma) == 1) { struct page *page; pgtable_t pgtable; - pgtable = get_pmd_huge_pte(tlb->mm); + pgtable = pgtable_trans_huge_withdraw(tlb->mm); page = pmd_page(*pmd); pmd_clear(pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); @@ -1345,11 +1313,11 @@ static int __split_huge_page_map(struct page *page, pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); if (pmd) { - pgtable = get_pmd_huge_pte(mm); + pgtable = pgtable_trans_huge_withdraw(mm); pmd_populate(mm, &_pmd, pgtable); - for (i = 0, haddr = address; i < HPAGE_PMD_NR; - i++, haddr += PAGE_SIZE) { + haddr = address; + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { pte_t *pte, entry; BUG_ON(PageCompound(page+i)); entry = mk_pte(page + i, vma->vm_page_prot); @@ -2017,8 +1985,6 @@ static void collapse_huge_page(struct mm_struct *mm, pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); - VM_BUG_ON(page_count(pgtable) != 1); - VM_BUG_ON(page_mapcount(pgtable) != 0); _pmd = mk_pmd(new_page, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); @@ -2036,7 +2002,7 @@ static void collapse_huge_page(struct mm_struct *mm, page_add_new_anon_rmap(new_page, vma, address); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache(vma, address, _pmd); - prepare_pmd_huge_pte(pgtable, mm); + pgtable_trans_huge_deposit(mm, pgtable); spin_unlock(&mm->page_table_lock); *hpage = NULL; diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 74c0ddaa6fa0..29867e083d37 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -120,3 +120,42 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif + +#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +{ + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + if (!mm->pmd_huge_pte) + INIT_LIST_HEAD(&pgtable->lru); + else + list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); + mm->pmd_huge_pte = pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* no "address" argument so destroys page coloring of some arch */ +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +{ + pgtable_t pgtable; + + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + if (list_empty(&pgtable->lru)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = list_entry(pgtable->lru.next, + struct page, lru); + list_del(&pgtable->lru); + } + return pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif From 46dcde735c9d8953bbd8d105ca6779e5b5300c28 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:09 -0700 Subject: [PATCH 050/160] thp: introduce pmdp_invalidate() On s390, a valid page table entry must not be changed while it is attached to any CPU. So instead of pmd_mknotpresent() and set_pmd_at(), an IDTE operation would be necessary there. 
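The generic fallback for this operation, added below, is just the pmd_mknotpresent() store followed by a TLB range flush; an architecture with the above restriction can override it with its own primitive. Roughly (an illustrative sketch, not part of this patch's diff; the s390 backend later in this series does essentially this with IDTE):

	#define __HAVE_ARCH_PMDP_INVALIDATE
	static inline void pmdp_invalidate(struct vm_area_struct *vma,
					   unsigned long address, pmd_t *pmdp)
	{
		/* architecture-private primitive that invalidates the
		 * entry and flushes it on all CPUs in one operation */
		__pmd_idte(address, pmdp);
	}
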
This patch introduces the pmdp_invalidate() function, to allow architecture-specific implementations. Signed-off-by: Gerald Schaefer Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 5 +++++ mm/huge_memory.c | 3 +-- mm/pgtable-generic.c | 11 +++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 044939f21c16..ebdd74fb8d6a 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -170,6 +170,11 @@ extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); #endif +#ifndef __HAVE_ARCH_PMDP_INVALIDATE +extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp); +#endif + #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9ea6d1953765..bec6243b6965 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1361,8 +1361,7 @@ static int __split_huge_page_map(struct page *page, * SMP TLB and finally we write the non-huge version * of the pmd entry with pmd_populate. */ - set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + pmdp_invalidate(vma, address, pmd); pmd_populate(mm, pmd, pgtable); ret = 1; } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 29867e083d37..e642627da6b7 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -159,3 +159,14 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif + +#ifndef __HAVE_ARCH_PMDP_INVALIDATE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif From 8e72033f2a489b6c98c4e3c7cc281b1afd6cb85c Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:12 -0700 Subject: [PATCH 051/160] thp: make MADV_HUGEPAGE check for mm->def_flags This adds a check to hugepage_madvise(), to refuse MADV_HUGEPAGE if VM_NOHUGEPAGE is set in mm->def_flags. On s390, the VM_NOHUGEPAGE flag will be set in mm->def_flags for kvm processes, to prevent any future thp mappings. In order to also prevent MADV_HUGEPAGE on such an mm, hugepage_madvise() should check mm->def_flags. 
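Seen from userspace, madvise(MADV_HUGEPAGE) then fails with EINVAL in such a process. A minimal, hypothetical test program (the 2M length and the mapping flags are illustrative only, not part of this patch):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 2UL << 20;
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		/* in a process whose mm->def_flags contains VM_NOHUGEPAGE
		 * (e.g. a kvm host process on s390 after this series), this
		 * now fails with errno == EINVAL */
		if (madvise(p, len, MADV_HUGEPAGE))
			perror("madvise(MADV_HUGEPAGE)");
		return 0;
	}
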
Signed-off-by: Gerald Schaefer Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bec6243b6965..010d32944d14 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1450,6 +1450,8 @@ out: int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { + struct mm_struct *mm = vma->vm_mm; + switch (advice) { case MADV_HUGEPAGE: /* @@ -1457,6 +1459,8 @@ int hugepage_madvise(struct vm_area_struct *vma, */ if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) return -EINVAL; + if (mm->def_flags & VM_NOHUGEPAGE) + return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* From 75077afbec1ac89178c1542b23a70d0f960b0aaf Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:15 -0700 Subject: [PATCH 052/160] thp, s390: thp splitting backend for s390 This patch is part of the architecture backend for thp on s390. It provides the functions related to thp splitting, including serialization against gup. Unlike other archs, pmdp_splitting_flush() cannot use a tlb flushing operation to serialize against gup on s390, because that wouldn't be stopped by the disabled IRQs. So instead, smp_call_function() is called with an empty function, which will have the expected effect. Signed-off-by: Gerald Schaefer Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 13 +++++++++++++ arch/s390/mm/gup.c | 11 ++++++++++- arch/s390/mm/pgtable.c | 18 ++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 6bd7d7483017..c3f0775e71f1 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -347,6 +347,8 @@ extern struct page *vmemmap; #define _SEGMENT_ENTRY_LARGE 0x400 /* STE-format control, large page */ #define _SEGMENT_ENTRY_CO 0x100 /* change-recording override */ +#define _SEGMENT_ENTRY_SPLIT_BIT 0 /* THP splitting bit number */ +#define _SEGMENT_ENTRY_SPLIT (1UL << _SEGMENT_ENTRY_SPLIT_BIT) /* Page status table bits for virtualization */ #define RCP_ACC_BITS 0xf000000000000000UL @@ -506,6 +508,10 @@ static inline int pmd_bad(pmd_t pmd) return (pmd_val(pmd) & mask) != _SEGMENT_ENTRY; } +#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + static inline int pte_none(pte_t pte) { return (pte_val(pte) & _PAGE_INVALID) && !(pte_val(pte) & _PAGE_SWT); @@ -1159,6 +1165,13 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) #define pte_unmap(pte) do { } while (0) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + /* * 31 bit swap entry format: * A page-table entry has some bits we have to treat in a special way. 
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index eeaf8023851f..60acb93a4680 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -115,7 +115,16 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, pmd = *pmdp; barrier(); next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + /* + * The pmd_trans_splitting() check below explains why + * pmdp_splitting_flush() has to serialize with + * smp_call_function() against our disabled IRQs, to stop + * this gup-fast code from running while we set the + * splitting bit in the pmd. Returning zero will take + * the slow path that will call wait_split_huge_page() + * if the pmd is still in splitting state. + */ + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; if (unlikely(pmd_huge(pmd))) { if (!gup_huge_pmd(pmdp, pmd, addr, next, diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index b402991e43d7..a6131d1fe6c0 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -866,3 +866,21 @@ bool kernel_page_present(struct page *page) return cc == 0; } #endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pmdp_splitting_flush_sync(void *arg) +{ + /* Simply deliver the interrupt */ +} + +void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT, + (unsigned long *) pmdp)) { + /* need to serialize against gup-fast (IRQ disabled) */ + smp_call_function(pmdp_splitting_flush_sync, NULL, 1); + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ From 9501d09fa3c4ca18971083dfb0c9aa1afc85f19c Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:18 -0700 Subject: [PATCH 053/160] thp, s390: thp pagetable pre-allocation for s390 This patch is part of the architecture backend for thp on s390. It provides the pagetable pre-allocation functions pgtable_trans_huge_deposit() and pgtable_trans_huge_withdraw(). Unlike other archs, s390 has no struct page * as pgtable_t, but rather a pointer to the page table. So instead of saving the pagetable pre- allocation list info inside the struct page, it is being saved within the pagetable itself. 
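The trick relies on the deposited page table not being installed anywhere while it sits on the list, so its first entries can double as the list linkage. A sketch of the size assumption behind that (illustrative only, not part of this patch's diff):

	/* on 64-bit s390 a struct list_head is two 8-byte pointers, i.e.
	 * exactly the first two pte slots of the 2K page table; this is
	 * also why the withdraw path below re-initialises two ptes to
	 * _PAGE_TYPE_EMPTY before handing the table back to the caller */
	struct list_head *lh = (struct list_head *) pgtable;

	BUILD_BUG_ON(sizeof(struct list_head) != 2 * sizeof(pte_t));
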
Signed-off-by: Gerald Schaefer Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 6 ++++++ arch/s390/mm/pgtable.c | 38 +++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index c3f0775e71f1..2c8f00d31cfe 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1166,6 +1166,12 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) #define pte_unmap(pte) do { } while (0) #ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __HAVE_ARCH_PGTABLE_DEPOSIT +extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); + +#define __HAVE_ARCH_PGTABLE_WITHDRAW +extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); + static inline int pmd_trans_splitting(pmd_t pmd) { return pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT; diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index a6131d1fe6c0..1ca371a06516 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -883,4 +883,42 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, smp_call_function(pmdp_splitting_flush_sync, NULL, 1); } } + +void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +{ + struct list_head *lh = (struct list_head *) pgtable; + + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + if (!mm->pmd_huge_pte) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) mm->pmd_huge_pte); + mm->pmd_huge_pte = pgtable; +} + +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +{ + struct list_head *lh; + pgtable_t pgtable; + pte_t *ptep; + + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = (pgtable_t) lh->next; + list_del(lh); + } + ptep = (pte_t *) pgtable; + pte_val(*ptep) = _PAGE_TYPE_EMPTY; + ptep++; + pte_val(*ptep) = _PAGE_TYPE_EMPTY; + return pgtable; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ From 274023da1e8a49efa6fd9bf857f8557e5db44cdf Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:21 -0700 Subject: [PATCH 054/160] thp, s390: disable thp for kvm host on s390 This patch is part of the architecture backend for thp on s390. It disables thp for kvm hosts, because there is no kvm host hugepage support so far. Existing thp mappings are split by follow_page() with FOLL_SPLIT, and future thp mappings are prevented by setting VM_NOHUGEPAGE in mm->def_flags. 
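Condensed from the s390_enable_sie() hunk below, the sequence is (sketch only):

	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		/* split whatever thp mappings the process already has ... */
		thp_split_mm(mm);
		/* ... and keep new ones from appearing behind kvm's back */
		mm->def_flags |= VM_NOHUGEPAGE;
	#endif
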
Signed-off-by: Gerald Schaefer Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/pgtable.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 1ca371a06516..e86a523875eb 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -787,6 +787,30 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) tlb_table_flush(tlb); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void thp_split_vma(struct vm_area_struct *vma) +{ + unsigned long addr; + struct page *page; + + for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { + page = follow_page(vma, addr, FOLL_SPLIT); + } +} + +void thp_split_mm(struct mm_struct *mm) +{ + struct vm_area_struct *vma = mm->mmap; + + while (vma != NULL) { + thp_split_vma(vma); + vma->vm_flags &= ~VM_HUGEPAGE; + vma->vm_flags |= VM_NOHUGEPAGE; + vma = vma->vm_next; + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + /* * switch on pgstes for its userspace process (for kvm) */ @@ -824,6 +848,12 @@ int s390_enable_sie(void) if (!mm) return -ENOMEM; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* split thp mappings and disable thp for future mappings */ + thp_split_mm(mm); + mm->def_flags |= VM_NOHUGEPAGE; +#endif + /* Now lets check again if something happened */ task_lock(tsk); if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || From 1ae1c1d09f220ded48ee9a7d91a65e94f95c4af1 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:24 -0700 Subject: [PATCH 055/160] thp, s390: architecture backend for thp on s390 This implements the architecture backend for transparent hugepages on s390. 
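For orientation only, a summary of the pmd helpers this backend has to provide so that mm/huge_memory.c can work on s390; the list is extracted from the hunks below and shown here as prototypes, it is not additional code:

	int   pmd_trans_huge(pmd_t pmd);
	int   pmd_trans_splitting(pmd_t pmd);
	int   pmd_write(pmd_t pmd);
	pmd_t pmd_mkhuge(pmd_t pmd);
	pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
	pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
	void  set_pmd_at(struct mm_struct *mm, unsigned long addr,
			 pmd_t *pmdp, pmd_t entry);
	int   pmdp_test_and_clear_young(struct vm_area_struct *vma,
					unsigned long address, pmd_t *pmdp);
	pmd_t pmdp_get_and_clear(struct mm_struct *mm,
				 unsigned long address, pmd_t *pmdp);
	int   has_transparent_hugepage(void);
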
Signed-off-by: Gerald Schaefer Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/hugetlb.h | 18 +-- arch/s390/include/asm/pgtable.h | 190 ++++++++++++++++++++++++++++++++ arch/s390/include/asm/setup.h | 5 +- arch/s390/include/asm/tlb.h | 1 + arch/s390/kernel/early.c | 2 + arch/s390/mm/pgtable.c | 22 ++++ 6 files changed, 220 insertions(+), 18 deletions(-) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index fc322421b1cc..593753ee07f3 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -78,23 +78,6 @@ static inline void __pmd_csp(pmd_t *pmdp) " csp %1,%3" : "=m" (*pmdp) : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc"); - pmd_val(*pmdp) = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY; -} - -static inline void __pmd_idte(unsigned long address, pmd_t *pmdp) -{ - unsigned long sto = (unsigned long) pmdp - - pmd_index(address) * sizeof(pmd_t); - - if (!(pmd_val(*pmdp) & _SEGMENT_ENTRY_INV)) { - asm volatile( - " .insn rrf,0xb98e0000,%2,%3,0,0" - : "=m" (*pmdp) - : "m" (*pmdp), "a" (sto), - "a" ((address & HPAGE_MASK)) - ); - } - pmd_val(*pmdp) = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY; } static inline void huge_ptep_invalidate(struct mm_struct *mm, @@ -106,6 +89,7 @@ static inline void huge_ptep_invalidate(struct mm_struct *mm, __pmd_idte(address, pmdp); else __pmd_csp(pmdp); + pmd_val(*pmdp) = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY; } static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2c8f00d31cfe..ed14fc2db6e0 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -350,6 +350,10 @@ extern struct page *vmemmap; #define _SEGMENT_ENTRY_SPLIT_BIT 0 /* THP splitting bit number */ #define _SEGMENT_ENTRY_SPLIT (1UL << _SEGMENT_ENTRY_SPLIT_BIT) +/* Set of bits not changed in pmd_modify */ +#define _SEGMENT_CHG_MASK (_SEGMENT_ENTRY_ORIGIN | _SEGMENT_ENTRY_LARGE \ + | _SEGMENT_ENTRY_SPLIT | _SEGMENT_ENTRY_CO) + /* Page status table bits for virtualization */ #define RCP_ACC_BITS 0xf000000000000000UL #define RCP_FP_BIT 0x0800000000000000UL @@ -512,6 +516,26 @@ static inline int pmd_bad(pmd_t pmd) extern void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp); +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); + +#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +extern int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + +#define __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + return (pmd_val(pmd) & _SEGMENT_ENTRY_RO) == 0; +} + +static inline int pmd_young(pmd_t pmd) +{ + return 0; +} + static inline int pte_none(pte_t pte) { return (pte_val(pte) & _PAGE_INVALID) && !(pte_val(pte) & _PAGE_SWT); @@ -1165,6 +1189,22 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) #define pte_unmap(pte) do { } while (0) +static inline void __pmd_idte(unsigned long address, pmd_t *pmdp) +{ + unsigned long sto = (unsigned long) pmdp - + pmd_index(address) * sizeof(pmd_t); + + if (!(pmd_val(*pmdp) & _SEGMENT_ENTRY_INV)) { + asm volatile( + " .insn rrf,0xb98e0000,%2,%3,0,0" + : "=m" (*pmdp) + : "m" (*pmdp), 
"a" (sto), + "a" ((address & HPAGE_MASK)) + : "cc" + ); + } +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); @@ -1176,6 +1216,156 @@ static inline int pmd_trans_splitting(pmd_t pmd) { return pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT; } + +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t entry) +{ + *pmdp = entry; +} + +static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot) +{ + unsigned long pgprot_pmd = 0; + + if (pgprot_val(pgprot) & _PAGE_INVALID) { + if (pgprot_val(pgprot) & _PAGE_SWT) + pgprot_pmd |= _HPAGE_TYPE_NONE; + pgprot_pmd |= _SEGMENT_ENTRY_INV; + } + if (pgprot_val(pgprot) & _PAGE_RO) + pgprot_pmd |= _SEGMENT_ENTRY_RO; + return pgprot_pmd; +} + +static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + pmd_val(pmd) &= _SEGMENT_CHG_MASK; + pmd_val(pmd) |= massage_pgprot_pmd(newprot); + return pmd; +} + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE; + return pmd; +} + +static inline pmd_t pmd_mkwrite(pmd_t pmd) +{ + pmd_val(pmd) &= ~_SEGMENT_ENTRY_RO; + return pmd; +} + +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + pmd_val(pmd) |= _SEGMENT_ENTRY_RO; + return pmd; +} + +static inline pmd_t pmd_mkdirty(pmd_t pmd) +{ + /* No dirty bit in the segment table entry. */ + return pmd; +} + +static inline pmd_t pmd_mkold(pmd_t pmd) +{ + /* No referenced bit in the segment table entry. */ + return pmd; +} + +static inline pmd_t pmd_mkyoung(pmd_t pmd) +{ + /* No referenced bit in the segment table entry. */ + return pmd; +} + +#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + unsigned long pmd_addr = pmd_val(*pmdp) & HPAGE_MASK; + long tmp, rc; + int counter; + + rc = 0; + if (MACHINE_HAS_RRBM) { + counter = PTRS_PER_PTE >> 6; + asm volatile( + "0: .insn rre,0xb9ae0000,%0,%3\n" /* rrbm */ + " ogr %1,%0\n" + " la %3,0(%4,%3)\n" + " brct %2,0b\n" + : "=&d" (tmp), "+&d" (rc), "+d" (counter), + "+a" (pmd_addr) + : "a" (64 * 4096UL) : "cc"); + rc = !!rc; + } else { + counter = PTRS_PER_PTE; + asm volatile( + "0: rrbe 0,%2\n" + " la %2,0(%3,%2)\n" + " brc 12,1f\n" + " lhi %0,1\n" + "1: brct %1,0b\n" + : "+d" (rc), "+d" (counter), "+a" (pmd_addr) + : "a" (4096UL) : "cc"); + } + return rc; +} + +#define __HAVE_ARCH_PMDP_GET_AND_CLEAR +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + + __pmd_idte(address, pmdp); + pmd_clear(pmdp); + return pmd; +} + +#define __HAVE_ARCH_PMDP_CLEAR_FLUSH +static inline pmd_t pmdp_clear_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return pmdp_get_and_clear(vma->vm_mm, address, pmdp); +} + +#define __HAVE_ARCH_PMDP_INVALIDATE +static inline void pmdp_invalidate(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + __pmd_idte(address, pmdp); +} + +static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot) +{ + pmd_t __pmd; + pmd_val(__pmd) = physpage + massage_pgprot_pmd(pgprot); + return __pmd; +} + +#define pfn_pmd(pfn, pgprot) mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot)) +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) + +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE; +} + +static inline int has_transparent_hugepage(void) +{ + return MACHINE_HAS_HPAGE 
? 1 : 0; +} + +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + if (pmd_trans_huge(pmd)) + return pmd_val(pmd) >> HPAGE_SHIFT; + else + return pmd_val(pmd) >> PAGE_SHIFT; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 87b47ca954f1..8cfd731a18d8 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -81,6 +81,7 @@ extern unsigned int s390_user_mode; #define MACHINE_FLAG_SPP (1UL << 13) #define MACHINE_FLAG_TOPOLOGY (1UL << 14) #define MACHINE_FLAG_TE (1UL << 15) +#define MACHINE_FLAG_RRBM (1UL << 16) #define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM) #define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM) @@ -99,7 +100,8 @@ extern unsigned int s390_user_mode; #define MACHINE_HAS_PFMF (0) #define MACHINE_HAS_SPP (0) #define MACHINE_HAS_TOPOLOGY (0) -#define MACHINE_HAS_TE (0) +#define MACHINE_HAS_TE (0) +#define MACHINE_HAS_RRBM (0) #else /* CONFIG_64BIT */ #define MACHINE_HAS_IEEE (1) #define MACHINE_HAS_CSP (1) @@ -112,6 +114,7 @@ extern unsigned int s390_user_mode; #define MACHINE_HAS_SPP (S390_lowcore.machine_flags & MACHINE_FLAG_SPP) #define MACHINE_HAS_TOPOLOGY (S390_lowcore.machine_flags & MACHINE_FLAG_TOPOLOGY) #define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) +#define MACHINE_HAS_RRBM (S390_lowcore.machine_flags & MACHINE_FLAG_RRBM) #endif /* CONFIG_64BIT */ #define ZFCPDUMP_HSA_SIZE (32UL<<20) diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 06e5acbc84bd..b75d7d686684 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -137,6 +137,7 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) #define tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0) +#define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr) do { } while (0) #define tlb_migrate_finish(mm) do { } while (0) #endif /* _S390_TLB_H */ diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 7f4717675c19..00d114445068 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -388,6 +388,8 @@ static __init void detect_machine_facilities(void) S390_lowcore.machine_flags |= MACHINE_FLAG_SPP; if (test_facility(50) && test_facility(73)) S390_lowcore.machine_flags |= MACHINE_FLAG_TE; + if (test_facility(66)) + S390_lowcore.machine_flags |= MACHINE_FLAG_RRBM; #endif } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index e86a523875eb..c8188a18af05 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -898,6 +898,28 @@ bool kernel_page_present(struct page *page) #endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + /* No need to flush TLB + * On s390 reference bits are in storage key and never in TLB */ + return pmdp_test_and_clear_young(vma, address, pmdp); +} + +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + if (pmd_same(*pmdp, entry)) + return 0; + pmdp_invalidate(vma, address, pmdp); + set_pmd_at(vma->vm_mm, address, pmdp, entry); + return 1; +} + static void pmdp_splitting_flush_sync(void *arg) { /* Simply deliver the interrupt */ From 
1638113d9d8b7e04c1eeae9014d43f6381a74040 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:26 -0700 Subject: [PATCH 056/160] ipc/mqueue: remove unnecessary rb_init_node() calls Commit d6629859b36d ("ipc/mqueue: improve performance of send/recv") and ce2d52cc ("ipc/mqueue: add rbtree node caching support") introduced an rbtree of message priorities, and usage of rb_init_node() to initialize the corresponding nodes. As it turns out, rb_init_node() is unnecessary here, as the nodes are fully initialized on insertion by rb_link_node() and the code doesn't access nodes that aren't inserted on the rbtree. Removing the rb_init_node() calls as I removed that function during rbtree API cleanups (the only other use of it was in a place that similarly didn't require it). Signed-off-by: Michel Lespinasse Acked-by: Doug Ledford Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/mqueue.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 6d255e535d03..6b97e2466fad 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -142,7 +142,6 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info) leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC); if (!leaf) return -ENOMEM; - rb_init_node(&leaf->rb_node); INIT_LIST_HEAD(&leaf->msg_list); info->qsize += sizeof(*leaf); } @@ -1013,7 +1012,6 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, if (!info->node_cache && new_leaf) { /* Save our speculative allocation into the cache */ - rb_init_node(&new_leaf->rb_node); INIT_LIST_HEAD(&new_leaf->msg_list); info->node_cache = new_leaf; info->qsize += sizeof(*new_leaf); @@ -1121,7 +1119,6 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr, if (!info->node_cache && new_leaf) { /* Save our speculative allocation into the cache */ - rb_init_node(&new_leaf->rb_node); INIT_LIST_HEAD(&new_leaf->msg_list); info->node_cache = new_leaf; info->qsize += sizeof(*new_leaf); From 1457d2877864d918c546735bd89c29d5e2a542f1 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:28 -0700 Subject: [PATCH 057/160] rbtree: reference Documentation/rbtree.txt for usage instructions I recently started looking at the rbtree code (with an eye towards improving the augmented rbtree support, but I haven't gotten there yet). I noticed a lot of possible speed improvements, which I am now proposing in this patch set. Patches 1-4 are preparatory: remove internal functions from rbtree.h so that users won't be tempted to use them instead of the documented APIs, clean up some incorrect usages I've noticed (in particular, with the recently added fs/proc/proc_sysctl.c rbtree usage), reference the documentation so that people have one less excuse to miss it, etc. Patch 5 is a small module I wrote to check the rbtree performance. It creates 100 nodes with random keys and repeatedly inserts and erases them from an rbtree. Additionally, it has code to check for rbtree invariants after each insert or erase operation. Patches 6-12 is where the rbtree optimizations are done, and they touch only that one file, lib/rbtree.c . I am getting good results out of these - in my small benchmark doing rbtree insertion (including search) and erase, I'm seeing a 30% runtime reduction on Sandybridge E5, which is more than I initially thought would be possible. (the results aren't as impressive on my two other test hosts though, AMD barcelona and Intel Westmere, where I am seeing 14% runtime reduction only). 
The code size - both source (ommiting comments) and compiled - is also shorter after these changes. However, I do admit that the updated code is more arduous to read - one big reason for that is the removal of the tree rotation helpers, which added some overhead but also made it easier to reason about things locally. Overall, I believe this is an acceptable compromise, given that this code doesn't get modified very often, and that I have good tests for it. Upon Peter's suggestion, I added comments showing the rtree configuration before every rotation. I think they help; however it's still best to have a copy of the cormen/leiserson/rivest book when digging into this code. This patch: reference Documentation/rbtree.txt for usage instructions include/linux/rbtree.h included some basic usage instructions, while Documentation/rbtree.txt had some more complete and easier to follow instructions. Replacing the former with a reference to the latter. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree.h | 67 +----------------------------------------- 1 file changed, 1 insertion(+), 66 deletions(-) diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 033b507b33b1..e6a807720ded 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -23,72 +23,7 @@ I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... - Some example of insert and search follows here. The search is a plain - normal search over an ordered tree. The insert instead must be implemented - in two steps: First, the code must insert the element in order as a red leaf - in the tree, and then the support library function rb_insert_color() must - be called. Such function will do the not trivial work to rebalance the - rbtree, if necessary. - ------------------------------------------------------------------------ -static inline struct page * rb_search_page_cache(struct inode * inode, - unsigned long offset) -{ - struct rb_node * n = inode->i_rb_page_cache.rb_node; - struct page * page; - - while (n) - { - page = rb_entry(n, struct page, rb_page_cache); - - if (offset < page->offset) - n = n->rb_left; - else if (offset > page->offset) - n = n->rb_right; - else - return page; - } - return NULL; -} - -static inline struct page * __rb_insert_page_cache(struct inode * inode, - unsigned long offset, - struct rb_node * node) -{ - struct rb_node ** p = &inode->i_rb_page_cache.rb_node; - struct rb_node * parent = NULL; - struct page * page; - - while (*p) - { - parent = *p; - page = rb_entry(parent, struct page, rb_page_cache); - - if (offset < page->offset) - p = &(*p)->rb_left; - else if (offset > page->offset) - p = &(*p)->rb_right; - else - return page; - } - - rb_link_node(node, parent, p); - - return NULL; -} - -static inline struct page * rb_insert_page_cache(struct inode * inode, - unsigned long offset, - struct rb_node * node) -{ - struct page * ret; - if ((ret = __rb_insert_page_cache(inode, offset, node))) - goto out; - rb_insert_color(node, &inode->i_rb_page_cache); - out: - return ret; -} ------------------------------------------------------------------------ + See Documentation/rbtree.txt for documentation and samples. 
*/ #ifndef _LINUX_RBTREE_H From 4c199a93a2d36b277a9fd209a0f2793f8460a215 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:32 -0700 Subject: [PATCH 058/160] rbtree: empty nodes have no color Empty nodes have no color. We can make use of this property to simplify the code emitted by the RB_EMPTY_NODE and RB_CLEAR_NODE macros. Also, we can get rid of the rb_init_node function which had been introduced by commit 88d19cf37952 ("timers: Add rb_init_node() to allow for stack allocated rb nodes") to avoid some issue with the empty node's color not being initialized. I'm not sure what the RB_EMPTY_NODE checks in rb_prev() / rb_next() are doing there, though. axboe introduced them in commit 10fd48f2376d ("rbtree: fixed reversed RB_EMPTY_NODE and rb_next/prev"). The way I see it, the 'empty node' abstraction is only used by rbtree users to flag nodes that they haven't inserted in any rbtree, so asking the predecessor or successor of such nodes doesn't make any sense. One final rb_init_node() caller was recently added in sysctl code to implement faster sysctl name lookups. This code doesn't make use of RB_EMPTY_NODE at all, and from what I could see it only called rb_init_node() under the mistaken assumption that such initialization was required before node insertion. [sfr@canb.auug.org.au: fix net/ceph/osd_client.c build] Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Cc: John Stultz Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 4 +--- include/linux/rbtree.h | 15 +++++---------- include/linux/timerqueue.h | 2 +- lib/rbtree.c | 4 ++-- net/ceph/osd_client.c | 1 - 5 files changed, 9 insertions(+), 17 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index dcd56f84db7e..fddc50729632 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -168,10 +168,8 @@ static void init_header(struct ctl_table_header *head, head->node = node; if (node) { struct ctl_table *entry; - for (entry = table; entry->procname; entry++, node++) { - rb_init_node(&node->node); + for (entry = table; entry->procname; entry++, node++) node->header = head; - } } } diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index e6a807720ded..2049087c43b7 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -67,17 +67,12 @@ static inline void rb_set_color(struct rb_node *rb, int color) #define RB_ROOT (struct rb_root) { NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) -#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) -#define RB_EMPTY_NODE(node) (rb_parent(node) == node) -#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) +#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) + +/* 'empty' nodes are nodes that are known not to be inserted in an rbree */ +#define RB_EMPTY_NODE(node) ((node)->rb_parent_color == (unsigned long)(node)) +#define RB_CLEAR_NODE(node) ((node)->rb_parent_color = (unsigned long)(node)) -static inline void rb_init_node(struct rb_node *rb) -{ - rb->rb_parent_color = 0; - rb->rb_right = NULL; - rb->rb_left = NULL; - RB_CLEAR_NODE(rb); -} extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index 5088727478fd..a520fd70a59f 100644 --- a/include/linux/timerqueue.h +++ 
b/include/linux/timerqueue.h @@ -39,7 +39,7 @@ struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) static inline void timerqueue_init(struct timerqueue_node *node) { - rb_init_node(&node->node); + RB_CLEAR_NODE(&node->node); } static inline void timerqueue_init_head(struct timerqueue_head *head) diff --git a/lib/rbtree.c b/lib/rbtree.c index d4175565dc2c..fe43c8c5f527 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -387,7 +387,7 @@ struct rb_node *rb_next(const struct rb_node *node) { struct rb_node *parent; - if (rb_parent(node) == node) + if (RB_EMPTY_NODE(node)) return NULL; /* If we have a right-hand child, go down and then left as far @@ -416,7 +416,7 @@ struct rb_node *rb_prev(const struct rb_node *node) { struct rb_node *parent; - if (rb_parent(node) == node) + if (RB_EMPTY_NODE(node)) return NULL; /* If we have a left-hand child, go down and then right as far diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ccbdfbba9e53..c1d756cc7448 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -221,7 +221,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, kref_init(&req->r_kref); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); - rb_init_node(&req->r_node); INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_linger_item); INIT_LIST_HEAD(&req->r_linger_osd); From ea5272f5c94fb2ee62f4f15a5b88eef6184cd506 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:35 -0700 Subject: [PATCH 059/160] rbtree: fix incorrect rbtree node insertion in fs/proc/proc_sysctl.c The recently added code to use rbtrees in sysctl did not follow the proper rbtree interface on insertion - it was calling rb_link_node() which inserts a new node into the binary tree, but missed the call to rb_insert_color() which properly balances the rbtree and establishes all expected rbtree invariants. I found out about this only because faulty commit also used rb_init_node(), which I am removing within this patchset. But I think it's an easy mistake to make, and it makes me wonder if we should change the rbtree API so that insertions would be done with a single rb_insert() call (even if its implementation could still inline the rb_link_node() part and call a private __rb_insert_color function to do the rebalancing). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index fddc50729632..a781bdf06694 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -142,6 +142,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) } rb_link_node(node, parent, p); + rb_insert_color(node, &head->parent->root); return 0; } From bf7ad8eeab995710c766df49c9c69a8592ca0216 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:37 -0700 Subject: [PATCH 060/160] rbtree: move some implementation details from rbtree.h to rbtree.c rbtree users must use the documented APIs to manipulate the tree structure. 
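As a reminder of what that documented API looks like in practice, a minimal insertion sketch in the style of Documentation/rbtree.txt (illustrative only, "my_node" and "key" are made-up names, not part of this patch):

	struct my_node {
		struct rb_node rb;
		unsigned long key;
	};

	static void my_insert(struct rb_root *root, struct my_node *new)
	{
		struct rb_node **link = &root->rb_node, *parent = NULL;

		while (*link) {
			struct my_node *cur =
				rb_entry(*link, struct my_node, rb);

			parent = *link;
			if (new->key < cur->key)
				link = &(*link)->rb_left;
			else
				link = &(*link)->rb_right;
		}

		/* attach as a red leaf, then let the library rebalance */
		rb_link_node(&new->rb, parent, link);
		rb_insert_color(&new->rb, root);
	}
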
Low-level helpers to manipulate node colors and parenthood are not part of that API, so move them to lib/rbtree.c [dwmw2@infradead.org: fix jffs2 build issue due to renamed __rb_parent_color field] Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jffs2/readinode.c | 13 ++++++++----- include/linux/rbtree.h | 34 +++++++++------------------------- lib/rbtree.c | 20 +++++++++++++++++++- 3 files changed, 36 insertions(+), 31 deletions(-) diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 1ea349fff68b..ae81b01e6fd7 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -394,8 +394,11 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, } /* Trivial function to remove the last node in the tree. Which by definition - has no right-hand -- so can be removed just by making its only child (if - any) take its place under its parent. */ + has no right-hand child — so can be removed just by making its left-hand + child (if any) take its place under its parent. Since this is only done + when we're consuming the whole tree, there's no need to use rb_erase() + and let it worry about adjusting colours and balancing the tree. That + would just be a waste of time. */ static void eat_last(struct rb_root *root, struct rb_node *node) { struct rb_node *parent = rb_parent(node); @@ -412,12 +415,12 @@ static void eat_last(struct rb_root *root, struct rb_node *node) link = &parent->rb_right; *link = node->rb_left; - /* Colour doesn't matter now. Only the parent pointer. */ if (node->rb_left) - node->rb_left->rb_parent_color = node->rb_parent_color; + node->rb_left->__rb_parent_color = node->__rb_parent_color; } -/* We put this in reverse order, so we can just use eat_last */ +/* We put the version tree in reverse order, so we can use the same eat_last() + function that we use to consume the tmpnode tree (tn_root). 
*/ static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn) { struct rb_node **link = &ver_root->rb_node; diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 2049087c43b7..bf836a2c6a32 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -32,37 +32,19 @@ #include #include -struct rb_node -{ - unsigned long rb_parent_color; -#define RB_RED 0 -#define RB_BLACK 1 +struct rb_node { + unsigned long __rb_parent_color; struct rb_node *rb_right; struct rb_node *rb_left; } __attribute__((aligned(sizeof(long)))); /* The alignment might seem pointless, but allegedly CRIS needs it */ -struct rb_root -{ +struct rb_root { struct rb_node *rb_node; }; -#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3)) -#define rb_color(r) ((r)->rb_parent_color & 1) -#define rb_is_red(r) (!rb_color(r)) -#define rb_is_black(r) rb_color(r) -#define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0) -#define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0) - -static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) -{ - rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p; -} -static inline void rb_set_color(struct rb_node *rb, int color) -{ - rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; -} +#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define RB_ROOT (struct rb_root) { NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) @@ -70,8 +52,10 @@ static inline void rb_set_color(struct rb_node *rb, int color) #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbree */ -#define RB_EMPTY_NODE(node) ((node)->rb_parent_color == (unsigned long)(node)) -#define RB_CLEAR_NODE(node) ((node)->rb_parent_color = (unsigned long)(node)) +#define RB_EMPTY_NODE(node) \ + ((node)->__rb_parent_color == (unsigned long)(node)) +#define RB_CLEAR_NODE(node) \ + ((node)->__rb_parent_color = (unsigned long)(node)) extern void rb_insert_color(struct rb_node *, struct rb_root *); @@ -98,7 +82,7 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, struct rb_node ** rb_link) { - node->rb_parent_color = (unsigned long )parent; + node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; diff --git a/lib/rbtree.c b/lib/rbtree.c index fe43c8c5f527..ccada9abe6f5 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -23,6 +23,24 @@ #include #include +#define RB_RED 0 +#define RB_BLACK 1 + +#define rb_color(r) ((r)->__rb_parent_color & 1) +#define rb_is_red(r) (!rb_color(r)) +#define rb_is_black(r) rb_color(r) +#define rb_set_red(r) do { (r)->__rb_parent_color &= ~1; } while (0) +#define rb_set_black(r) do { (r)->__rb_parent_color |= 1; } while (0) + +static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) +{ + rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; +} +static inline void rb_set_color(struct rb_node *rb, int color) +{ + rb->__rb_parent_color = (rb->__rb_parent_color & ~1) | color; +} + static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) { struct rb_node *right = node->rb_right; @@ -255,7 +273,7 @@ void rb_erase(struct rb_node *node, struct rb_root *root) rb_set_parent(old->rb_right, node); } - node->rb_parent_color = old->rb_parent_color; + node->__rb_parent_color = old->__rb_parent_color; node->rb_left = old->rb_left; 
rb_set_parent(old->rb_left, node); From 910a742d4ba863848c7283d69c21bfa779d3b9a8 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:39 -0700 Subject: [PATCH 061/160] rbtree: performance and correctness test This small module helps measure the performance of rbtree insert and erase. Additionally, we run a few correctness tests to check that the rbtrees have all desired properties: - contains the right number of nodes in the order desired, - never two consecutive red nodes on any path, - all paths to leaf nodes have the same number of black nodes, - root node is black [akpm@linux-foundation.org: fix printk warning: sparc64 cycles_t is unsigned long] Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 7 +++ lib/Makefile | 2 + lib/rbtree_test.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+) create mode 100644 lib/rbtree_test.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b7281e4d1473..a4e5d93b0f41 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1282,6 +1282,13 @@ config LATENCYTOP source mm/Kconfig.debug source kernel/trace/Kconfig +config RBTREE_TEST + tristate "Red-Black tree test" + depends on m && DEBUG_KERNEL + help + A benchmark measuring the performance of the rbtree library. + Also includes rbtree invariant checks. + config PROVIDE_OHCI1394_DMA_INIT bool "Remote debugging over FireWire early on boot" depends on PCI && X86 diff --git a/lib/Makefile b/lib/Makefile index 42d283edc4d3..f49445d26b65 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -140,6 +140,8 @@ $(foreach file, $(libfdt_files), \ $(eval CFLAGS_$(file) = -I$(src)/../scripts/dtc/libfdt)) lib-$(CONFIG_LIBFDT) += $(libfdt_files) +obj-$(CONFIG_RBTREE_TEST) += rbtree_test.o + hostprogs-y := gen_crc32table clean-files := crc32table.h diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c new file mode 100644 index 000000000000..19dfca9ff7d7 --- /dev/null +++ b/lib/rbtree_test.c @@ -0,0 +1,135 @@ +#include +#include +#include +#include + +#define NODES 100 +#define PERF_LOOPS 100000 +#define CHECK_LOOPS 100 + +struct test_node { + struct rb_node rb; + u32 key; +}; + +static struct rb_root root = RB_ROOT; +static struct test_node nodes[NODES]; + +static struct rnd_state rnd; + +static void insert(struct test_node *node, struct rb_root *root) +{ + struct rb_node **new = &root->rb_node, *parent = NULL; + + while (*new) { + parent = *new; + if (node->key < rb_entry(parent, struct test_node, rb)->key) + new = &parent->rb_left; + else + new = &parent->rb_right; + } + + rb_link_node(&node->rb, parent, new); + rb_insert_color(&node->rb, root); +} + +static inline void erase(struct test_node *node, struct rb_root *root) +{ + rb_erase(&node->rb, root); +} + +static void init(void) +{ + int i; + for (i = 0; i < NODES; i++) + nodes[i].key = prandom32(&rnd); +} + +static bool is_red(struct rb_node *rb) +{ + return !(rb->__rb_parent_color & 1); +} + +static int black_path_count(struct rb_node *rb) +{ + int count; + for (count = 0; rb; rb = rb_parent(rb)) + count += !is_red(rb); + return count; +} + +static void check(int nr_nodes) +{ + struct rb_node *rb; + int count = 0; + int blacks; + u32 prev_key = 0; + + for (rb = rb_first(&root); rb; rb = rb_next(rb)) { + struct test_node *node = rb_entry(rb, struct test_node, rb); + WARN_ON_ONCE(node->key < prev_key); + 
WARN_ON_ONCE(is_red(rb) && + (!rb_parent(rb) || is_red(rb_parent(rb)))); + if (!count) + blacks = black_path_count(rb); + else + WARN_ON_ONCE((!rb->rb_left || !rb->rb_right) && + blacks != black_path_count(rb)); + prev_key = node->key; + count++; + } + WARN_ON_ONCE(count != nr_nodes); +} + +static int rbtree_test_init(void) +{ + int i, j; + cycles_t time1, time2, time; + + printk(KERN_ALERT "rbtree testing"); + + prandom32_seed(&rnd, 3141592653589793238); + init(); + + time1 = get_cycles(); + + for (i = 0; i < PERF_LOOPS; i++) { + for (j = 0; j < NODES; j++) + insert(nodes + j, &root); + for (j = 0; j < NODES; j++) + erase(nodes + j, &root); + } + + time2 = get_cycles(); + time = time2 - time1; + + time = div_u64(time, PERF_LOOPS); + printk(" -> %llu cycles\n", (unsigned long long)time); + + for (i = 0; i < CHECK_LOOPS; i++) { + init(); + for (j = 0; j < NODES; j++) { + check(j); + insert(nodes + j, &root); + } + for (j = 0; j < NODES; j++) { + check(NODES - j); + erase(nodes + j, &root); + } + check(0); + } + + return -EAGAIN; /* Fail will directly unload the module */ +} + +static void rbtree_test_exit(void) +{ + printk(KERN_ALERT "test exit\n"); +} + +module_init(rbtree_test_init) +module_exit(rbtree_test_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michel Lespinasse"); +MODULE_DESCRIPTION("Red Black Tree test"); From 1f0528653e41ec230c60f5738820e8a544731399 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:42 -0700 Subject: [PATCH 062/160] rbtree: break out of rb_insert_color loop after tree rotation It is a well known property of rbtrees that insertion never requires more than two tree rotations. In our implementation, after one loop iteration identified one or two necessary tree rotations, we would iterate and look for more. However at that point the node's parent would always be black, which would cause us to exit the loop. We can make the code flow more obvious by just adding a break statement after the tree rotations, where we know we are done. Additionally, in the cases where two tree rotations are necessary, we don't have to update the 'node' pointer as it wouldn't be used until the next loop iteration, which we now avoid due to this break statement. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/lib/rbtree.c b/lib/rbtree.c index ccada9abe6f5..12abb8abf442 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -109,18 +109,15 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) } } - if (parent->rb_right == node) - { - register struct rb_node *tmp; + if (parent->rb_right == node) { __rb_rotate_left(parent, root); - tmp = parent; parent = node; - node = tmp; } rb_set_black(parent); rb_set_red(gparent); __rb_rotate_right(gparent, root); + break; } else { { register struct rb_node *uncle = gparent->rb_left; @@ -134,18 +131,15 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) } } - if (parent->rb_left == node) - { - register struct rb_node *tmp; + if (parent->rb_left == node) { __rb_rotate_right(parent, root); - tmp = parent; parent = node; - node = tmp; } rb_set_black(parent); rb_set_red(gparent); __rb_rotate_left(gparent, root); + break; } } From 6d58452dc066db61acdff7b84671db1b11a3de1c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:44 -0700 Subject: [PATCH 063/160] rbtree: adjust root color in rb_insert_color() only when necessary The root node of an rbtree must always be black. However, rb_insert_color() only needs to maintain this invariant when it has been broken - that is, when it exits the loop due to the current (red) node being the root. In all other cases (exiting after tree rotations, or exiting due to an existing black parent) the invariant is already satisfied, so there is no need to adjust the root node color. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/lib/rbtree.c b/lib/rbtree.c index 12abb8abf442..d0be5fcaafe8 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -91,8 +91,21 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) { struct rb_node *parent, *gparent; - while ((parent = rb_parent(node)) && rb_is_red(parent)) - { + while (true) { + /* + * Loop invariant: node is red + * + * If there is a black parent, we are done. + * Otherwise, take some corrective action as we don't + * want a red root or two consecutive red nodes. + */ + parent = rb_parent(node); + if (!parent) { + rb_set_black(node); + break; + } else if (rb_is_black(parent)) + break; + gparent = rb_parent(parent); if (parent == gparent->rb_left) @@ -142,8 +155,6 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) break; } } - - rb_set_black(root->rb_node); } EXPORT_SYMBOL(rb_insert_color); From 5bc9188aa207dafd47eab57df7c4fe5b3d3f636a Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:47 -0700 Subject: [PATCH 064/160] rbtree: low level optimizations in rb_insert_color() - Use the newly introduced rb_set_parent_color() function to flip the color of nodes whose parent is already known. - Optimize rb_parent() when the node is known to be red - there is no need to mask out the color in that case. - Flipping gparent's color to red requires us to fetch its rb_parent_color field, so we can reuse it as the parent value for the next loop iteration. 
- Do not use __rb_rotate_left() and __rb_rotate_right() to handle tree rotations: we already have pointers to all relevant nodes, and know their colors (either because we want to adjust it, or because we've tested it, or we can deduce it as black due to the node proximity to a known red node). So we can generate more efficient code by making use of the node pointers we already have, and setting both the parent and color attributes for nodes all at once. Also in Case 2, some node attributes don't have to be set because we know another tree rotation (Case 3) will always follow and override them. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 164 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 130 insertions(+), 34 deletions(-) diff --git a/lib/rbtree.c b/lib/rbtree.c index d0be5fcaafe8..41cf19b2fe51 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -23,6 +23,25 @@ #include #include +/* + * red-black trees properties: http://en.wikipedia.org/wiki/Rbtree + * + * 1) A node is either red or black + * 2) The root is black + * 3) All leaves (NULL) are black + * 4) Both children of every red node are black + * 5) Every simple path from root to leaves contains the same number + * of black nodes. + * + * 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two + * consecutive red nodes in a path and every red node is therefore followed by + * a black. So if B is the number of black nodes on every simple path (as per + * 5), then the longest possible path due to 4 is 2B. + * + * We shall indicate color with case, where black nodes are uppercase and red + * nodes will be lowercase. + */ + #define RB_RED 0 #define RB_BLACK 1 @@ -41,6 +60,17 @@ static inline void rb_set_color(struct rb_node *rb, int color) rb->__rb_parent_color = (rb->__rb_parent_color & ~1) | color; } +static inline void rb_set_parent_color(struct rb_node *rb, + struct rb_node *p, int color) +{ + rb->__rb_parent_color = (unsigned long)p | color; +} + +static inline struct rb_node *rb_red_parent(struct rb_node *red) +{ + return (struct rb_node *)red->__rb_parent_color; +} + static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) { struct rb_node *right = node->rb_right; @@ -87,9 +117,30 @@ static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) rb_set_parent(node, left); } +/* + * Helper function for rotations: + * - old's parent and color get assigned to new + * - old gets assigned new as a parent and 'color' as a color. + */ +static inline void +__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, + struct rb_root *root, int color) +{ + struct rb_node *parent = rb_parent(old); + new->__rb_parent_color = old->__rb_parent_color; + rb_set_parent_color(old, new, color); + if (parent) { + if (parent->rb_left == old) + parent->rb_left = new; + else + parent->rb_right = new; + } else + root->rb_node = new; +} + void rb_insert_color(struct rb_node *node, struct rb_root *root) { - struct rb_node *parent, *gparent; + struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; while (true) { /* @@ -99,59 +150,104 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) * Otherwise, take some corrective action as we don't * want a red root or two consecutive red nodes. 
*/ - parent = rb_parent(node); if (!parent) { - rb_set_black(node); + rb_set_parent_color(node, NULL, RB_BLACK); break; } else if (rb_is_black(parent)) break; - gparent = rb_parent(parent); + gparent = rb_red_parent(parent); - if (parent == gparent->rb_left) - { - { - register struct rb_node *uncle = gparent->rb_right; - if (uncle && rb_is_red(uncle)) - { - rb_set_black(uncle); - rb_set_black(parent); - rb_set_red(gparent); - node = gparent; - continue; - } + if (parent == gparent->rb_left) { + tmp = gparent->rb_right; + if (tmp && rb_is_red(tmp)) { + /* + * Case 1 - color flips + * + * G g + * / \ / \ + * p u --> P U + * / / + * n N + * + * However, since g's parent might be red, and + * 4) does not allow this, we need to recurse + * at g. + */ + rb_set_parent_color(tmp, gparent, RB_BLACK); + rb_set_parent_color(parent, gparent, RB_BLACK); + node = gparent; + parent = rb_parent(node); + rb_set_parent_color(node, parent, RB_RED); + continue; } if (parent->rb_right == node) { - __rb_rotate_left(parent, root); + /* + * Case 2 - left rotate at parent + * + * G G + * / \ / \ + * p U --> n U + * \ / + * n p + * + * This still leaves us in violation of 4), the + * continuation into Case 3 will fix that. + */ + parent->rb_right = tmp = node->rb_left; + node->rb_left = parent; + if (tmp) + rb_set_parent_color(tmp, parent, + RB_BLACK); + rb_set_parent_color(parent, node, RB_RED); parent = node; } - rb_set_black(parent); - rb_set_red(gparent); - __rb_rotate_right(gparent, root); + /* + * Case 3 - right rotate at gparent + * + * G P + * / \ / \ + * p U --> n g + * / \ + * n U + */ + gparent->rb_left = tmp = parent->rb_right; + parent->rb_right = gparent; + if (tmp) + rb_set_parent_color(tmp, gparent, RB_BLACK); + __rb_rotate_set_parents(gparent, parent, root, RB_RED); break; } else { - { - register struct rb_node *uncle = gparent->rb_left; - if (uncle && rb_is_red(uncle)) - { - rb_set_black(uncle); - rb_set_black(parent); - rb_set_red(gparent); - node = gparent; - continue; - } + tmp = gparent->rb_left; + if (tmp && rb_is_red(tmp)) { + /* Case 1 - color flips */ + rb_set_parent_color(tmp, gparent, RB_BLACK); + rb_set_parent_color(parent, gparent, RB_BLACK); + node = gparent; + parent = rb_parent(node); + rb_set_parent_color(node, parent, RB_RED); + continue; } if (parent->rb_left == node) { - __rb_rotate_right(parent, root); + /* Case 2 - right rotate at parent */ + parent->rb_left = tmp = node->rb_right; + node->rb_right = parent; + if (tmp) + rb_set_parent_color(tmp, parent, + RB_BLACK); + rb_set_parent_color(parent, node, RB_RED); parent = node; } - rb_set_black(parent); - rb_set_red(gparent); - __rb_rotate_left(gparent, root); + /* Case 3 - left rotate at gparent */ + gparent->rb_right = tmp = parent->rb_left; + parent->rb_left = gparent; + if (tmp) + rb_set_parent_color(tmp, gparent, RB_BLACK); + __rb_rotate_set_parents(gparent, parent, root, RB_RED); break; } } From d6ff1273928ebf15466a85b7e1810cd00e72998b Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:50 -0700 Subject: [PATCH 065/160] rbtree: adjust node color in __rb_erase_color() only when necessary In __rb_erase_color(), we were always setting a node to black after exiting the main loop. And in one case, after fixing up the tree to satisfy all rbtree invariants, we were setting the current node to root just to guarantee a loop exit, at which point the root would be set to black. However this is not necessary, as the root of an rbtree is already known to be black. 
The only case where the color flip is required is when we exit the loop due to the current node being red, and it's easiest to just do the flip at that point instead of doing it after the loop. [adrian.hunter@intel.com: perf tools: fix build for another rbtree.c change] Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: Adrian Hunter Cc: Alexander Shishkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 28 ++++++++++++++++---------- tools/perf/util/include/linux/rbtree.h | 1 + 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/lib/rbtree.c b/lib/rbtree.c index 41cf19b2fe51..baf7c835c57c 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -259,10 +259,22 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, { struct rb_node *other; - while ((!node || rb_is_black(node)) && node != root->rb_node) - { - if (parent->rb_left == node) - { + while (true) { + /* + * Loop invariant: all leaf paths going through node have a + * black node count that is 1 lower than other leaf paths. + * + * If node is red, we can flip it to black to adjust. + * If node is the root, all leaf paths go through it. + * Otherwise, we need to adjust the tree through color flips + * and tree rotations as per one of the 4 cases below. + */ + if (node && rb_is_red(node)) { + rb_set_black(node); + break; + } else if (!parent) { + break; + } else if (parent->rb_left == node) { other = parent->rb_right; if (rb_is_red(other)) { @@ -291,12 +303,9 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, rb_set_black(parent); rb_set_black(other->rb_right); __rb_rotate_left(parent, root); - node = root->rb_node; break; } - } - else - { + } else { other = parent->rb_left; if (rb_is_red(other)) { @@ -325,13 +334,10 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, rb_set_black(parent); rb_set_black(other->rb_left); __rb_rotate_right(parent, root); - node = root->rb_node; break; } } } - if (node) - rb_set_black(node); } void rb_erase(struct rb_node *node, struct rb_root *root) diff --git a/tools/perf/util/include/linux/rbtree.h b/tools/perf/util/include/linux/rbtree.h index 2a030c5af3aa..9bcdc844b330 100644 --- a/tools/perf/util/include/linux/rbtree.h +++ b/tools/perf/util/include/linux/rbtree.h @@ -1,2 +1,3 @@ #include +#include #include "../../../../include/linux/rbtree.h" From e125d1471a4f8f1bf7ea9a83deb8d23cb40bd712 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:54 -0700 Subject: [PATCH 066/160] rbtree: optimize case selection logic in __rb_erase_color() In __rb_erase_color(), we have to select one of 3 cases depending on the color on the 'other' node children. If both children are black, we flip a few node colors and iterate. Otherwise, we do either one or two tree rotations, depending on the color of the 'other' child opposite to 'node', and then we are done. The corresponding logic had duplicate checks for the color of the 'other' child opposite to 'node'. It was checking it first to determine if both children are black, and then to determine how many tree rotations are required. Rearrange the logic to avoid that extra check. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 68 +++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/lib/rbtree.c b/lib/rbtree.c index baf7c835c57c..eb823a31099c 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -283,28 +283,24 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, __rb_rotate_left(parent, root); other = parent->rb_right; } - if ((!other->rb_left || rb_is_black(other->rb_left)) && - (!other->rb_right || rb_is_black(other->rb_right))) - { - rb_set_red(other); - node = parent; - parent = rb_parent(node); - } - else - { - if (!other->rb_right || rb_is_black(other->rb_right)) - { - rb_set_black(other->rb_left); + if (!other->rb_right || rb_is_black(other->rb_right)) { + if (!other->rb_left || + rb_is_black(other->rb_left)) { rb_set_red(other); - __rb_rotate_right(other, root); - other = parent->rb_right; + node = parent; + parent = rb_parent(node); + continue; } - rb_set_color(other, rb_color(parent)); - rb_set_black(parent); - rb_set_black(other->rb_right); - __rb_rotate_left(parent, root); - break; + rb_set_black(other->rb_left); + rb_set_red(other); + __rb_rotate_right(other, root); + other = parent->rb_right; } + rb_set_color(other, rb_color(parent)); + rb_set_black(parent); + rb_set_black(other->rb_right); + __rb_rotate_left(parent, root); + break; } else { other = parent->rb_left; if (rb_is_red(other)) @@ -314,28 +310,24 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, __rb_rotate_right(parent, root); other = parent->rb_left; } - if ((!other->rb_left || rb_is_black(other->rb_left)) && - (!other->rb_right || rb_is_black(other->rb_right))) - { - rb_set_red(other); - node = parent; - parent = rb_parent(node); - } - else - { - if (!other->rb_left || rb_is_black(other->rb_left)) - { - rb_set_black(other->rb_right); + if (!other->rb_left || rb_is_black(other->rb_left)) { + if (!other->rb_right || + rb_is_black(other->rb_right)) { rb_set_red(other); - __rb_rotate_left(other, root); - other = parent->rb_left; + node = parent; + parent = rb_parent(node); + continue; } - rb_set_color(other, rb_color(parent)); - rb_set_black(parent); - rb_set_black(other->rb_left); - __rb_rotate_right(parent, root); - break; + rb_set_black(other->rb_right); + rb_set_red(other); + __rb_rotate_left(other, root); + other = parent->rb_left; } + rb_set_color(other, rb_color(parent)); + rb_set_black(parent); + rb_set_black(other->rb_left); + __rb_rotate_right(parent, root); + break; } } } From 6280d2356fd8ad0936a63c10dc1e6accf48d0c61 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:57 -0700 Subject: [PATCH 067/160] rbtree: low level optimizations in __rb_erase_color() In __rb_erase_color(), we often already have pointers to the nodes being rotated and/or know what their colors must be, so we can generate more efficient code than the generic __rb_rotate_left() and __rb_rotate_right() functions. Also when the current node is red or when flipping the sibling's color, the parent is already known so we can use the more efficient rb_set_parent_color() function to set the desired color. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 208 ++++++++++++++++++++++++++++----------------------- 1 file changed, 115 insertions(+), 93 deletions(-) diff --git a/lib/rbtree.c b/lib/rbtree.c index eb823a31099c..a38e473d8fe7 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -39,7 +39,8 @@ * 5), then the longest possible path due to 4 is 2B. * * We shall indicate color with case, where black nodes are uppercase and red - * nodes will be lowercase. + * nodes will be lowercase. Unknown color nodes shall be drawn as red within + * parentheses and have some accompanying text comment. */ #define RB_RED 0 @@ -48,17 +49,11 @@ #define rb_color(r) ((r)->__rb_parent_color & 1) #define rb_is_red(r) (!rb_color(r)) #define rb_is_black(r) rb_color(r) -#define rb_set_red(r) do { (r)->__rb_parent_color &= ~1; } while (0) -#define rb_set_black(r) do { (r)->__rb_parent_color |= 1; } while (0) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; } -static inline void rb_set_color(struct rb_node *rb, int color) -{ - rb->__rb_parent_color = (rb->__rb_parent_color & ~1) | color; -} static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) @@ -71,52 +66,6 @@ static inline struct rb_node *rb_red_parent(struct rb_node *red) return (struct rb_node *)red->__rb_parent_color; } -static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) -{ - struct rb_node *right = node->rb_right; - struct rb_node *parent = rb_parent(node); - - if ((node->rb_right = right->rb_left)) - rb_set_parent(right->rb_left, node); - right->rb_left = node; - - rb_set_parent(right, parent); - - if (parent) - { - if (node == parent->rb_left) - parent->rb_left = right; - else - parent->rb_right = right; - } - else - root->rb_node = right; - rb_set_parent(node, right); -} - -static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) -{ - struct rb_node *left = node->rb_left; - struct rb_node *parent = rb_parent(node); - - if ((node->rb_left = left->rb_right)) - rb_set_parent(left->rb_right, node); - left->rb_right = node; - - rb_set_parent(left, parent); - - if (parent) - { - if (node == parent->rb_right) - parent->rb_right = left; - else - parent->rb_left = left; - } - else - root->rb_node = left; - rb_set_parent(node, left); -} - /* * Helper function for rotations: * - old's parent and color get assigned to new @@ -257,7 +206,7 @@ EXPORT_SYMBOL(rb_insert_color); static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, struct rb_root *root) { - struct rb_node *other; + struct rb_node *sibling, *tmp1, *tmp2; while (true) { /* @@ -270,63 +219,136 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, * and tree rotations as per one of the 4 cases below. 
*/ if (node && rb_is_red(node)) { - rb_set_black(node); + rb_set_parent_color(node, parent, RB_BLACK); break; } else if (!parent) { break; } else if (parent->rb_left == node) { - other = parent->rb_right; - if (rb_is_red(other)) - { - rb_set_black(other); - rb_set_red(parent); - __rb_rotate_left(parent, root); - other = parent->rb_right; + sibling = parent->rb_right; + if (rb_is_red(sibling)) { + /* + * Case 1 - left rotate at parent + * + * P S + * / \ / \ + * N s --> p Sr + * / \ / \ + * Sl Sr N Sl + */ + parent->rb_right = tmp1 = sibling->rb_left; + sibling->rb_left = parent; + rb_set_parent_color(tmp1, parent, RB_BLACK); + __rb_rotate_set_parents(parent, sibling, root, + RB_RED); + sibling = tmp1; } - if (!other->rb_right || rb_is_black(other->rb_right)) { - if (!other->rb_left || - rb_is_black(other->rb_left)) { - rb_set_red(other); + tmp1 = sibling->rb_right; + if (!tmp1 || rb_is_black(tmp1)) { + tmp2 = sibling->rb_left; + if (!tmp2 || rb_is_black(tmp2)) { + /* + * Case 2 - sibling color flip + * (p could be either color here) + * + * (p) (p) + * / \ / \ + * N S --> N s + * / \ / \ + * Sl Sr Sl Sr + * + * This leaves us violating 5), so + * recurse at p. If p is red, the + * recursion will just flip it to black + * and exit. If coming from Case 1, + * p is known to be red. + */ + rb_set_parent_color(sibling, parent, + RB_RED); node = parent; parent = rb_parent(node); continue; } - rb_set_black(other->rb_left); - rb_set_red(other); - __rb_rotate_right(other, root); - other = parent->rb_right; + /* + * Case 3 - right rotate at sibling + * (p could be either color here) + * + * (p) (p) + * / \ / \ + * N S --> N Sl + * / \ \ + * sl Sr s + * \ + * Sr + */ + sibling->rb_left = tmp1 = tmp2->rb_right; + tmp2->rb_right = sibling; + parent->rb_right = tmp2; + if (tmp1) + rb_set_parent_color(tmp1, sibling, + RB_BLACK); + tmp1 = sibling; + sibling = tmp2; } - rb_set_color(other, rb_color(parent)); - rb_set_black(parent); - rb_set_black(other->rb_right); - __rb_rotate_left(parent, root); + /* + * Case 4 - left rotate at parent + color flips + * (p and sl could be either color here. 
+ * After rotation, p becomes black, s acquires + * p's color, and sl keeps its color) + * + * (p) (s) + * / \ / \ + * N S --> P Sr + * / \ / \ + * (sl) sr N (sl) + */ + parent->rb_right = tmp2 = sibling->rb_left; + sibling->rb_left = parent; + rb_set_parent_color(tmp1, sibling, RB_BLACK); + if (tmp2) + rb_set_parent(tmp2, parent); + __rb_rotate_set_parents(parent, sibling, root, + RB_BLACK); break; } else { - other = parent->rb_left; - if (rb_is_red(other)) - { - rb_set_black(other); - rb_set_red(parent); - __rb_rotate_right(parent, root); - other = parent->rb_left; + sibling = parent->rb_left; + if (rb_is_red(sibling)) { + /* Case 1 - right rotate at parent */ + parent->rb_left = tmp1 = sibling->rb_right; + sibling->rb_right = parent; + rb_set_parent_color(tmp1, parent, RB_BLACK); + __rb_rotate_set_parents(parent, sibling, root, + RB_RED); + sibling = tmp1; } - if (!other->rb_left || rb_is_black(other->rb_left)) { - if (!other->rb_right || - rb_is_black(other->rb_right)) { - rb_set_red(other); + tmp1 = sibling->rb_left; + if (!tmp1 || rb_is_black(tmp1)) { + tmp2 = sibling->rb_right; + if (!tmp2 || rb_is_black(tmp2)) { + /* Case 2 - sibling color flip */ + rb_set_parent_color(sibling, parent, + RB_RED); node = parent; parent = rb_parent(node); continue; } - rb_set_black(other->rb_right); - rb_set_red(other); - __rb_rotate_left(other, root); - other = parent->rb_left; + /* Case 3 - right rotate at sibling */ + sibling->rb_right = tmp1 = tmp2->rb_left; + tmp2->rb_left = sibling; + parent->rb_left = tmp2; + if (tmp1) + rb_set_parent_color(tmp1, sibling, + RB_BLACK); + tmp1 = sibling; + sibling = tmp2; } - rb_set_color(other, rb_color(parent)); - rb_set_black(parent); - rb_set_black(other->rb_left); - __rb_rotate_right(parent, root); + /* Case 4 - left rotate at parent + color flips */ + parent->rb_left = tmp2 = sibling->rb_right; + sibling->rb_right = parent; + rb_set_parent_color(tmp1, sibling, RB_BLACK); + if (tmp2) + rb_set_parent(tmp2, parent); + __rb_rotate_set_parents(parent, sibling, root, + RB_BLACK); break; } } From 7ce6ff9e5de99e7b72019c7de82fb438fe1dc5a0 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:01 -0700 Subject: [PATCH 068/160] rbtree: coding style adjustments Set comment and indentation style to be consistent with linux coding style and the rest of the file, as suggested by Peter Zijlstra Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/lib/rbtree.c b/lib/rbtree.c index a38e473d8fe7..08926709b4f9 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -363,8 +363,7 @@ void rb_erase(struct rb_node *node, struct rb_root *root) child = node->rb_right; else if (!node->rb_right) child = node->rb_left; - else - { + else { struct rb_node *old = node, *left; node = node->rb_right; @@ -406,17 +405,15 @@ void rb_erase(struct rb_node *node, struct rb_root *root) if (child) rb_set_parent(child, parent); - if (parent) - { + if (parent) { if (parent->rb_left == node) parent->rb_left = child; else parent->rb_right = child; - } - else + } else root->rb_node = child; - color: +color: if (color == RB_BLACK) __rb_erase_color(child, parent, root); } @@ -529,8 +526,10 @@ struct rb_node *rb_next(const struct rb_node *node) if (RB_EMPTY_NODE(node)) return NULL; - /* If we have a right-hand child, go down and then left as far - as we can. */ + /* + * If we have a right-hand child, go down and then left as far + * as we can. + */ if (node->rb_right) { node = node->rb_right; while (node->rb_left) @@ -538,12 +537,13 @@ struct rb_node *rb_next(const struct rb_node *node) return (struct rb_node *)node; } - /* No right-hand children. Everything down and left is - smaller than us, so any 'next' node must be in the general - direction of our parent. Go up the tree; any time the - ancestor is a right-hand child of its parent, keep going - up. First time it's a left-hand child of its parent, said - parent is our 'next' node. */ + /* + * No right-hand children. Everything down and left is smaller than us, + * so any 'next' node must be in the general direction of our parent. + * Go up the tree; any time the ancestor is a right-hand child of its + * parent, keep going up. First time it's a left-hand child of its + * parent, said parent is our 'next' node. + */ while ((parent = rb_parent(node)) && node == parent->rb_right) node = parent; @@ -558,8 +558,10 @@ struct rb_node *rb_prev(const struct rb_node *node) if (RB_EMPTY_NODE(node)) return NULL; - /* If we have a left-hand child, go down and then right as far - as we can. */ + /* + * If we have a left-hand child, go down and then right as far + * as we can. + */ if (node->rb_left) { node = node->rb_left; while (node->rb_right) @@ -567,8 +569,10 @@ struct rb_node *rb_prev(const struct rb_node *node) return (struct rb_node *)node; } - /* No left-hand children. Go up till we find an ancestor which - is a right-hand child of its parent */ + /* + * No left-hand children. Go up till we find an ancestor which + * is a right-hand child of its parent. + */ while ((parent = rb_parent(node)) && node == parent->rb_left) node = parent; From 59633abf34e2f44b8e772a2c12a92132aa7c2220 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:02 -0700 Subject: [PATCH 069/160] rbtree: optimize fetching of sibling node When looking to fetch a node's sibling, we went through a sequence of: - check if node is the parent's left child - if it is, then fetch the parent's right child This can be replaced with: - fetch the parent's right child as an assumed sibling - check that node is NOT the fetched child This avoids fetching the parent's left child when node is actually that child. Saves a bit on code size, though it doesn't seem to make a large difference in speed. 
Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: David Woodhouse Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/lib/rbtree.c b/lib/rbtree.c index 08926709b4f9..61cdd0e3e538 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -107,8 +107,8 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) gparent = rb_red_parent(parent); - if (parent == gparent->rb_left) { - tmp = gparent->rb_right; + tmp = gparent->rb_right; + if (parent != tmp) { /* parent == gparent->rb_left */ if (tmp && rb_is_red(tmp)) { /* * Case 1 - color flips @@ -131,7 +131,8 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) continue; } - if (parent->rb_right == node) { + tmp = parent->rb_right; + if (node == tmp) { /* * Case 2 - left rotate at parent * @@ -151,6 +152,7 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) RB_BLACK); rb_set_parent_color(parent, node, RB_RED); parent = node; + tmp = node->rb_right; } /* @@ -162,7 +164,7 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) * / \ * n U */ - gparent->rb_left = tmp = parent->rb_right; + gparent->rb_left = tmp; /* == parent->rb_right */ parent->rb_right = gparent; if (tmp) rb_set_parent_color(tmp, gparent, RB_BLACK); @@ -180,7 +182,8 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) continue; } - if (parent->rb_left == node) { + tmp = parent->rb_left; + if (node == tmp) { /* Case 2 - right rotate at parent */ parent->rb_left = tmp = node->rb_right; node->rb_right = parent; @@ -189,10 +192,11 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) RB_BLACK); rb_set_parent_color(parent, node, RB_RED); parent = node; + tmp = node->rb_left; } /* Case 3 - left rotate at gparent */ - gparent->rb_right = tmp = parent->rb_left; + gparent->rb_right = tmp; /* == parent->rb_left */ parent->rb_left = gparent; if (tmp) rb_set_parent_color(tmp, gparent, RB_BLACK); @@ -223,8 +227,9 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, break; } else if (!parent) { break; - } else if (parent->rb_left == node) { - sibling = parent->rb_right; + } + sibling = parent->rb_right; + if (node != sibling) { /* node == parent->rb_left */ if (rb_is_red(sibling)) { /* * Case 1 - left rotate at parent From 28d7530928d01638678f63c3c70113540b0e6abe Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:04 -0700 Subject: [PATCH 070/160] rbtree test: fix sparse warning about 64-bit constant Just a small fix to make sparse happy. 
Signed-off-by: Michel Lespinasse Reported-by: Fengguang Wu Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 19dfca9ff7d7..fd09465d82ca 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -88,7 +88,7 @@ static int rbtree_test_init(void) printk(KERN_ALERT "rbtree testing"); - prandom32_seed(&rnd, 3141592653589793238); + prandom32_seed(&rnd, 3141592653589793238ULL); init(); time1 = get_cycles(); From 7abc704ae399fcb9c51ca200b0456f8a975a8011 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:07 -0700 Subject: [PATCH 071/160] rbtree: add __rb_change_child() helper function Add __rb_change_child() as an inline helper function to replace code that would otherwise be duplicated 4 times in the source. No changes to binary size or speed. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 46 +++++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/lib/rbtree.c b/lib/rbtree.c index 61cdd0e3e538..de89a614b1ba 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -66,6 +66,19 @@ static inline struct rb_node *rb_red_parent(struct rb_node *red) return (struct rb_node *)red->__rb_parent_color; } +static inline void +__rb_change_child(struct rb_node *old, struct rb_node *new, + struct rb_node *parent, struct rb_root *root) +{ + if (parent) { + if (parent->rb_left == old) + parent->rb_left = new; + else + parent->rb_right = new; + } else + root->rb_node = new; +} + /* * Helper function for rotations: * - old's parent and color get assigned to new @@ -78,13 +91,7 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, struct rb_node *parent = rb_parent(old); new->__rb_parent_color = old->__rb_parent_color; rb_set_parent_color(old, new, color); - if (parent) { - if (parent->rb_left == old) - parent->rb_left = new; - else - parent->rb_right = new; - } else - root->rb_node = new; + __rb_change_child(old, new, parent, root); } void rb_insert_color(struct rb_node *node, struct rb_root *root) @@ -375,13 +382,7 @@ void rb_erase(struct rb_node *node, struct rb_root *root) while ((left = node->rb_left) != NULL) node = left; - if (rb_parent(old)) { - if (rb_parent(old)->rb_left == old) - rb_parent(old)->rb_left = node; - else - rb_parent(old)->rb_right = node; - } else - root->rb_node = node; + __rb_change_child(old, node, rb_parent(old), root); child = node->rb_right; parent = rb_parent(node); @@ -410,13 +411,7 @@ void rb_erase(struct rb_node *node, struct rb_root *root) if (child) rb_set_parent(child, parent); - if (parent) { - if (parent->rb_left == node) - parent->rb_left = child; - else - parent->rb_right = child; - } else - root->rb_node = child; + __rb_change_child(node, child, parent, root); color: if (color == RB_BLACK) @@ -591,14 +586,7 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_node *parent = rb_parent(victim); /* Set the surrounding nodes to point to the replacement */ - if (parent) { - if (victim == parent->rb_left) - parent->rb_left = new; - else - parent->rb_right = new; - } else { - root->rb_node = new; - } + __rb_change_child(victim, new, parent, root); if (victim->rb_left) rb_set_parent(victim->rb_left, new); if 
(victim->rb_right) From 60670b8034d6e2ba860af79c9379b7788d09db73 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:10 -0700 Subject: [PATCH 072/160] rbtree: place easiest case first in rb_erase() In rb_erase, move the easy case (node to erase has no more than 1 child) first. I feel the code reads easier that way. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/lib/rbtree.c b/lib/rbtree.c index de89a614b1ba..bde1b5c5fb33 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -368,17 +368,28 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, void rb_erase(struct rb_node *node, struct rb_root *root) { - struct rb_node *child, *parent; + struct rb_node *child = node->rb_right, *tmp = node->rb_left; + struct rb_node *parent; int color; - if (!node->rb_left) - child = node->rb_right; - else if (!node->rb_right) - child = node->rb_left; - else { + if (!tmp) { + case1: + /* Case 1: node to erase has no more than 1 child (easy!) */ + + parent = rb_parent(node); + color = rb_color(node); + + if (child) + rb_set_parent(child, parent); + __rb_change_child(node, child, parent, root); + } else if (!child) { + /* Still case 1, but this time the child is node->rb_left */ + child = tmp; + goto case1; + } else { struct rb_node *old = node, *left; - node = node->rb_right; + node = child; while ((left = node->rb_left) != NULL) node = left; @@ -402,18 +413,8 @@ void rb_erase(struct rb_node *node, struct rb_root *root) node->__rb_parent_color = old->__rb_parent_color; node->rb_left = old->rb_left; rb_set_parent(old->rb_left, node); - - goto color; } - parent = rb_parent(node); - color = rb_color(node); - - if (child) - rb_set_parent(child, parent); - __rb_change_child(node, child, parent, root); - -color: if (color == RB_BLACK) __rb_erase_color(child, parent, root); } From 46b6135a7402ac23c5b25f2bd79b03bab8f98278 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:11 -0700 Subject: [PATCH 073/160] rbtree: handle 1-child recoloring in rb_erase() instead of rb_erase_color() An interesting observation for rb_erase() is that when a node has exactly one child, the node must be black and the child must be red. An interesting consequence is that removing such a node can be done by simply replacing it with its child and making the child black, which we can do efficiently in rb_erase(). __rb_erase_color() then only needs to handle the no-childs case and can be modified accordingly. 
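As a minimal sketch of that observation, mirroring what the patch does in its 1-child branch but written as a separate function (erase_single_child() is an illustrative name, not an interface added by the patch; it relies on the rb_parent(), __rb_change_child() and rb_set_parent_color() helpers already present in this series):

        /* node is known to have exactly one child.  Property 5) forces that
         * child to be red (a black child would add a black node on its side
         * that the NULL side lacks), and property 4) then forces node itself
         * to be black, since a red node may not have a red child.  Splicing
         * the child in and painting it black therefore preserves both
         * properties with no further rebalancing. */
        static void erase_single_child(struct rb_node *node,
                                       struct rb_node *child,
                                       struct rb_root *root)
        {
                struct rb_node *parent = rb_parent(node);

                __rb_change_child(node, child, parent, root);
                rb_set_parent_color(child, parent, RB_BLACK);
        }

With that handled locally, the only erase path that may still need __rb_erase_color() is the no-child case, and only when the erased node itself was black.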
Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 105 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 62 insertions(+), 43 deletions(-) diff --git a/lib/rbtree.c b/lib/rbtree.c index bde1b5c5fb33..80b092538fa9 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -2,7 +2,8 @@ Red Black Trees (C) 1999 Andrea Arcangeli (C) 2002 David Woodhouse - + (C) 2012 Michel Lespinasse + This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or @@ -50,6 +51,11 @@ #define rb_is_red(r) (!rb_color(r)) #define rb_is_black(r) rb_color(r) +static inline void rb_set_black(struct rb_node *rb) +{ + rb->__rb_parent_color |= RB_BLACK; +} + static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; @@ -214,27 +220,18 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) } EXPORT_SYMBOL(rb_insert_color); -static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, - struct rb_root *root) +static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) { - struct rb_node *sibling, *tmp1, *tmp2; + struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; while (true) { /* - * Loop invariant: all leaf paths going through node have a - * black node count that is 1 lower than other leaf paths. - * - * If node is red, we can flip it to black to adjust. - * If node is the root, all leaf paths go through it. - * Otherwise, we need to adjust the tree through color flips - * and tree rotations as per one of the 4 cases below. + * Loop invariants: + * - node is black (or NULL on first iteration) + * - node is not the root (parent is not NULL) + * - All leaf paths going through parent and node have a + * black node count that is 1 lower than other leaf paths. */ - if (node && rb_is_red(node)) { - rb_set_parent_color(node, parent, RB_BLACK); - break; - } else if (!parent) { - break; - } sibling = parent->rb_right; if (node != sibling) { /* node == parent->rb_left */ if (rb_is_red(sibling)) { @@ -268,17 +265,22 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, * / \ / \ * Sl Sr Sl Sr * - * This leaves us violating 5), so - * recurse at p. If p is red, the - * recursion will just flip it to black - * and exit. If coming from Case 1, - * p is known to be red. + * This leaves us violating 5) which + * can be fixed by flipping p to black + * if it was red, or by recursing at p. + * p is red when coming from Case 1. 
*/ rb_set_parent_color(sibling, parent, RB_RED); - node = parent; - parent = rb_parent(node); - continue; + if (rb_is_red(parent)) + rb_set_black(parent); + else { + node = parent; + parent = rb_parent(node); + if (parent) + continue; + } + break; } /* * Case 3 - right rotate at sibling @@ -339,9 +341,15 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, /* Case 2 - sibling color flip */ rb_set_parent_color(sibling, parent, RB_RED); - node = parent; - parent = rb_parent(node); - continue; + if (rb_is_red(parent)) + rb_set_black(parent); + else { + node = parent; + parent = rb_parent(node); + if (parent) + continue; + } + break; } /* Case 3 - right rotate at sibling */ sibling->rb_right = tmp1 = tmp2->rb_left; @@ -369,23 +377,31 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *child = node->rb_right, *tmp = node->rb_left; - struct rb_node *parent; - int color; + struct rb_node *parent, *rebalance; if (!tmp) { - case1: - /* Case 1: node to erase has no more than 1 child (easy!) */ + /* + * Case 1: node to erase has no more than 1 child (easy!) + * + * Note that if there is one child it must be red due to 5) + * and node must be black due to 4). We adjust colors locally + * so as to bypass __rb_erase_color() later on. + */ parent = rb_parent(node); - color = rb_color(node); - - if (child) - rb_set_parent(child, parent); __rb_change_child(node, child, parent, root); + if (child) { + rb_set_parent_color(child, parent, RB_BLACK); + rebalance = NULL; + } else { + rebalance = rb_is_black(node) ? parent : NULL; + } } else if (!child) { /* Still case 1, but this time the child is node->rb_left */ - child = tmp; - goto case1; + parent = rb_parent(node); + __rb_change_child(node, tmp, parent, root); + rb_set_parent_color(tmp, parent, RB_BLACK); + rebalance = NULL; } else { struct rb_node *old = node, *left; @@ -397,26 +413,29 @@ void rb_erase(struct rb_node *node, struct rb_root *root) child = node->rb_right; parent = rb_parent(node); - color = rb_color(node); if (parent == old) { parent = node; } else { - if (child) - rb_set_parent(child, parent); parent->rb_left = child; node->rb_right = old->rb_right; rb_set_parent(old->rb_right, node); } + if (child) { + rb_set_parent_color(child, parent, RB_BLACK); + rebalance = NULL; + } else { + rebalance = rb_is_black(node) ? parent : NULL; + } node->__rb_parent_color = old->__rb_parent_color; node->rb_left = old->rb_left; rb_set_parent(old->rb_left, node); } - if (color == RB_BLACK) - __rb_erase_color(child, parent, root); + if (rebalance) + __rb_erase_color(rebalance, root); } EXPORT_SYMBOL(rb_erase); From 4f035ad67f4633c233cb3642711d49b4efc9c82d Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:13 -0700 Subject: [PATCH 074/160] rbtree: low level optimizations in rb_erase() Various minor optimizations in rb_erase(): - Avoid multiple loading of node->__rb_parent_color when computing parent and color information (possibly not in close sequence, as there might be further branches in the algorithm) - In the 1-child subcase of case 1, copy the __rb_parent_color field from the erased node to the child instead of recomputing it from the desired parent and color - When searching for the erased node's successor, differentiate between cases 2 and 3 based on whether any left links were followed. This avoids a condition later down. 
- In case 3, keep a pointer to the erased node's right child so we don't have to refetch it later to adjust its parent. - In the no-childs subcase of cases 2 and 3, place the rebalance assigment last so that the compiler can remove the following if(rebalance) test. Also, added some comments to illustrate cases 2 and 3. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 98 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 64 insertions(+), 34 deletions(-) diff --git a/lib/rbtree.c b/lib/rbtree.c index 80b092538fa9..938061ecbe61 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -47,9 +47,14 @@ #define RB_RED 0 #define RB_BLACK 1 -#define rb_color(r) ((r)->__rb_parent_color & 1) -#define rb_is_red(r) (!rb_color(r)) -#define rb_is_black(r) rb_color(r) +#define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) + +#define __rb_color(pc) ((pc) & 1) +#define __rb_is_black(pc) __rb_color(pc) +#define __rb_is_red(pc) (!__rb_color(pc)) +#define rb_color(rb) __rb_color((rb)->__rb_parent_color) +#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) +#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) static inline void rb_set_black(struct rb_node *rb) { @@ -378,6 +383,7 @@ void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *child = node->rb_right, *tmp = node->rb_left; struct rb_node *parent, *rebalance; + unsigned long pc; if (!tmp) { /* @@ -387,51 +393,75 @@ void rb_erase(struct rb_node *node, struct rb_root *root) * and node must be black due to 4). We adjust colors locally * so as to bypass __rb_erase_color() later on. */ - - parent = rb_parent(node); + pc = node->__rb_parent_color; + parent = __rb_parent(pc); __rb_change_child(node, child, parent, root); if (child) { - rb_set_parent_color(child, parent, RB_BLACK); + child->__rb_parent_color = pc; rebalance = NULL; - } else { - rebalance = rb_is_black(node) ? parent : NULL; - } + } else + rebalance = __rb_is_black(pc) ? 
parent : NULL; } else if (!child) { /* Still case 1, but this time the child is node->rb_left */ - parent = rb_parent(node); + tmp->__rb_parent_color = pc = node->__rb_parent_color; + parent = __rb_parent(pc); __rb_change_child(node, tmp, parent, root); - rb_set_parent_color(tmp, parent, RB_BLACK); rebalance = NULL; } else { - struct rb_node *old = node, *left; - - node = child; - while ((left = node->rb_left) != NULL) - node = left; - - __rb_change_child(old, node, rb_parent(old), root); - - child = node->rb_right; - parent = rb_parent(node); - - if (parent == old) { - parent = node; + struct rb_node *successor = child, *child2; + tmp = child->rb_left; + if (!tmp) { + /* + * Case 2: node's successor is its right child + * + * (n) (s) + * / \ / \ + * (x) (s) -> (x) (c) + * \ + * (c) + */ + parent = child; + child2 = child->rb_right; } else { - parent->rb_left = child; - - node->rb_right = old->rb_right; - rb_set_parent(old->rb_right, node); + /* + * Case 3: node's successor is leftmost under + * node's right child subtree + * + * (n) (s) + * / \ / \ + * (x) (y) -> (x) (y) + * / / + * (p) (p) + * / / + * (s) (c) + * \ + * (c) + */ + do { + parent = successor; + successor = tmp; + tmp = tmp->rb_left; + } while (tmp); + parent->rb_left = child2 = successor->rb_right; + successor->rb_right = child; + rb_set_parent(child, successor); } - if (child) { - rb_set_parent_color(child, parent, RB_BLACK); + successor->rb_left = tmp = node->rb_left; + rb_set_parent(tmp, successor); + + pc = node->__rb_parent_color; + tmp = __rb_parent(pc); + __rb_change_child(node, successor, tmp, root); + if (child2) { + successor->__rb_parent_color = pc; + rb_set_parent_color(child2, parent, RB_BLACK); rebalance = NULL; } else { - rebalance = rb_is_black(node) ? parent : NULL; + unsigned long pc2 = successor->__rb_parent_color; + successor->__rb_parent_color = pc; + rebalance = __rb_is_black(pc2) ? parent : NULL; } - node->__rb_parent_color = old->__rb_parent_color; - node->rb_left = old->rb_left; - rb_set_parent(old->rb_left, node); } if (rebalance) From dadf93534f125b9eda486b471446a8456a603d27 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:15 -0700 Subject: [PATCH 075/160] rbtree: augmented rbtree test Small test to measure the performance of augmented rbtrees. 
Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree_test.c | 103 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 2 deletions(-) diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index fd09465d82ca..66e41d4bfc39 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -10,6 +10,10 @@ struct test_node { struct rb_node rb; u32 key; + + /* following fields used for testing augmented rbtree functionality */ + u32 val; + u32 augmented; }; static struct rb_root root = RB_ROOT; @@ -20,10 +24,11 @@ static struct rnd_state rnd; static void insert(struct test_node *node, struct rb_root *root) { struct rb_node **new = &root->rb_node, *parent = NULL; + u32 key = node->key; while (*new) { parent = *new; - if (node->key < rb_entry(parent, struct test_node, rb)->key) + if (key < rb_entry(parent, struct test_node, rb)->key) new = &parent->rb_left; else new = &parent->rb_right; @@ -38,11 +43,62 @@ static inline void erase(struct test_node *node, struct rb_root *root) rb_erase(&node->rb, root); } +static inline u32 augment_recompute(struct test_node *node) +{ + u32 max = node->val, child_augmented; + if (node->rb.rb_left) { + child_augmented = rb_entry(node->rb.rb_left, struct test_node, + rb)->augmented; + if (max < child_augmented) + max = child_augmented; + } + if (node->rb.rb_right) { + child_augmented = rb_entry(node->rb.rb_right, struct test_node, + rb)->augmented; + if (max < child_augmented) + max = child_augmented; + } + return max; +} + +static void augment_callback(struct rb_node *rb, void *unused) +{ + struct test_node *node = rb_entry(rb, struct test_node, rb); + node->augmented = augment_recompute(node); +} + +static void insert_augmented(struct test_node *node, struct rb_root *root) +{ + struct rb_node **new = &root->rb_node, *parent = NULL; + u32 key = node->key; + + while (*new) { + parent = *new; + if (key < rb_entry(parent, struct test_node, rb)->key) + new = &parent->rb_left; + else + new = &parent->rb_right; + } + + rb_link_node(&node->rb, parent, new); + rb_insert_color(&node->rb, root); + rb_augment_insert(&node->rb, augment_callback, NULL); +} + +static void erase_augmented(struct test_node *node, struct rb_root *root) +{ + struct rb_node *deepest = rb_augment_erase_begin(&node->rb); + rb_erase(&node->rb, root); + rb_augment_erase_end(deepest, augment_callback, NULL); +} + static void init(void) { int i; - for (i = 0; i < NODES; i++) + for (i = 0; i < NODES; i++) { nodes[i].key = prandom32(&rnd); + nodes[i].val = prandom32(&rnd); + } } static bool is_red(struct rb_node *rb) @@ -81,6 +137,17 @@ static void check(int nr_nodes) WARN_ON_ONCE(count != nr_nodes); } +static void check_augmented(int nr_nodes) +{ + struct rb_node *rb; + + check(nr_nodes); + for (rb = rb_first(&root); rb; rb = rb_next(rb)) { + struct test_node *node = rb_entry(rb, struct test_node, rb); + WARN_ON_ONCE(node->augmented != augment_recompute(node)); + } +} + static int rbtree_test_init(void) { int i, j; @@ -119,6 +186,38 @@ static int rbtree_test_init(void) check(0); } + printk(KERN_ALERT "augmented rbtree testing"); + + init(); + + time1 = get_cycles(); + + for (i = 0; i < PERF_LOOPS; i++) { + for (j = 0; j < NODES; j++) + insert_augmented(nodes + j, &root); + for (j = 0; j < NODES; j++) + erase_augmented(nodes + j, &root); + } + + time2 = get_cycles(); + time = time2 - time1; + + time = div_u64(time, PERF_LOOPS); + printk(" -> %llu cycles\n", 
(unsigned long long)time); + + for (i = 0; i < CHECK_LOOPS; i++) { + init(); + for (j = 0; j < NODES; j++) { + check_augmented(j); + insert_augmented(nodes + j, &root); + } + for (j = 0; j < NODES; j++) { + check_augmented(NODES - j); + erase_augmented(nodes + j, &root); + } + check_augmented(0); + } + return -EAGAIN; /* Fail will directly unload the module */ } From 14b94af0b251a2c80885b60538166fb7d04a642e Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:17 -0700 Subject: [PATCH 076/160] rbtree: faster augmented rbtree manipulation Introduce new augmented rbtree APIs that allow minimal recalculation of augmented node information. A new callback is added to the rbtree insertion and erase rebalancing functions, to be called on each tree rotations. Such rotations preserve the subtree's root augmented value, but require recalculation of the one child that was previously located at the subtree root. In the insertion case, the handcoded search phase must be updated to maintain the augmented information on insertion, and then the rbtree coloring/rebalancing algorithms keep it up to date. In the erase case, things are more complicated since it is library code that manipulates the rbtree in order to remove internal nodes. This requires a couple additional callbacks to copy a subtree's augmented value when a new root is stitched in, and to recompute augmented values down the ancestry path when a node is removed from the tree. In order to preserve maximum speed for the non-augmented case, we provide two versions of each tree manipulation function. rb_insert_augmented() is the augmented equivalent of rb_insert_color(), and rb_erase_augmented() is the augmented equivalent of rb_erase(). Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/rbtree.txt | 190 ++++++++++++++++++++++++++++++++------- include/linux/rbtree.h | 19 ++++ lib/rbtree.c | 83 +++++++++++++++-- lib/rbtree_test.c | 58 +++++++++--- 4 files changed, 296 insertions(+), 54 deletions(-) diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt index 8d32d85a5234..0a0b6dce3e08 100644 --- a/Documentation/rbtree.txt +++ b/Documentation/rbtree.txt @@ -193,24 +193,42 @@ Example: Support for Augmented rbtrees ----------------------------- -Augmented rbtree is an rbtree with "some" additional data stored in each node. -This data can be used to augment some new functionality to rbtree. -Augmented rbtree is an optional feature built on top of basic rbtree -infrastructure. An rbtree user who wants this feature will have to call the -augmentation functions with the user provided augmentation callback -when inserting and erasing nodes. +Augmented rbtree is an rbtree with "some" additional data stored in +each node, where the additional data for node N must be a function of +the contents of all nodes in the subtree rooted at N. This data can +be used to augment some new functionality to rbtree. Augmented rbtree +is an optional feature built on top of basic rbtree infrastructure. +An rbtree user who wants this feature will have to call the augmentation +functions with the user provided augmentation callback when inserting +and erasing nodes. -On insertion, the user must call rb_augment_insert() once the new node is in -place. This will cause the augmentation function callback to be called for -each node between the new node and the root which has been affected by the -insertion. 
+On insertion, the user must update the augmented information on the path +leading to the inserted node, then call rb_link_node() as usual and +rb_augment_inserted() instead of the usual rb_insert_color() call. +If rb_augment_inserted() rebalances the rbtree, it will callback into +a user provided function to update the augmented information on the +affected subtrees. -When erasing a node, the user must call rb_augment_erase_begin() first to -retrieve the deepest node on the rebalance path. Then, after erasing the -original node, the user must call rb_augment_erase_end() with the deepest -node found earlier. This will cause the augmentation function to be called -for each affected node between the deepest node and the root. +When erasing a node, the user must call rb_erase_augmented() instead of +rb_erase(). rb_erase_augmented() calls back into user provided functions +to updated the augmented information on affected subtrees. +In both cases, the callbacks are provided through struct rb_augment_callbacks. +3 callbacks must be defined: + +- A propagation callback, which updates the augmented value for a given + node and its ancestors, up to a given stop point (or NULL to update + all the way to the root). + +- A copy callback, which copies the augmented value for a given subtree + to a newly assigned subtree root. + +- A tree rotation callback, which copies the augmented value for a given + subtree to a newly assigned subtree root AND recomputes the augmented + information for the former subtree root. + + +Sample usage: Interval tree is an example of augmented rb tree. Reference - "Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein. @@ -230,26 +248,132 @@ and its immediate children. And this will be used in O(log n) lookup for lowest match (lowest start address among all possible matches) with something like: -find_lowest_match(lo, hi, node) +struct interval_tree_node * +interval_tree_first_match(struct rb_root *root, + unsigned long start, unsigned long last) { - lowest_match = NULL; - while (node) { - if (max_hi(node->left) > lo) { - // Lowest overlap if any must be on left side - node = node->left; - } else if (overlap(lo, hi, node)) { - lowest_match = node; - break; - } else if (lo > node->lo) { - // Lowest overlap if any must be on right side - node = node->right; - } else { - break; + struct interval_tree_node *node; + + if (!root->rb_node) + return NULL; + node = rb_entry(root->rb_node, struct interval_tree_node, rb); + + while (true) { + if (node->rb.rb_left) { + struct interval_tree_node *left = + rb_entry(node->rb.rb_left, + struct interval_tree_node, rb); + if (left->__subtree_last >= start) { + /* + * Some nodes in left subtree satisfy Cond2. + * Iterate to find the leftmost such node N. + * If it also satisfies Cond1, that's the match + * we are looking for. Otherwise, there is no + * matching interval as nodes to the right of N + * can't satisfy Cond1 either. + */ + node = left; + continue; + } } + if (node->start <= last) { /* Cond1 */ + if (node->last >= start) /* Cond2 */ + return node; /* node is leftmost match */ + if (node->rb.rb_right) { + node = rb_entry(node->rb.rb_right, + struct interval_tree_node, rb); + if (node->__subtree_last >= start) + continue; + } + } + return NULL; /* No match */ } - return lowest_match; } -Finding exact match will be to first find lowest match and then to follow -successor nodes looking for exact match, until the start of a node is beyond -the hi value we are looking for. 
+Insertion/removal are defined using the following augmented callbacks: + +static inline unsigned long +compute_subtree_last(struct interval_tree_node *node) +{ + unsigned long max = node->last, subtree_last; + if (node->rb.rb_left) { + subtree_last = rb_entry(node->rb.rb_left, + struct interval_tree_node, rb)->__subtree_last; + if (max < subtree_last) + max = subtree_last; + } + if (node->rb.rb_right) { + subtree_last = rb_entry(node->rb.rb_right, + struct interval_tree_node, rb)->__subtree_last; + if (max < subtree_last) + max = subtree_last; + } + return max; +} + +static void augment_propagate(struct rb_node *rb, struct rb_node *stop) +{ + while (rb != stop) { + struct interval_tree_node *node = + rb_entry(rb, struct interval_tree_node, rb); + unsigned long subtree_last = compute_subtree_last(node); + if (node->__subtree_last == subtree_last) + break; + node->__subtree_last = subtree_last; + rb = rb_parent(&node->rb); + } +} + +static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) +{ + struct interval_tree_node *old = + rb_entry(rb_old, struct interval_tree_node, rb); + struct interval_tree_node *new = + rb_entry(rb_new, struct interval_tree_node, rb); + + new->__subtree_last = old->__subtree_last; +} + +static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) +{ + struct interval_tree_node *old = + rb_entry(rb_old, struct interval_tree_node, rb); + struct interval_tree_node *new = + rb_entry(rb_new, struct interval_tree_node, rb); + + new->__subtree_last = old->__subtree_last; + old->__subtree_last = compute_subtree_last(old); +} + +static const struct rb_augment_callbacks augment_callbacks = { + augment_propagate, augment_copy, augment_rotate +}; + +void interval_tree_insert(struct interval_tree_node *node, + struct rb_root *root) +{ + struct rb_node **link = &root->rb_node, *rb_parent = NULL; + unsigned long start = node->start, last = node->last; + struct interval_tree_node *parent; + + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, struct interval_tree_node, rb); + if (parent->__subtree_last < last) + parent->__subtree_last = last; + if (start < parent->start) + link = &parent->rb.rb_left; + else + link = &parent->rb.rb_right; + } + + node->__subtree_last = last; + rb_link_node(&node->rb, rb_parent, link); + rb_insert_augmented(&node->rb, root, &augment_callbacks); +} + +void interval_tree_remove(struct interval_tree_node *node, + struct rb_root *root) +{ + rb_erase_augmented(&node->rb, root, &augment_callbacks); +} diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index bf836a2c6a32..c902eb9d6506 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -61,6 +61,25 @@ struct rb_root { extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); + +struct rb_augment_callbacks { + void (*propagate)(struct rb_node *node, struct rb_node *stop); + void (*copy)(struct rb_node *old, struct rb_node *new); + void (*rotate)(struct rb_node *old, struct rb_node *new); +}; + +extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); +extern void rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment); +static inline void +rb_insert_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + __rb_insert_augmented(node, root, augment->rotate); +} + + typedef void 
(*rb_augment_f)(struct rb_node *node, void *data); extern void rb_augment_insert(struct rb_node *node, diff --git a/lib/rbtree.c b/lib/rbtree.c index 938061ecbe61..a37ee7954b8f 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -105,7 +105,9 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, __rb_change_child(old, new, parent, root); } -void rb_insert_color(struct rb_node *node, struct rb_root *root) +static __always_inline void +__rb_insert(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; @@ -169,6 +171,7 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) rb_set_parent_color(tmp, parent, RB_BLACK); rb_set_parent_color(parent, node, RB_RED); + augment_rotate(parent, node); parent = node; tmp = node->rb_right; } @@ -187,6 +190,7 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) if (tmp) rb_set_parent_color(tmp, gparent, RB_BLACK); __rb_rotate_set_parents(gparent, parent, root, RB_RED); + augment_rotate(gparent, parent); break; } else { tmp = gparent->rb_left; @@ -209,6 +213,7 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) rb_set_parent_color(tmp, parent, RB_BLACK); rb_set_parent_color(parent, node, RB_RED); + augment_rotate(parent, node); parent = node; tmp = node->rb_left; } @@ -219,13 +224,15 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) if (tmp) rb_set_parent_color(tmp, gparent, RB_BLACK); __rb_rotate_set_parents(gparent, parent, root, RB_RED); + augment_rotate(gparent, parent); break; } } } -EXPORT_SYMBOL(rb_insert_color); -static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) +static __always_inline void +__rb_erase_color(struct rb_node *parent, struct rb_root *root, + const struct rb_augment_callbacks *augment) { struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; @@ -254,6 +261,7 @@ static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); + augment->rotate(parent, sibling); sibling = tmp1; } tmp1 = sibling->rb_right; @@ -305,6 +313,7 @@ static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); + augment->rotate(sibling, tmp2); tmp1 = sibling; sibling = tmp2; } @@ -327,6 +336,7 @@ static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) rb_set_parent(tmp2, parent); __rb_rotate_set_parents(parent, sibling, root, RB_BLACK); + augment->rotate(parent, sibling); break; } else { sibling = parent->rb_left; @@ -337,6 +347,7 @@ static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); + augment->rotate(parent, sibling); sibling = tmp1; } tmp1 = sibling->rb_left; @@ -363,6 +374,7 @@ static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); + augment->rotate(sibling, tmp2); tmp1 = sibling; sibling = tmp2; } @@ -374,12 +386,15 @@ static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) rb_set_parent(tmp2, parent); __rb_rotate_set_parents(parent, sibling, root, RB_BLACK); + augment->rotate(parent, sibling); break; } } } -void rb_erase(struct rb_node *node, struct rb_root *root) +static __always_inline void +__rb_erase(struct rb_node *node, struct 
rb_root *root, + const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right, *tmp = node->rb_left; struct rb_node *parent, *rebalance; @@ -401,12 +416,14 @@ void rb_erase(struct rb_node *node, struct rb_root *root) rebalance = NULL; } else rebalance = __rb_is_black(pc) ? parent : NULL; + tmp = parent; } else if (!child) { /* Still case 1, but this time the child is node->rb_left */ tmp->__rb_parent_color = pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, tmp, parent, root); rebalance = NULL; + tmp = parent; } else { struct rb_node *successor = child, *child2; tmp = child->rb_left; @@ -420,8 +437,9 @@ void rb_erase(struct rb_node *node, struct rb_root *root) * \ * (c) */ - parent = child; - child2 = child->rb_right; + parent = successor; + child2 = successor->rb_right; + augment->copy(node, successor); } else { /* * Case 3: node's successor is leftmost under @@ -445,6 +463,8 @@ void rb_erase(struct rb_node *node, struct rb_root *root) parent->rb_left = child2 = successor->rb_right; successor->rb_right = child; rb_set_parent(child, successor); + augment->copy(node, successor); + augment->propagate(parent, successor); } successor->rb_left = tmp = node->rb_left; @@ -462,13 +482,62 @@ void rb_erase(struct rb_node *node, struct rb_root *root) successor->__rb_parent_color = pc; rebalance = __rb_is_black(pc2) ? parent : NULL; } + tmp = successor; } + augment->propagate(tmp, NULL); if (rebalance) - __rb_erase_color(rebalance, root); + __rb_erase_color(rebalance, root, augment); +} + +/* + * Non-augmented rbtree manipulation functions. + * + * We use dummy augmented callbacks here, and have the compiler optimize them + * out of the rb_insert_color() and rb_erase() function definitions. + */ + +static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {} +static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {} +static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {} + +static const struct rb_augment_callbacks dummy_callbacks = { + dummy_propagate, dummy_copy, dummy_rotate +}; + +void rb_insert_color(struct rb_node *node, struct rb_root *root) +{ + __rb_insert(node, root, dummy_rotate); +} +EXPORT_SYMBOL(rb_insert_color); + +void rb_erase(struct rb_node *node, struct rb_root *root) +{ + __rb_erase(node, root, &dummy_callbacks); } EXPORT_SYMBOL(rb_erase); +/* + * Augmented rbtree manipulation functions. + * + * This instantiates the same __always_inline functions as in the non-augmented + * case, but this time with user-defined callbacks. 
+ */ + +void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) +{ + __rb_insert(node, root, augment_rotate); +} +EXPORT_SYMBOL(__rb_insert_augmented); + +void rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + __rb_erase(node, root, augment); +} +EXPORT_SYMBOL(rb_erase_augmented); + static void rb_augment_path(struct rb_node *node, rb_augment_f func, void *data) { struct rb_node *parent; diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 66e41d4bfc39..e28345df09bf 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -61,35 +61,65 @@ static inline u32 augment_recompute(struct test_node *node) return max; } -static void augment_callback(struct rb_node *rb, void *unused) +static void augment_propagate(struct rb_node *rb, struct rb_node *stop) { - struct test_node *node = rb_entry(rb, struct test_node, rb); - node->augmented = augment_recompute(node); + while (rb != stop) { + struct test_node *node = rb_entry(rb, struct test_node, rb); + u32 augmented = augment_recompute(node); + if (node->augmented == augmented) + break; + node->augmented = augmented; + rb = rb_parent(&node->rb); + } } +static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) +{ + struct test_node *old = rb_entry(rb_old, struct test_node, rb); + struct test_node *new = rb_entry(rb_new, struct test_node, rb); + new->augmented = old->augmented; +} + +static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) +{ + struct test_node *old = rb_entry(rb_old, struct test_node, rb); + struct test_node *new = rb_entry(rb_new, struct test_node, rb); + + /* Rotation doesn't change subtree's augmented value */ + new->augmented = old->augmented; + old->augmented = augment_recompute(old); +} + +static const struct rb_augment_callbacks augment_callbacks = { + augment_propagate, augment_copy, augment_rotate +}; + static void insert_augmented(struct test_node *node, struct rb_root *root) { - struct rb_node **new = &root->rb_node, *parent = NULL; + struct rb_node **new = &root->rb_node, *rb_parent = NULL; u32 key = node->key; + u32 val = node->val; + struct test_node *parent; while (*new) { - parent = *new; - if (key < rb_entry(parent, struct test_node, rb)->key) - new = &parent->rb_left; + rb_parent = *new; + parent = rb_entry(rb_parent, struct test_node, rb); + if (parent->augmented < val) + parent->augmented = val; + if (key < parent->key) + new = &parent->rb.rb_left; else - new = &parent->rb_right; + new = &parent->rb.rb_right; } - rb_link_node(&node->rb, parent, new); - rb_insert_color(&node->rb, root); - rb_augment_insert(&node->rb, augment_callback, NULL); + node->augmented = val; + rb_link_node(&node->rb, rb_parent, new); + rb_insert_augmented(&node->rb, root, &augment_callbacks); } static void erase_augmented(struct test_node *node, struct rb_root *root) { - struct rb_node *deepest = rb_augment_erase_begin(&node->rb); - rb_erase(&node->rb, root); - rb_augment_erase_end(deepest, augment_callback, NULL); + rb_erase_augmented(&node->rb, root, &augment_callbacks); } static void init(void) From 9d9e6f9703bbd642f3f2f807e6aaa642a4cbcec9 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:20 -0700 Subject: [PATCH 077/160] rbtree: remove prior augmented rbtree implementation convert arch/x86/mm/pat_rbtree.c to the proposed augmented rbtree api and remove the old augmented rbtree implementation. 
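For readers unfamiliar with the new interface, the conversion pattern looks roughly like the sketch below. This is an illustrative example only, not code from the patch: the foo_* names and the max-'last' augmentation are made up, while struct rb_augment_callbacks, rb_insert_augmented() and rb_erase_augmented() are the interfaces introduced in the previous patch.

#include <linux/rbtree.h>

/* Hypothetical user: nodes keyed by 'start', each caching the largest
 * 'last' value present anywhere in its subtree. */
struct foo_node {
        struct rb_node rb;
        unsigned long start, last;
        unsigned long subtree_last;
};

/* Recompute the cached value from the node itself and its two children. */
static unsigned long foo_compute_subtree_last(struct foo_node *node)
{
        unsigned long max = node->last, child;

        if (node->rb.rb_left) {
                child = rb_entry(node->rb.rb_left, struct foo_node,
                                 rb)->subtree_last;
                if (max < child)
                        max = child;
        }
        if (node->rb.rb_right) {
                child = rb_entry(node->rb.rb_right, struct foo_node,
                                 rb)->subtree_last;
                if (max < child)
                        max = child;
        }
        return max;
}

/* Propagate: walk towards the root (up to 'stop'), fixing stale values. */
static void foo_propagate(struct rb_node *rb, struct rb_node *stop)
{
        while (rb != stop) {
                struct foo_node *node = rb_entry(rb, struct foo_node, rb);
                unsigned long subtree_last = foo_compute_subtree_last(node);

                if (node->subtree_last == subtree_last)
                        break;
                node->subtree_last = subtree_last;
                rb = rb_parent(&node->rb);
        }
}

/* Copy: a newly stitched-in subtree root inherits the old root's value. */
static void foo_copy(struct rb_node *rb_old, struct rb_node *rb_new)
{
        rb_entry(rb_new, struct foo_node, rb)->subtree_last =
                rb_entry(rb_old, struct foo_node, rb)->subtree_last;
}

/* Rotate: as copy, but the former subtree root must also be recomputed. */
static void foo_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
{
        struct foo_node *old = rb_entry(rb_old, struct foo_node, rb);
        struct foo_node *new = rb_entry(rb_new, struct foo_node, rb);

        new->subtree_last = old->subtree_last;
        old->subtree_last = foo_compute_subtree_last(old);
}

static const struct rb_augment_callbacks foo_callbacks = {
        foo_propagate, foo_copy, foo_rotate
};

/* Insert: maintain the augmented value along the search path, then let
 * rb_insert_augmented() keep it correct through rebalancing rotations. */
static void foo_insert(struct foo_node *node, struct rb_root *root)
{
        struct rb_node **link = &root->rb_node, *rb_parent = NULL;
        unsigned long start = node->start, last = node->last;
        struct foo_node *parent;

        while (*link) {
                rb_parent = *link;
                parent = rb_entry(rb_parent, struct foo_node, rb);
                if (parent->subtree_last < last)
                        parent->subtree_last = last;
                if (start < parent->start)
                        link = &parent->rb.rb_left;
                else
                        link = &parent->rb.rb_right;
        }

        node->subtree_last = last;
        rb_link_node(&node->rb, rb_parent, link);
        rb_insert_augmented(&node->rb, root, &foo_callbacks);
}

static void foo_remove(struct foo_node *node, struct rb_root *root)
{
        rb_erase_augmented(&node->rb, root, &foo_callbacks);
}

Defining the three callbacks by hand is the boilerplate that the RB_DECLARE_CALLBACKS() macro, added in the next patch of this series, reduces to a single declaration.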
Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat_rbtree.c | 65 +++++++++++++++++++++++++----------- include/linux/rbtree.h | 8 ----- lib/rbtree.c | 71 ---------------------------------------- 3 files changed, 46 insertions(+), 98 deletions(-) diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 8acaddd0fb21..7e1515bd4770 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -54,29 +54,57 @@ static u64 get_subtree_max_end(struct rb_node *node) return ret; } -/* Update 'subtree_max_end' for a node, based on node and its children */ -static void memtype_rb_augment_cb(struct rb_node *node, void *__unused) +static u64 compute_subtree_max_end(struct memtype *data) { - struct memtype *data; - u64 max_end, child_max_end; + u64 max_end = data->end, child_max_end; - if (!node) - return; - - data = container_of(node, struct memtype, rb); - max_end = data->end; - - child_max_end = get_subtree_max_end(node->rb_right); + child_max_end = get_subtree_max_end(data->rb.rb_right); if (child_max_end > max_end) max_end = child_max_end; - child_max_end = get_subtree_max_end(node->rb_left); + child_max_end = get_subtree_max_end(data->rb.rb_left); if (child_max_end > max_end) max_end = child_max_end; - data->subtree_max_end = max_end; + return max_end; } +/* Update 'subtree_max_end' for node and its parents */ +static void memtype_rb_propagate_cb(struct rb_node *node, struct rb_node *stop) +{ + while (node != stop) { + struct memtype *data = container_of(node, struct memtype, rb); + u64 subtree_max_end = compute_subtree_max_end(data); + if (data->subtree_max_end == subtree_max_end) + break; + data->subtree_max_end = subtree_max_end; + node = rb_parent(&data->rb); + } +} + +static void memtype_rb_copy_cb(struct rb_node *old, struct rb_node *new) +{ + struct memtype *old_data = container_of(old, struct memtype, rb); + struct memtype *new_data = container_of(new, struct memtype, rb); + + new_data->subtree_max_end = old_data->subtree_max_end; +} + +/* Update 'subtree_max_end' after tree rotation. 
old and new are the + * former and current subtree roots */ +static void memtype_rb_rotate_cb(struct rb_node *old, struct rb_node *new) +{ + struct memtype *old_data = container_of(old, struct memtype, rb); + struct memtype *new_data = container_of(new, struct memtype, rb); + + new_data->subtree_max_end = old_data->subtree_max_end; + old_data->subtree_max_end = compute_subtree_max_end(old_data); +} + +static const struct rb_augment_callbacks memtype_rb_augment_cb = { + memtype_rb_propagate_cb, memtype_rb_copy_cb, memtype_rb_rotate_cb +}; + /* Find the first (lowest start addr) overlapping range from rb tree */ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, u64 start, u64 end) @@ -179,15 +207,17 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) struct memtype *data = container_of(*node, struct memtype, rb); parent = *node; + if (data->subtree_max_end < newdata->end) + data->subtree_max_end = newdata->end; if (newdata->start <= data->start) node = &((*node)->rb_left); else if (newdata->start > data->start) node = &((*node)->rb_right); } + newdata->subtree_max_end = newdata->end; rb_link_node(&newdata->rb, parent, node); - rb_insert_color(&newdata->rb, root); - rb_augment_insert(&newdata->rb, memtype_rb_augment_cb, NULL); + rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb); } int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) @@ -209,16 +239,13 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) struct memtype *rbt_memtype_erase(u64 start, u64 end) { - struct rb_node *deepest; struct memtype *data; data = memtype_rb_exact_match(&memtype_rbroot, start, end); if (!data) goto out; - deepest = rb_augment_erase_begin(&data->rb); - rb_erase(&data->rb, &memtype_rbroot); - rb_augment_erase_end(deepest, memtype_rb_augment_cb, NULL); + rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb); out: return data; } diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index c902eb9d6506..4ace31b33380 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -80,14 +80,6 @@ rb_insert_augmented(struct rb_node *node, struct rb_root *root, } -typedef void (*rb_augment_f)(struct rb_node *node, void *data); - -extern void rb_augment_insert(struct rb_node *node, - rb_augment_f func, void *data); -extern struct rb_node *rb_augment_erase_begin(struct rb_node *node); -extern void rb_augment_erase_end(struct rb_node *node, - rb_augment_f func, void *data); - /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); diff --git a/lib/rbtree.c b/lib/rbtree.c index a37ee7954b8f..c0088ca345f9 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -538,77 +538,6 @@ void rb_erase_augmented(struct rb_node *node, struct rb_root *root, } EXPORT_SYMBOL(rb_erase_augmented); -static void rb_augment_path(struct rb_node *node, rb_augment_f func, void *data) -{ - struct rb_node *parent; - -up: - func(node, data); - parent = rb_parent(node); - if (!parent) - return; - - if (node == parent->rb_left && parent->rb_right) - func(parent->rb_right, data); - else if (parent->rb_left) - func(parent->rb_left, data); - - node = parent; - goto up; -} - -/* - * after inserting @node into the tree, update the tree to account for - * both the new entry and any damage done by rebalance - */ -void rb_augment_insert(struct rb_node *node, rb_augment_f func, void *data) -{ - if (node->rb_left) - node = 
node->rb_left; - else if (node->rb_right) - node = node->rb_right; - - rb_augment_path(node, func, data); -} -EXPORT_SYMBOL(rb_augment_insert); - -/* - * before removing the node, find the deepest node on the rebalance path - * that will still be there after @node gets removed - */ -struct rb_node *rb_augment_erase_begin(struct rb_node *node) -{ - struct rb_node *deepest; - - if (!node->rb_right && !node->rb_left) - deepest = rb_parent(node); - else if (!node->rb_right) - deepest = node->rb_left; - else if (!node->rb_left) - deepest = node->rb_right; - else { - deepest = rb_next(node); - if (deepest->rb_right) - deepest = deepest->rb_right; - else if (rb_parent(deepest) != node) - deepest = rb_parent(deepest); - } - - return deepest; -} -EXPORT_SYMBOL(rb_augment_erase_begin); - -/* - * after removal, update the tree to account for the removed entry - * and any rebalance damage. - */ -void rb_augment_erase_end(struct rb_node *node, rb_augment_f func, void *data) -{ - if (node) - rb_augment_path(node, func, data); -} -EXPORT_SYMBOL(rb_augment_erase_end); - /* * This function returns the first node (in sort order) of the tree. */ From 3908836aa77e3621aaf2101f2920e01d7c8460d6 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:21 -0700 Subject: [PATCH 078/160] rbtree: add RB_DECLARE_CALLBACKS() macro As proposed by Peter Zijlstra, this makes it easier to define the augmented rbtree callbacks. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat_rbtree.c | 37 ++----------------------------------- include/linux/rbtree.h | 30 ++++++++++++++++++++++++++++++ lib/rbtree_test.c | 34 ++-------------------------------- 3 files changed, 34 insertions(+), 67 deletions(-) diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 7e1515bd4770..4d116959075d 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -69,41 +69,8 @@ static u64 compute_subtree_max_end(struct memtype *data) return max_end; } -/* Update 'subtree_max_end' for node and its parents */ -static void memtype_rb_propagate_cb(struct rb_node *node, struct rb_node *stop) -{ - while (node != stop) { - struct memtype *data = container_of(node, struct memtype, rb); - u64 subtree_max_end = compute_subtree_max_end(data); - if (data->subtree_max_end == subtree_max_end) - break; - data->subtree_max_end = subtree_max_end; - node = rb_parent(&data->rb); - } -} - -static void memtype_rb_copy_cb(struct rb_node *old, struct rb_node *new) -{ - struct memtype *old_data = container_of(old, struct memtype, rb); - struct memtype *new_data = container_of(new, struct memtype, rb); - - new_data->subtree_max_end = old_data->subtree_max_end; -} - -/* Update 'subtree_max_end' after tree rotation. 
old and new are the - * former and current subtree roots */ -static void memtype_rb_rotate_cb(struct rb_node *old, struct rb_node *new) -{ - struct memtype *old_data = container_of(old, struct memtype, rb); - struct memtype *new_data = container_of(new, struct memtype, rb); - - new_data->subtree_max_end = old_data->subtree_max_end; - old_data->subtree_max_end = compute_subtree_max_end(old_data); -} - -static const struct rb_augment_callbacks memtype_rb_augment_cb = { - memtype_rb_propagate_cb, memtype_rb_copy_cb, memtype_rb_rotate_cb -}; +RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb, + u64, subtree_max_end, compute_subtree_max_end) /* Find the first (lowest start addr) overlapping range from rb tree */ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 4ace31b33380..8d1e83b1c87b 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -79,6 +79,36 @@ rb_insert_augmented(struct rb_node *node, struct rb_root *root, __rb_insert_augmented(node, root, augment->rotate); } +#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ + rbtype, rbaugmented, rbcompute) \ +static void rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +{ \ + while (rb != stop) { \ + rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ + rbtype augmented = rbcompute(node); \ + if (node->rbaugmented == augmented) \ + break; \ + node->rbaugmented = augmented; \ + rb = rb_parent(&node->rbfield); \ + } \ +} \ +static void rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + new->rbaugmented = old->rbaugmented; \ +} \ +static void rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + new->rbaugmented = old->rbaugmented; \ + old->rbaugmented = rbcompute(old); \ +} \ +rbstatic const struct rb_augment_callbacks rbname = { \ + rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ +}; + /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index e28345df09bf..b20e99969b0f 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -61,38 +61,8 @@ static inline u32 augment_recompute(struct test_node *node) return max; } -static void augment_propagate(struct rb_node *rb, struct rb_node *stop) -{ - while (rb != stop) { - struct test_node *node = rb_entry(rb, struct test_node, rb); - u32 augmented = augment_recompute(node); - if (node->augmented == augmented) - break; - node->augmented = augmented; - rb = rb_parent(&node->rb); - } -} - -static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) -{ - struct test_node *old = rb_entry(rb_old, struct test_node, rb); - struct test_node *new = rb_entry(rb_new, struct test_node, rb); - new->augmented = old->augmented; -} - -static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) -{ - struct test_node *old = rb_entry(rb_old, struct test_node, rb); - struct test_node *new = rb_entry(rb_new, struct test_node, rb); - - /* Rotation doesn't change subtree's augmented value */ - new->augmented = old->augmented; - old->augmented = augment_recompute(old); -} - -static const struct rb_augment_callbacks augment_callbacks = { - augment_propagate, 
augment_copy, augment_rotate -}; +RB_DECLARE_CALLBACKS(static, augment_callbacks, struct test_node, rb, + u32, augmented, augment_recompute) static void insert_augmented(struct test_node *node, struct rb_root *root) { From fff3fd8a1210a165252cd7cd01206da7a90d3a06 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:23 -0700 Subject: [PATCH 079/160] rbtree: add prio tree and interval tree tests Patch 1 implements support for interval trees, on top of the augmented rbtree API. It also adds synthetic tests to compare the performance of interval trees vs prio trees. Short answers is that interval trees are slightly faster (~25%) on insert/erase, and much faster (~2.4 - 3x) on search. It is debatable how realistic the synthetic test is, and I have not made such measurements yet, but my impression is that interval trees would still come out faster. Patch 2 uses a preprocessor template to make the interval tree generic, and uses it as a replacement for the vma prio_tree. Patch 3 takes the other prio_tree user, kmemleak, and converts it to use a basic rbtree. We don't actually need the augmented rbtree support here because the intervals are always non-overlapping. Patch 4 removes the now-unused prio tree library. Patch 5 proposes an additional optimization to rb_erase_augmented, now providing it as an inline function so that the augmented callbacks can be inlined in. This provides an additional 5-10% performance improvement for the interval tree insert/erase benchmark. There is a maintainance cost as it exposes augmented rbtree users to some of the rbtree library internals; however I think this cost shouldn't be too high as I expect the augmented rbtree will always have much less users than the base rbtree. I should probably add a quick summary of why I think it makes sense to replace prio trees with augmented rbtree based interval trees now. One of the drivers is that we need augmented rbtrees for Rik's vma gap finding code, and once you have them, it just makes sense to use them for interval trees as well, as this is the simpler and more well known algorithm. prio trees, in comparison, seem *too* clever: they impose an additional 'heap' constraint on the tree, which they use to guarantee a faster worst-case complexity of O(k+log N) for stabbing queries in a well-balanced prio tree, vs O(k*log N) for interval trees (where k=number of matches, N=number of intervals). Now this sounds great, but in practice prio trees don't realize this theorical benefit. First, the additional constraint makes them harder to update, so that the kernel implementation has to simplify things by balancing them like a radix tree, which is not always ideal. Second, the fact that there are both index and heap properties makes both tree manipulation and search more complex, which results in a higher multiplicative time constant. As it turns out, the simple interval tree algorithm ends up running faster than the more clever prio tree. This patch: Add two test modules: - prio_tree_test measures the performance of lib/prio_tree.c, both for insertion/removal and for stabbing searches - interval_tree_test measures the performance of a library of equivalent functionality, built using the augmented rbtree support. In order to support the second test module, lib/interval_tree.c is introduced. 
It is kept separate from the interval_tree_test main file for two reasons: first we don't want to provide an unfair advantage over prio_tree_test by having everything in a single compilation unit, and second there is the possibility that the interval tree functionality could get some non-test users in kernel over time. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/interval_tree.h | 27 ++++++ lib/Kconfig.debug | 12 +++ lib/Makefile | 4 + lib/interval_tree.c | 159 ++++++++++++++++++++++++++++++++++ lib/interval_tree_test_main.c | 105 ++++++++++++++++++++++ lib/prio_tree.c | 4 + lib/prio_tree_test.c | 106 +++++++++++++++++++++++ 7 files changed, 417 insertions(+) create mode 100644 include/linux/interval_tree.h create mode 100644 lib/interval_tree.c create mode 100644 lib/interval_tree_test_main.c create mode 100644 lib/prio_tree_test.c diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h new file mode 100644 index 000000000000..724556aa3c95 --- /dev/null +++ b/include/linux/interval_tree.h @@ -0,0 +1,27 @@ +#ifndef _LINUX_INTERVAL_TREE_H +#define _LINUX_INTERVAL_TREE_H + +#include + +struct interval_tree_node { + struct rb_node rb; + unsigned long start; /* Start of interval */ + unsigned long last; /* Last location _in_ interval */ + unsigned long __subtree_last; +}; + +extern void +interval_tree_insert(struct interval_tree_node *node, struct rb_root *root); + +extern void +interval_tree_remove(struct interval_tree_node *node, struct rb_root *root); + +extern struct interval_tree_node * +interval_tree_iter_first(struct rb_root *root, + unsigned long start, unsigned long last); + +extern struct interval_tree_node * +interval_tree_iter_next(struct interval_tree_node *node, + unsigned long start, unsigned long last); + +#endif /* _LINUX_INTERVAL_TREE_H */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a4e5d93b0f41..ee9f030b6951 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1289,6 +1289,18 @@ config RBTREE_TEST A benchmark measuring the performance of the rbtree library. Also includes rbtree invariant checks. 
+config PRIO_TREE_TEST + tristate "Prio tree test" + depends on m && DEBUG_KERNEL + help + A benchmark measuring the performance of the prio tree library + +config INTERVAL_TREE_TEST + tristate "Interval tree test" + depends on m && DEBUG_KERNEL + help + A benchmark measuring the performance of the interval tree library + config PROVIDE_OHCI1394_DMA_INIT bool "Remote debugging over FireWire early on boot" depends on PCI && X86 diff --git a/lib/Makefile b/lib/Makefile index f49445d26b65..26f578bf616a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -141,6 +141,10 @@ $(foreach file, $(libfdt_files), \ lib-$(CONFIG_LIBFDT) += $(libfdt_files) obj-$(CONFIG_RBTREE_TEST) += rbtree_test.o +obj-$(CONFIG_PRIO_TREE_TEST) += prio_tree_test.o +obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o + +interval_tree_test-objs := interval_tree_test_main.o interval_tree.o hostprogs-y := gen_crc32table clean-files := crc32table.h diff --git a/lib/interval_tree.c b/lib/interval_tree.c new file mode 100644 index 000000000000..6fd540b1e499 --- /dev/null +++ b/lib/interval_tree.c @@ -0,0 +1,159 @@ +#include +#include + +/* Callbacks for augmented rbtree insert and remove */ + +static inline unsigned long +compute_subtree_last(struct interval_tree_node *node) +{ + unsigned long max = node->last, subtree_last; + if (node->rb.rb_left) { + subtree_last = rb_entry(node->rb.rb_left, + struct interval_tree_node, rb)->__subtree_last; + if (max < subtree_last) + max = subtree_last; + } + if (node->rb.rb_right) { + subtree_last = rb_entry(node->rb.rb_right, + struct interval_tree_node, rb)->__subtree_last; + if (max < subtree_last) + max = subtree_last; + } + return max; +} + +RB_DECLARE_CALLBACKS(static, augment_callbacks, struct interval_tree_node, rb, + unsigned long, __subtree_last, compute_subtree_last) + +/* Insert / remove interval nodes from the tree */ + +void interval_tree_insert(struct interval_tree_node *node, + struct rb_root *root) +{ + struct rb_node **link = &root->rb_node, *rb_parent = NULL; + unsigned long start = node->start, last = node->last; + struct interval_tree_node *parent; + + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, struct interval_tree_node, rb); + if (parent->__subtree_last < last) + parent->__subtree_last = last; + if (start < parent->start) + link = &parent->rb.rb_left; + else + link = &parent->rb.rb_right; + } + + node->__subtree_last = last; + rb_link_node(&node->rb, rb_parent, link); + rb_insert_augmented(&node->rb, root, &augment_callbacks); +} + +void interval_tree_remove(struct interval_tree_node *node, + struct rb_root *root) +{ + rb_erase_augmented(&node->rb, root, &augment_callbacks); +} + +/* + * Iterate over intervals intersecting [start;last] + * + * Note that a node's interval intersects [start;last] iff: + * Cond1: node->start <= last + * and + * Cond2: start <= node->last + */ + +static struct interval_tree_node * +subtree_search(struct interval_tree_node *node, + unsigned long start, unsigned long last) +{ + while (true) { + /* + * Loop invariant: start <= node->__subtree_last + * (Cond2 is satisfied by one of the subtree nodes) + */ + if (node->rb.rb_left) { + struct interval_tree_node *left = + rb_entry(node->rb.rb_left, + struct interval_tree_node, rb); + if (start <= left->__subtree_last) { + /* + * Some nodes in left subtree satisfy Cond2. + * Iterate to find the leftmost such node N. + * If it also satisfies Cond1, that's the match + * we are looking for. 
Otherwise, there is no + * matching interval as nodes to the right of N + * can't satisfy Cond1 either. + */ + node = left; + continue; + } + } + if (node->start <= last) { /* Cond1 */ + if (start <= node->last) /* Cond2 */ + return node; /* node is leftmost match */ + if (node->rb.rb_right) { + node = rb_entry(node->rb.rb_right, + struct interval_tree_node, rb); + if (start <= node->__subtree_last) + continue; + } + } + return NULL; /* No match */ + } +} + +struct interval_tree_node * +interval_tree_iter_first(struct rb_root *root, + unsigned long start, unsigned long last) +{ + struct interval_tree_node *node; + + if (!root->rb_node) + return NULL; + node = rb_entry(root->rb_node, struct interval_tree_node, rb); + if (node->__subtree_last < start) + return NULL; + return subtree_search(node, start, last); +} + +struct interval_tree_node * +interval_tree_iter_next(struct interval_tree_node *node, + unsigned long start, unsigned long last) +{ + struct rb_node *rb = node->rb.rb_right, *prev; + + while (true) { + /* + * Loop invariants: + * Cond1: node->start <= last + * rb == node->rb.rb_right + * + * First, search right subtree if suitable + */ + if (rb) { + struct interval_tree_node *right = + rb_entry(rb, struct interval_tree_node, rb); + if (start <= right->__subtree_last) + return subtree_search(right, start, last); + } + + /* Move up the tree until we come from a node's left child */ + do { + rb = rb_parent(&node->rb); + if (!rb) + return NULL; + prev = &node->rb; + node = rb_entry(rb, struct interval_tree_node, rb); + rb = node->rb.rb_right; + } while (prev == rb); + + /* Check if the node intersects [start;last] */ + if (last < node->start) /* !Cond1 */ + return NULL; + else if (start <= node->last) /* Cond2 */ + return node; + } +} diff --git a/lib/interval_tree_test_main.c b/lib/interval_tree_test_main.c new file mode 100644 index 000000000000..b25903987f7a --- /dev/null +++ b/lib/interval_tree_test_main.c @@ -0,0 +1,105 @@ +#include +#include +#include +#include + +#define NODES 100 +#define PERF_LOOPS 100000 +#define SEARCHES 100 +#define SEARCH_LOOPS 10000 + +static struct rb_root root = RB_ROOT; +static struct interval_tree_node nodes[NODES]; +static u32 queries[SEARCHES]; + +static struct rnd_state rnd; + +static inline unsigned long +search(unsigned long query, struct rb_root *root) +{ + struct interval_tree_node *node; + unsigned long results = 0; + + for (node = interval_tree_iter_first(root, query, query); node; + node = interval_tree_iter_next(node, query, query)) + results++; + return results; +} + +static void init(void) +{ + int i; + for (i = 0; i < NODES; i++) { + u32 a = prandom32(&rnd), b = prandom32(&rnd); + if (a <= b) { + nodes[i].start = a; + nodes[i].last = b; + } else { + nodes[i].start = b; + nodes[i].last = a; + } + } + for (i = 0; i < SEARCHES; i++) + queries[i] = prandom32(&rnd); +} + +static int interval_tree_test_init(void) +{ + int i, j; + unsigned long results; + cycles_t time1, time2, time; + + printk(KERN_ALERT "interval tree insert/remove"); + + prandom32_seed(&rnd, 3141592653589793238ULL); + init(); + + time1 = get_cycles(); + + for (i = 0; i < PERF_LOOPS; i++) { + for (j = 0; j < NODES; j++) + interval_tree_insert(nodes + j, &root); + for (j = 0; j < NODES; j++) + interval_tree_remove(nodes + j, &root); + } + + time2 = get_cycles(); + time = time2 - time1; + + time = div_u64(time, PERF_LOOPS); + printk(" -> %llu cycles\n", (unsigned long long)time); + + printk(KERN_ALERT "interval tree search"); + + for (j = 0; j < NODES; j++) + 
interval_tree_insert(nodes + j, &root); + + time1 = get_cycles(); + + results = 0; + for (i = 0; i < SEARCH_LOOPS; i++) + for (j = 0; j < SEARCHES; j++) + results += search(queries[j], &root); + + time2 = get_cycles(); + time = time2 - time1; + + time = div_u64(time, SEARCH_LOOPS); + results = div_u64(results, SEARCH_LOOPS); + printk(" -> %llu cycles (%lu results)\n", + (unsigned long long)time, results); + + return -EAGAIN; /* Fail will directly unload the module */ +} + +static void interval_tree_test_exit(void) +{ + printk(KERN_ALERT "test exit\n"); +} + +module_init(interval_tree_test_init) +module_exit(interval_tree_test_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michel Lespinasse"); +MODULE_DESCRIPTION("Interval Tree test"); diff --git a/lib/prio_tree.c b/lib/prio_tree.c index 8d443af03b4c..4e0d2edff2b4 100644 --- a/lib/prio_tree.c +++ b/lib/prio_tree.c @@ -14,6 +14,7 @@ #include #include #include +#include /* * A clever mix of heap and radix trees forms a radix priority search tree (PST) @@ -241,6 +242,7 @@ struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root, BUG(); return NULL; } +EXPORT_SYMBOL(prio_tree_insert); /* * Remove a prio_tree_node @node from a radix priority search tree @root. The @@ -290,6 +292,7 @@ void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node) while (cur != node) cur = prio_tree_replace(root, cur->parent, cur); } +EXPORT_SYMBOL(prio_tree_remove); static void iter_walk_down(struct prio_tree_iter *iter) { @@ -464,3 +467,4 @@ repeat: goto repeat; } +EXPORT_SYMBOL(prio_tree_next); diff --git a/lib/prio_tree_test.c b/lib/prio_tree_test.c new file mode 100644 index 000000000000..c26084ddc6a4 --- /dev/null +++ b/lib/prio_tree_test.c @@ -0,0 +1,106 @@ +#include +#include +#include +#include + +#define NODES 100 +#define PERF_LOOPS 100000 +#define SEARCHES 100 +#define SEARCH_LOOPS 10000 + +static struct prio_tree_root root; +static struct prio_tree_node nodes[NODES]; +static u32 queries[SEARCHES]; + +static struct rnd_state rnd; + +static inline unsigned long +search(unsigned long query, struct prio_tree_root *root) +{ + struct prio_tree_iter iter; + unsigned long results = 0; + + prio_tree_iter_init(&iter, root, query, query); + while (prio_tree_next(&iter)) + results++; + return results; +} + +static void init(void) +{ + int i; + for (i = 0; i < NODES; i++) { + u32 a = prandom32(&rnd), b = prandom32(&rnd); + if (a <= b) { + nodes[i].start = a; + nodes[i].last = b; + } else { + nodes[i].start = b; + nodes[i].last = a; + } + } + for (i = 0; i < SEARCHES; i++) + queries[i] = prandom32(&rnd); +} + +static int prio_tree_test_init(void) +{ + int i, j; + unsigned long results; + cycles_t time1, time2, time; + + printk(KERN_ALERT "prio tree insert/remove"); + + prandom32_seed(&rnd, 3141592653589793238ULL); + INIT_PRIO_TREE_ROOT(&root); + init(); + + time1 = get_cycles(); + + for (i = 0; i < PERF_LOOPS; i++) { + for (j = 0; j < NODES; j++) + prio_tree_insert(&root, nodes + j); + for (j = 0; j < NODES; j++) + prio_tree_remove(&root, nodes + j); + } + + time2 = get_cycles(); + time = time2 - time1; + + time = div_u64(time, PERF_LOOPS); + printk(" -> %llu cycles\n", (unsigned long long)time); + + printk(KERN_ALERT "prio tree search"); + + for (j = 0; j < NODES; j++) + prio_tree_insert(&root, nodes + j); + + time1 = get_cycles(); + + results = 0; + for (i = 0; i < SEARCH_LOOPS; i++) + for (j = 0; j < SEARCHES; j++) + results += search(queries[j], &root); + + time2 = get_cycles(); + time = time2 - time1; + + time = div_u64(time, 
SEARCH_LOOPS); + results = div_u64(results, SEARCH_LOOPS); + printk(" -> %llu cycles (%lu results)\n", + (unsigned long long)time, results); + + return -EAGAIN; /* Fail will directly unload the module */ +} + +static void prio_tree_test_exit(void) +{ + printk(KERN_ALERT "test exit\n"); +} + +module_init(prio_tree_test_init) +module_exit(prio_tree_test_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michel Lespinasse"); +MODULE_DESCRIPTION("Prio Tree test"); From 6b2dbba8b6ac4df26f72eda1e5ea7bab9f950e08 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:25 -0700 Subject: [PATCH 080/160] mm: replace vma prio_tree with an interval tree Implement an interval tree as a replacement for the VMA prio_tree. The algorithms are similar to lib/interval_tree.c; however that code can't be directly reused as the interval endpoints are not explicitly stored in the VMA. So instead, the common algorithm is moved into a template and the details (node type, how to get interval endpoints from the node, etc) are filled in using the C preprocessor. Once the interval tree functions are available, using them as a replacement to the VMA prio tree is a relatively simple, mechanical job. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/fault-armv.c | 3 +- arch/arm/mm/flush.c | 3 +- arch/parisc/kernel/cache.c | 3 +- arch/x86/mm/hugetlbpage.c | 3 +- fs/hugetlbfs/inode.c | 9 +- fs/inode.c | 2 +- include/linux/fs.h | 6 +- include/linux/interval_tree_tmpl.h | 215 +++++++++++++++++++++++++++++ include/linux/mm.h | 28 ++-- include/linux/mm_types.h | 14 +- kernel/events/uprobes.c | 3 +- kernel/fork.c | 2 +- lib/interval_tree.c | 164 ++-------------------- lib/prio_tree.c | 19 +-- mm/Makefile | 4 +- mm/filemap_xip.c | 3 +- mm/fremap.c | 2 +- mm/hugetlb.c | 3 +- mm/interval_tree.c | 61 ++++++++ mm/memory-failure.c | 3 +- mm/memory.c | 9 +- mm/mmap.c | 22 +-- mm/nommu.c | 12 +- mm/prio_tree.c | 208 ---------------------------- mm/rmap.c | 18 +-- 25 files changed, 355 insertions(+), 464 deletions(-) create mode 100644 include/linux/interval_tree_tmpl.h create mode 100644 mm/interval_tree.c delete mode 100644 mm/prio_tree.c diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 7599e2625c7d..2a5907b5c8d2 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -134,7 +134,6 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *mpnt; - struct prio_tree_iter iter; unsigned long offset; pgoff_t pgoff; int aliases = 0; @@ -147,7 +146,7 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, * cache coherency. */ flush_dcache_mmap_lock(mapping); - vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { /* * If this VMA is not in our MM, we can ignore it. 
* Note that we intentionally mask out the VMA diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 40ca11ed6e5f..1c8f7f564175 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -196,7 +196,6 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p { struct mm_struct *mm = current->active_mm; struct vm_area_struct *mpnt; - struct prio_tree_iter iter; pgoff_t pgoff; /* @@ -208,7 +207,7 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); flush_dcache_mmap_lock(mapping); - vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { unsigned long offset; /* diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 9d181890a7e3..48e16dc20102 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -276,7 +276,6 @@ void flush_dcache_page(struct page *page) { struct address_space *mapping = page_mapping(page); struct vm_area_struct *mpnt; - struct prio_tree_iter iter; unsigned long offset; unsigned long addr, old_addr = 0; pgoff_t pgoff; @@ -299,7 +298,7 @@ void flush_dcache_page(struct page *page) * to flush one address here for them all to become coherent */ flush_dcache_mmap_lock(mapping); - vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; addr = mpnt->vm_start + offset; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index b91e48512425..937bff5cdaa7 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -71,7 +71,6 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - struct prio_tree_iter iter; struct vm_area_struct *svma; unsigned long saddr; pte_t *spte = NULL; @@ -81,7 +80,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) return (pte_t *)pmd_alloc(mm, pud, addr); mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { + vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0a0ab8e21b19..c5bc355d8243 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -397,17 +397,16 @@ static void hugetlbfs_evict_inode(struct inode *inode) } static inline void -hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) +hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) { struct vm_area_struct *vma; - struct prio_tree_iter iter; - vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { + vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { unsigned long v_offset; /* * Can the expression below overflow on 32-bit arches? - * No, because the prio_tree returns us only those vmas + * No, because the interval tree returns us only those vmas * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB. 
*/ @@ -432,7 +431,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_size_write(inode, offset); mutex_lock(&mapping->i_mmap_mutex); - if (!prio_tree_empty(&mapping->i_mmap)) + if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); mutex_unlock(&mapping->i_mmap_mutex); truncate_hugepages(inode, offset); diff --git a/fs/inode.c b/fs/inode.c index ac8d904b3f16..b03c71957246 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -348,7 +348,7 @@ void address_space_init_once(struct address_space *mapping) mutex_init(&mapping->i_mmap_mutex); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + mapping->i_mmap = RB_ROOT; INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); } EXPORT_SYMBOL(address_space_init_once); diff --git a/include/linux/fs.h b/include/linux/fs.h index 5a8a273d5b2f..c617ed024df8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -401,7 +401,7 @@ struct inodes_stat_t { #include #include #include -#include +#include #include #include #include @@ -669,7 +669,7 @@ struct address_space { struct radix_tree_root page_tree; /* radix tree of all pages */ spinlock_t tree_lock; /* and lock protecting it */ unsigned int i_mmap_writable;/* count VM_SHARED mappings */ - struct prio_tree_root i_mmap; /* tree of private and shared mappings */ + struct rb_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct mutex i_mmap_mutex; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ @@ -741,7 +741,7 @@ int mapping_tagged(struct address_space *mapping, int tag); */ static inline int mapping_mapped(struct address_space *mapping) { - return !prio_tree_empty(&mapping->i_mmap) || + return !RB_EMPTY_ROOT(&mapping->i_mmap) || !list_empty(&mapping->i_mmap_nonlinear); } diff --git a/include/linux/interval_tree_tmpl.h b/include/linux/interval_tree_tmpl.h new file mode 100644 index 000000000000..c65deda31413 --- /dev/null +++ b/include/linux/interval_tree_tmpl.h @@ -0,0 +1,215 @@ +/* + Interval Trees + (C) 2012 Michel Lespinasse + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + include/linux/interval_tree_tmpl.h +*/ + +/* + * Template for implementing interval trees + * + * ITSTRUCT: struct type of the interval tree nodes + * ITRB: name of struct rb_node field within ITSTRUCT + * ITTYPE: type of the interval endpoints + * ITSUBTREE: name of ITTYPE field within ITSTRUCT holding last-in-subtree + * ITSTART(n): start endpoint of ITSTRUCT node n + * ITLAST(n): last endpoing of ITSTRUCT node n + * ITSTATIC: 'static' or empty + * ITPREFIX: prefix to use for the inline tree definitions + */ + +/* IT(name) -> ITPREFIX_name */ +#define _ITNAME(prefix, name) prefix ## _ ## name +#define ITNAME(prefix, name) _ITNAME(prefix, name) +#define IT(name) ITNAME(ITPREFIX, name) + +/* Callbacks for augmented rbtree insert and remove */ + +static inline ITTYPE IT(compute_subtree_last)(ITSTRUCT *node) +{ + ITTYPE max = ITLAST(node), subtree_last; + if (node->ITRB.rb_left) { + subtree_last = rb_entry(node->ITRB.rb_left, + ITSTRUCT, ITRB)->ITSUBTREE; + if (max < subtree_last) + max = subtree_last; + } + if (node->ITRB.rb_right) { + subtree_last = rb_entry(node->ITRB.rb_right, + ITSTRUCT, ITRB)->ITSUBTREE; + if (max < subtree_last) + max = subtree_last; + } + return max; +} + +static void IT(augment_propagate)(struct rb_node *rb, struct rb_node *stop) +{ + while (rb != stop) { + ITSTRUCT *node = rb_entry(rb, ITSTRUCT, ITRB); + ITTYPE subtree_last = IT(compute_subtree_last)(node); + if (node->ITSUBTREE == subtree_last) + break; + node->ITSUBTREE = subtree_last; + rb = rb_parent(&node->ITRB); + } +} + +static void IT(augment_copy)(struct rb_node *rb_old, struct rb_node *rb_new) +{ + ITSTRUCT *old = rb_entry(rb_old, ITSTRUCT, ITRB); + ITSTRUCT *new = rb_entry(rb_new, ITSTRUCT, ITRB); + + new->ITSUBTREE = old->ITSUBTREE; +} + +static void IT(augment_rotate)(struct rb_node *rb_old, struct rb_node *rb_new) +{ + ITSTRUCT *old = rb_entry(rb_old, ITSTRUCT, ITRB); + ITSTRUCT *new = rb_entry(rb_new, ITSTRUCT, ITRB); + + new->ITSUBTREE = old->ITSUBTREE; + old->ITSUBTREE = IT(compute_subtree_last)(old); +} + +static const struct rb_augment_callbacks IT(augment_callbacks) = { + IT(augment_propagate), IT(augment_copy), IT(augment_rotate) +}; + +/* Insert / remove interval nodes from the tree */ + +ITSTATIC void IT(insert)(ITSTRUCT *node, struct rb_root *root) +{ + struct rb_node **link = &root->rb_node, *rb_parent = NULL; + ITTYPE start = ITSTART(node), last = ITLAST(node); + ITSTRUCT *parent; + + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, ITSTRUCT, ITRB); + if (parent->ITSUBTREE < last) + parent->ITSUBTREE = last; + if (start < ITSTART(parent)) + link = &parent->ITRB.rb_left; + else + link = &parent->ITRB.rb_right; + } + + node->ITSUBTREE = last; + rb_link_node(&node->ITRB, rb_parent, link); + rb_insert_augmented(&node->ITRB, root, &IT(augment_callbacks)); +} + +ITSTATIC void IT(remove)(ITSTRUCT *node, struct rb_root *root) +{ + rb_erase_augmented(&node->ITRB, root, &IT(augment_callbacks)); +} + +/* + * Iterate over intervals intersecting [start;last] + * + * Note that a node's interval intersects [start;last] iff: + * Cond1: ITSTART(node) <= last + * and + * Cond2: start <= ITLAST(node) + */ + +static ITSTRUCT *IT(subtree_search)(ITSTRUCT *node, ITTYPE start, ITTYPE last) +{ + while (true) { + /* + * Loop invariant: start <= node->ITSUBTREE + * (Cond2 is satisfied by 
one of the subtree nodes) + */ + if (node->ITRB.rb_left) { + ITSTRUCT *left = rb_entry(node->ITRB.rb_left, + ITSTRUCT, ITRB); + if (start <= left->ITSUBTREE) { + /* + * Some nodes in left subtree satisfy Cond2. + * Iterate to find the leftmost such node N. + * If it also satisfies Cond1, that's the match + * we are looking for. Otherwise, there is no + * matching interval as nodes to the right of N + * can't satisfy Cond1 either. + */ + node = left; + continue; + } + } + if (ITSTART(node) <= last) { /* Cond1 */ + if (start <= ITLAST(node)) /* Cond2 */ + return node; /* node is leftmost match */ + if (node->ITRB.rb_right) { + node = rb_entry(node->ITRB.rb_right, + ITSTRUCT, ITRB); + if (start <= node->ITSUBTREE) + continue; + } + } + return NULL; /* No match */ + } +} + +ITSTATIC ITSTRUCT *IT(iter_first)(struct rb_root *root, + ITTYPE start, ITTYPE last) +{ + ITSTRUCT *node; + + if (!root->rb_node) + return NULL; + node = rb_entry(root->rb_node, ITSTRUCT, ITRB); + if (node->ITSUBTREE < start) + return NULL; + return IT(subtree_search)(node, start, last); +} + +ITSTATIC ITSTRUCT *IT(iter_next)(ITSTRUCT *node, ITTYPE start, ITTYPE last) +{ + struct rb_node *rb = node->ITRB.rb_right, *prev; + + while (true) { + /* + * Loop invariants: + * Cond1: ITSTART(node) <= last + * rb == node->ITRB.rb_right + * + * First, search right subtree if suitable + */ + if (rb) { + ITSTRUCT *right = rb_entry(rb, ITSTRUCT, ITRB); + if (start <= right->ITSUBTREE) + return IT(subtree_search)(right, start, last); + } + + /* Move up the tree until we come from a node's left child */ + do { + rb = rb_parent(&node->ITRB); + if (!rb) + return NULL; + prev = &node->ITRB; + node = rb_entry(rb, ITSTRUCT, ITRB); + rb = node->ITRB.rb_right; + } while (prev == rb); + + /* Check if the node intersects [start;last] */ + if (last < ITSTART(node)) /* !Cond1 */ + return NULL; + else if (start <= ITLAST(node)) /* Cond2 */ + return node; + } +} diff --git a/include/linux/mm.h b/include/linux/mm.h index 5ddb11b2b4bb..0f671ef09eba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -1355,22 +1354,27 @@ extern void zone_pcp_reset(struct zone *zone); extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); -/* prio_tree.c */ -void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); -void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); -void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *); -struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, - struct prio_tree_iter *iter); +/* interval_tree.c */ +void vma_interval_tree_add(struct vm_area_struct *vma, + struct vm_area_struct *old, + struct address_space *mapping); +void vma_interval_tree_insert(struct vm_area_struct *node, + struct rb_root *root); +void vma_interval_tree_remove(struct vm_area_struct *node, + struct rb_root *root); +struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root, + unsigned long start, unsigned long last); +struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, + unsigned long start, unsigned long last); -#define vma_prio_tree_foreach(vma, iter, root, begin, end) \ - for (prio_tree_iter_init(iter, root, begin, end), vma = NULL; \ - (vma = vma_prio_tree_next(vma, iter)); ) +#define vma_interval_tree_foreach(vma, root, start, last) \ + for (vma = vma_interval_tree_iter_first(root, start, last); \ 
+ vma; vma = vma_interval_tree_iter_next(vma, start, last)) static inline void vma_nonlinear_insert(struct vm_area_struct *vma, struct list_head *list) { - vma->shared.vm_set.parent = NULL; - list_add_tail(&vma->shared.vm_set.list, list); + list_add_tail(&vma->shared.nonlinear, list); } /* mmap.c */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a57a43f5ca7c..31f8a3af7d94 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -240,18 +239,15 @@ struct vm_area_struct { /* * For areas with an address space and backing store, - * linkage into the address_space->i_mmap prio tree, or - * linkage to the list of like vmas hanging off its node, or + * linkage into the address_space->i_mmap interval tree, or * linkage of vma in the address_space->i_mmap_nonlinear list. */ union { struct { - struct list_head list; - void *parent; /* aligns with prio_tree_node parent */ - struct vm_area_struct *head; - } vm_set; - - struct raw_prio_tree_node prio_tree_node; + struct rb_node rb; + unsigned long rb_subtree_last; + } linear; + struct list_head nonlinear; } shared; /* diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 912ef48d28ab..1d9c0a985960 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -735,7 +735,6 @@ static struct map_info * build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { unsigned long pgoff = offset >> PAGE_SHIFT; - struct prio_tree_iter iter; struct vm_area_struct *vma; struct map_info *curr = NULL; struct map_info *prev = NULL; @@ -744,7 +743,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) again: mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; diff --git a/kernel/fork.c b/kernel/fork.c index 972762e01024..90dace52715e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -423,7 +423,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - vma_prio_tree_add(tmp, mpnt); + vma_interval_tree_add(tmp, mpnt, mapping); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } diff --git a/lib/interval_tree.c b/lib/interval_tree.c index 6fd540b1e499..77a793e0644b 100644 --- a/lib/interval_tree.c +++ b/lib/interval_tree.c @@ -1,159 +1,13 @@ #include #include -/* Callbacks for augmented rbtree insert and remove */ +#define ITSTRUCT struct interval_tree_node +#define ITRB rb +#define ITTYPE unsigned long +#define ITSUBTREE __subtree_last +#define ITSTART(n) ((n)->start) +#define ITLAST(n) ((n)->last) +#define ITSTATIC +#define ITPREFIX interval_tree -static inline unsigned long -compute_subtree_last(struct interval_tree_node *node) -{ - unsigned long max = node->last, subtree_last; - if (node->rb.rb_left) { - subtree_last = rb_entry(node->rb.rb_left, - struct interval_tree_node, rb)->__subtree_last; - if (max < subtree_last) - max = subtree_last; - } - if (node->rb.rb_right) { - subtree_last = rb_entry(node->rb.rb_right, - struct interval_tree_node, rb)->__subtree_last; - if (max < subtree_last) - max = subtree_last; - } - return max; -} - -RB_DECLARE_CALLBACKS(static, augment_callbacks, struct interval_tree_node, rb, - unsigned long, __subtree_last, 
compute_subtree_last) - -/* Insert / remove interval nodes from the tree */ - -void interval_tree_insert(struct interval_tree_node *node, - struct rb_root *root) -{ - struct rb_node **link = &root->rb_node, *rb_parent = NULL; - unsigned long start = node->start, last = node->last; - struct interval_tree_node *parent; - - while (*link) { - rb_parent = *link; - parent = rb_entry(rb_parent, struct interval_tree_node, rb); - if (parent->__subtree_last < last) - parent->__subtree_last = last; - if (start < parent->start) - link = &parent->rb.rb_left; - else - link = &parent->rb.rb_right; - } - - node->__subtree_last = last; - rb_link_node(&node->rb, rb_parent, link); - rb_insert_augmented(&node->rb, root, &augment_callbacks); -} - -void interval_tree_remove(struct interval_tree_node *node, - struct rb_root *root) -{ - rb_erase_augmented(&node->rb, root, &augment_callbacks); -} - -/* - * Iterate over intervals intersecting [start;last] - * - * Note that a node's interval intersects [start;last] iff: - * Cond1: node->start <= last - * and - * Cond2: start <= node->last - */ - -static struct interval_tree_node * -subtree_search(struct interval_tree_node *node, - unsigned long start, unsigned long last) -{ - while (true) { - /* - * Loop invariant: start <= node->__subtree_last - * (Cond2 is satisfied by one of the subtree nodes) - */ - if (node->rb.rb_left) { - struct interval_tree_node *left = - rb_entry(node->rb.rb_left, - struct interval_tree_node, rb); - if (start <= left->__subtree_last) { - /* - * Some nodes in left subtree satisfy Cond2. - * Iterate to find the leftmost such node N. - * If it also satisfies Cond1, that's the match - * we are looking for. Otherwise, there is no - * matching interval as nodes to the right of N - * can't satisfy Cond1 either. 
- */ - node = left; - continue; - } - } - if (node->start <= last) { /* Cond1 */ - if (start <= node->last) /* Cond2 */ - return node; /* node is leftmost match */ - if (node->rb.rb_right) { - node = rb_entry(node->rb.rb_right, - struct interval_tree_node, rb); - if (start <= node->__subtree_last) - continue; - } - } - return NULL; /* No match */ - } -} - -struct interval_tree_node * -interval_tree_iter_first(struct rb_root *root, - unsigned long start, unsigned long last) -{ - struct interval_tree_node *node; - - if (!root->rb_node) - return NULL; - node = rb_entry(root->rb_node, struct interval_tree_node, rb); - if (node->__subtree_last < start) - return NULL; - return subtree_search(node, start, last); -} - -struct interval_tree_node * -interval_tree_iter_next(struct interval_tree_node *node, - unsigned long start, unsigned long last) -{ - struct rb_node *rb = node->rb.rb_right, *prev; - - while (true) { - /* - * Loop invariants: - * Cond1: node->start <= last - * rb == node->rb.rb_right - * - * First, search right subtree if suitable - */ - if (rb) { - struct interval_tree_node *right = - rb_entry(rb, struct interval_tree_node, rb); - if (start <= right->__subtree_last) - return subtree_search(right, start, last); - } - - /* Move up the tree until we come from a node's left child */ - do { - rb = rb_parent(&node->rb); - if (!rb) - return NULL; - prev = &node->rb; - node = rb_entry(rb, struct interval_tree_node, rb); - rb = node->rb.rb_right; - } while (prev == rb); - - /* Check if the node intersects [start;last] */ - if (last < node->start) /* !Cond1 */ - return NULL; - else if (start <= node->last) /* Cond2 */ - return node; - } -} +#include diff --git a/lib/prio_tree.c b/lib/prio_tree.c index 4e0d2edff2b4..bba37148c15e 100644 --- a/lib/prio_tree.c +++ b/lib/prio_tree.c @@ -44,27 +44,12 @@ * The following macros are used for implementing prio_tree for i_mmap */ -#define RADIX_INDEX(vma) ((vma)->vm_pgoff) -#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) -/* avoid overflow */ -#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) - - static void get_index(const struct prio_tree_root *root, const struct prio_tree_node *node, unsigned long *radix, unsigned long *heap) { - if (root->raw) { - struct vm_area_struct *vma = prio_tree_entry( - node, struct vm_area_struct, shared.prio_tree_node); - - *radix = RADIX_INDEX(vma); - *heap = HEAP_INDEX(vma); - } - else { - *radix = node->start; - *heap = node->last; - } + *radix = node->start; + *heap = node->last; } static unsigned long index_bits_to_maxindex[BITS_PER_LONG]; diff --git a/mm/Makefile b/mm/Makefile index 92753e2d82da..6b025f80af34 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -14,9 +14,9 @@ endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ - prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ + util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o $(mmu-y) + compaction.o interval_tree.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 91750227a191..a52daee11d3f 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping, { struct vm_area_struct *vma; struct mm_struct *mm; - struct prio_tree_iter iter; unsigned long address; pte_t *pte; pte_t pteval; @@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping, retry: mutex_lock(&mapping->i_mmap_mutex); - 
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); diff --git a/mm/fremap.c b/mm/fremap.c index 3d731a498788..3899a86851ce 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -214,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f1bb534254f6..c9b40e3a9936 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2474,7 +2474,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; - struct prio_tree_iter iter; pgoff_t pgoff; /* @@ -2491,7 +2490,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * __unmap_hugepage_range() is called as the lock is already held */ mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) continue; diff --git a/mm/interval_tree.c b/mm/interval_tree.c new file mode 100644 index 000000000000..7dc565660e56 --- /dev/null +++ b/mm/interval_tree.c @@ -0,0 +1,61 @@ +/* + * mm/interval_tree.c - interval tree for mapping->i_mmap + * + * Copyright (C) 2012, Michel Lespinasse + * + * This file is released under the GPL v2. 
+ */ + +#include +#include + +#define ITSTRUCT struct vm_area_struct +#define ITRB shared.linear.rb +#define ITTYPE unsigned long +#define ITSUBTREE shared.linear.rb_subtree_last +#define ITSTART(n) ((n)->vm_pgoff) +#define ITLAST(n) ((n)->vm_pgoff + \ + (((n)->vm_end - (n)->vm_start) >> PAGE_SHIFT) - 1) +#define ITSTATIC +#define ITPREFIX vma_interval_tree + +#include + +/* Insert old immediately after vma in the interval tree */ +void vma_interval_tree_add(struct vm_area_struct *vma, + struct vm_area_struct *old, + struct address_space *mapping) +{ + struct rb_node **link; + struct vm_area_struct *parent; + unsigned long last; + + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + list_add(&vma->shared.nonlinear, &old->shared.nonlinear); + return; + } + + last = ITLAST(vma); + + if (!old->shared.linear.rb.rb_right) { + parent = old; + link = &old->shared.linear.rb.rb_right; + } else { + parent = rb_entry(old->shared.linear.rb.rb_right, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + while (parent->shared.linear.rb.rb_left) { + parent = rb_entry(parent->shared.linear.rb.rb_left, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + } + link = &parent->shared.linear.rb.rb_left; + } + + vma->shared.linear.rb_subtree_last = last; + rb_link_node(&vma->shared.linear.rb, &parent->shared.linear.rb, link); + rb_insert_augmented(&vma->shared.linear.rb, &mapping->i_mmap, + &vma_interval_tree_augment_callbacks); +} diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a6e2141a6610..c38a6257d082 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -431,7 +431,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, { struct vm_area_struct *vma; struct task_struct *tsk; - struct prio_tree_iter iter; struct address_space *mapping = page->mapping; mutex_lock(&mapping->i_mmap_mutex); @@ -442,7 +441,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, if (!task_early_kill(tsk)) continue; - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* * Send early kill signal to tasks where a vma covers diff --git a/mm/memory.c b/mm/memory.c index e09c04813186..d205e4381a34 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2801,14 +2801,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } -static inline void unmap_mapping_range_tree(struct prio_tree_root *root, +static inline void unmap_mapping_range_tree(struct rb_root *root, struct zap_details *details) { struct vm_area_struct *vma; - struct prio_tree_iter iter; pgoff_t vba, vea, zba, zea; - vma_prio_tree_foreach(vma, &iter, root, + vma_interval_tree_foreach(vma, root, details->first_index, details->last_index) { vba = vma->vm_pgoff; @@ -2839,7 +2838,7 @@ static inline void unmap_mapping_range_list(struct list_head *head, * across *all* the pages in each nonlinear VMA, not just the pages * whose virtual address lies outside the file truncation point. 
*/ - list_for_each_entry(vma, head, shared.vm_set.list) { + list_for_each_entry(vma, head, shared.nonlinear) { details->nonlinear_vma = vma; unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); } @@ -2883,7 +2882,7 @@ void unmap_mapping_range(struct address_space *mapping, mutex_lock(&mapping->i_mmap_mutex); - if (unlikely(!prio_tree_empty(&mapping->i_mmap))) + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); diff --git a/mm/mmap.c b/mm/mmap.c index e3c365ff1b6a..5ac533f88e99 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -199,14 +199,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.vm_set.list); + list_del_init(&vma->shared.nonlinear); else - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* - * Unlink a file-based vm structure from its prio_tree, to hide + * Unlink a file-based vm structure from its interval tree, to hide * vma from rmap and vmtruncate before freeing its page tables. */ void unlink_file_vma(struct vm_area_struct *vma) @@ -411,7 +411,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (unlikely(vma->vm_flags & VM_NONLINEAR)) vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); else - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } } @@ -449,7 +449,7 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, /* * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and rbtree. It has already been inserted into the prio_tree. + * mm's list and rbtree. It has already been inserted into the interval tree. */ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { @@ -491,7 +491,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *next = vma->vm_next; struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; - struct prio_tree_root *root = NULL; + struct rb_root *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; long adjust_next = 0; @@ -554,7 +554,7 @@ again: remove_next = 1 + (end > next->vm_end); mutex_lock(&mapping->i_mmap_mutex); if (insert) { /* - * Put into prio_tree now, so instantiated pages + * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. 
@@ -582,9 +582,9 @@ again: remove_next = 1 + (end > next->vm_end); if (root) { flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, root); + vma_interval_tree_remove(vma, root); if (adjust_next) - vma_prio_tree_remove(next, root); + vma_interval_tree_remove(next, root); } vma->vm_start = start; @@ -597,8 +597,8 @@ again: remove_next = 1 + (end > next->vm_end); if (root) { if (adjust_next) - vma_prio_tree_insert(next, root); - vma_prio_tree_insert(vma, root); + vma_interval_tree_insert(next, root); + vma_interval_tree_insert(vma, root); flush_dcache_mmap_unlock(mapping); } diff --git a/mm/nommu.c b/mm/nommu.c index 12e84e69dd06..45131b41bcdb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -2044,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, size_t newsize) { struct vm_area_struct *vma; - struct prio_tree_iter iter; struct vm_region *region; pgoff_t low, high; size_t r_size, r_top; @@ -2056,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, mutex_lock(&inode->i_mapping->i_mmap_mutex); /* search for VMAs that fall within the dead zone */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - low, high) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { /* found one - only interested if it's shared out of the page * cache */ if (vma->vm_flags & VM_SHARED) { @@ -2073,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, * we don't check for any regions that start beyond the EOF as there * shouldn't be any */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - 0, ULONG_MAX) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, + 0, ULONG_MAX) { if (!(vma->vm_flags & VM_SHARED)) continue; diff --git a/mm/prio_tree.c b/mm/prio_tree.c deleted file mode 100644 index 799dcfd7cd8c..000000000000 --- a/mm/prio_tree.c +++ /dev/null @@ -1,208 +0,0 @@ -/* - * mm/prio_tree.c - priority search tree for mapping->i_mmap - * - * Copyright (C) 2004, Rajesh Venkatasubramanian - * - * This file is released under the GPL v2. - * - * Based on the radix priority search tree proposed by Edward M. McCreight - * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 - * - * 02Feb2004 Initial version - */ - -#include -#include -#include - -/* - * See lib/prio_tree.c for details on the general radix priority search tree - * code. - */ - -/* - * The following #defines are mirrored from lib/prio_tree.c. They're only used - * for debugging, and should be removed (along with the debugging code using - * them) when switching also VMAs to the regular prio_tree code. 
- */ - -#define RADIX_INDEX(vma) ((vma)->vm_pgoff) -#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) -/* avoid overflow */ -#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) - -/* - * Radix priority search tree for address_space->i_mmap - * - * For each vma that map a unique set of file pages i.e., unique [radix_index, - * heap_index] value, we have a corresponding priority search tree node. If - * multiple vmas have identical [radix_index, heap_index] value, then one of - * them is used as a tree node and others are stored in a vm_set list. The tree - * node points to the first vma (head) of the list using vm_set.head. - * - * prio_tree_root - * | - * A vm_set.head - * / \ / - * L R -> H-I-J-K-M-N-O-P-Q-S - * ^ ^ <-- vm_set.list --> - * tree nodes - * - * We need some way to identify whether a vma is a tree node, head of a vm_set - * list, or just a member of a vm_set list. We cannot use vm_flags to store - * such information. The reason is, in the above figure, it is possible that - * vm_flags' of R and H are covered by the different mmap_sems. When R is - * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold - * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. - * That's why some trick involving shared.vm_set.parent is used for identifying - * tree nodes and list head nodes. - * - * vma radix priority search tree node rules: - * - * vma->shared.vm_set.parent != NULL ==> a tree node - * vma->shared.vm_set.head != NULL ==> list of others mapping same range - * vma->shared.vm_set.head == NULL ==> no others map the same range - * - * vma->shared.vm_set.parent == NULL - * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range - * vma->shared.vm_set.head == NULL ==> a list node - */ - -/* - * Add a new vma known to map the same set of pages as the old vma: - * useful for fork's dup_mmap as well as vma_prio_tree_insert below. - * Note that it just happens to work correctly on i_mmap_nonlinear too. 
- */ -void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) -{ - /* Leave these BUG_ONs till prio_tree patch stabilizes */ - BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); - BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); - - vma->shared.vm_set.head = NULL; - vma->shared.vm_set.parent = NULL; - - if (!old->shared.vm_set.parent) - list_add(&vma->shared.vm_set.list, - &old->shared.vm_set.list); - else if (old->shared.vm_set.head) - list_add_tail(&vma->shared.vm_set.list, - &old->shared.vm_set.head->shared.vm_set.list); - else { - INIT_LIST_HEAD(&vma->shared.vm_set.list); - vma->shared.vm_set.head = old; - old->shared.vm_set.head = vma; - } -} - -void vma_prio_tree_insert(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *old; - - vma->shared.vm_set.head = NULL; - - ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node); - if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) { - old = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - vma_prio_tree_add(vma, old); - } -} - -void vma_prio_tree_remove(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct vm_area_struct *node, *head, *new_head; - - if (!vma->shared.vm_set.head) { - if (!vma->shared.vm_set.parent) - list_del_init(&vma->shared.vm_set.list); - else - raw_prio_tree_remove(root, &vma->shared.prio_tree_node); - } else { - /* Leave this BUG_ON till prio_tree patch stabilizes */ - BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); - if (vma->shared.vm_set.parent) { - head = vma->shared.vm_set.head; - if (!list_empty(&head->shared.vm_set.list)) { - new_head = list_entry( - head->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&head->shared.vm_set.list); - } else - new_head = NULL; - - raw_prio_tree_replace(root, &vma->shared.prio_tree_node, - &head->shared.prio_tree_node); - head->shared.vm_set.head = new_head; - if (new_head) - new_head->shared.vm_set.head = head; - - } else { - node = vma->shared.vm_set.head; - if (!list_empty(&vma->shared.vm_set.list)) { - new_head = list_entry( - vma->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&vma->shared.vm_set.list); - node->shared.vm_set.head = new_head; - new_head->shared.vm_set.head = node; - } else - node->shared.vm_set.head = NULL; - } - } -} - -/* - * Helper function to enumerate vmas that map a given file page or a set of - * contiguous file pages. The function returns vmas that at least map a single - * page in the given range of contiguous file pages. 
- */ -struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, - struct prio_tree_iter *iter) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *next; - - if (!vma) { - /* - * First call is with NULL vma - */ - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; - } - - if (vma->shared.vm_set.parent) { - if (vma->shared.vm_set.head) { - next = vma->shared.vm_set.head; - prefetch(next->shared.vm_set.list.next); - return next; - } - } else { - next = list_entry(vma->shared.vm_set.list.next, - struct vm_area_struct, shared.vm_set.list); - if (!next->shared.vm_set.head) { - prefetch(next->shared.vm_set.list.next); - return next; - } - } - - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; -} diff --git a/mm/rmap.c b/mm/rmap.c index 0f3b7cda2a24..7b5b51d25fc5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -820,7 +820,6 @@ static int page_referenced_file(struct page *page, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int referenced = 0; /* @@ -846,7 +845,7 @@ static int page_referenced_file(struct page *page, */ mapcount = page_mapcount(page); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -945,13 +944,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = 0; BUG_ON(PageAnon(page)); mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1547,7 +1545,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; unsigned long cursor; unsigned long max_nl_cursor = 0; @@ -1555,7 +1552,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned int mapcount; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -1576,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) goto out; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) max_nl_cursor = cursor; @@ -1608,7 +1605,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && cursor < vma->vm_end - 
vma->vm_start) { @@ -1631,7 +1628,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) * in locked vmas). Reset cursor on all unreserved nonlinear * vmas, now forgetting on which ones it had fallen behind. */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) vma->vm_private_data = NULL; out: mutex_unlock(&mapping->i_mmap_mutex); @@ -1748,13 +1745,12 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; if (!mapping) return ret; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; From 85d3a316c714197f94e75c1e5b2d37607d66e5de Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:27 -0700 Subject: [PATCH 081/160] kmemleak: use rbtree instead of prio tree kmemleak uses a tree where each node represents an allocated memory object in order to quickly find out what object a given address is part of. However, the objects don't overlap, so rbtrees are a better choice than prio tree for this use. They are both faster and have lower memory overhead. Tested by booting a kernel with kmemleak enabled, loading the kmemleak_test module, and looking for the expected messages. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Acked-by: Catalin Marinas Tested-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 98 ++++++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 48 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 0de83b4541e9..a217cc544060 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -29,7 +29,7 @@ * - kmemleak_lock (rwlock): protects the object_list modifications and * accesses to the object_tree_root. The object_list is the main list * holding the metadata (struct kmemleak_object) for the allocated memory - * blocks. The object_tree_root is a priority search tree used to look-up + * blocks. The object_tree_root is a red black tree used to look-up * metadata based on a pointer to the corresponding memory block. The * kmemleak_object structures are added to the object_list and * object_tree_root in the create_object() function called from the @@ -71,7 +71,7 @@ #include #include #include -#include +#include #include #include #include @@ -132,7 +132,7 @@ struct kmemleak_scan_area { * Structure holding the metadata for each allocated memory block. * Modifications to such objects should be made while holding the * object->lock. Insertions or deletions from object_list, gray_list or - * tree_node are already protected by the corresponding locks or mutex (see + * rb_node are already protected by the corresponding locks or mutex (see * the notes on locking above). These objects are reference-counted * (use_count) and freed using the RCU mechanism. 
*/ @@ -141,7 +141,7 @@ struct kmemleak_object { unsigned long flags; /* object status flags */ struct list_head object_list; struct list_head gray_list; - struct prio_tree_node tree_node; + struct rb_node rb_node; struct rcu_head rcu; /* object_list lockless traversal */ /* object usage count; object freed when use_count == 0 */ atomic_t use_count; @@ -182,9 +182,9 @@ struct kmemleak_object { static LIST_HEAD(object_list); /* the list of gray-colored objects (see color_gray comment below) */ static LIST_HEAD(gray_list); -/* prio search tree for object boundaries */ -static struct prio_tree_root object_tree_root; -/* rw_lock protecting the access to object_list and prio_tree_root */ +/* search tree for object boundaries */ +static struct rb_root object_tree_root = RB_ROOT; +/* rw_lock protecting the access to object_list and object_tree_root */ static DEFINE_RWLOCK(kmemleak_lock); /* allocation caches for kmemleak internal data */ @@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object) trace.entries = object->trace; pr_notice("Object 0x%08lx (size %zu):\n", - object->tree_node.start, object->size); + object->pointer, object->size); pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); @@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object) } /* - * Look-up a memory block metadata (kmemleak_object) in the priority search + * Look-up a memory block metadata (kmemleak_object) in the object search * tree based on a pointer value. If alias is 0, only values pointing to the * beginning of the memory block are allowed. The kmemleak_lock must be held * when calling this function. */ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) { - struct prio_tree_node *node; - struct prio_tree_iter iter; - struct kmemleak_object *object; + struct rb_node *rb = object_tree_root.rb_node; - prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); - node = prio_tree_next(&iter); - if (node) { - object = prio_tree_entry(node, struct kmemleak_object, - tree_node); - if (!alias && object->pointer != ptr) { + while (rb) { + struct kmemleak_object *object = + rb_entry(rb, struct kmemleak_object, rb_node); + if (ptr < object->pointer) + rb = object->rb_node.rb_left; + else if (object->pointer + object->size <= ptr) + rb = object->rb_node.rb_right; + else if (object->pointer == ptr || alias) + return object; + else { kmemleak_warn("Found object by alias at 0x%08lx\n", ptr); dump_object_info(object); - object = NULL; + break; } - } else - object = NULL; - - return object; + } + return NULL; } /* @@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object) } /* - * Look up an object in the prio search tree and increase its use_count. + * Look up an object in the object search tree and increase its use_count. 
*/ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) { @@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, int min_count, gfp_t gfp) { unsigned long flags; - struct kmemleak_object *object; - struct prio_tree_node *node; + struct kmemleak_object *object, *parent; + struct rb_node **link, *rb_parent; object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { @@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, /* kernel backtrace */ object->trace_len = __save_stack_trace(object->trace); - INIT_PRIO_TREE_NODE(&object->tree_node); - object->tree_node.start = ptr; - object->tree_node.last = ptr + size - 1; - write_lock_irqsave(&kmemleak_lock, flags); min_addr = min(min_addr, ptr); max_addr = max(max_addr, ptr + size); - node = prio_tree_insert(&object_tree_root, &object->tree_node); - /* - * The code calling the kernel does not yet have the pointer to the - * memory block to be able to free it. However, we still hold the - * kmemleak_lock here in case parts of the kernel started freeing - * random memory blocks. - */ - if (node != &object->tree_node) { - kmemleak_stop("Cannot insert 0x%lx into the object search tree " - "(already existing)\n", ptr); - object = lookup_object(ptr, 1); - spin_lock(&object->lock); - dump_object_info(object); - spin_unlock(&object->lock); - - goto out; + link = &object_tree_root.rb_node; + rb_parent = NULL; + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, struct kmemleak_object, rb_node); + if (ptr + size <= parent->pointer) + link = &parent->rb_node.rb_left; + else if (parent->pointer + parent->size <= ptr) + link = &parent->rb_node.rb_right; + else { + kmemleak_stop("Cannot insert 0x%lx into the object " + "search tree (overlaps existing)\n", + ptr); + kmem_cache_free(object_cache, object); + object = parent; + spin_lock(&object->lock); + dump_object_info(object); + spin_unlock(&object->lock); + goto out; + } } + rb_link_node(&object->rb_node, rb_parent, link); + rb_insert_color(&object->rb_node, &object_tree_root); + list_add_tail_rcu(&object->object_list, &object_list); out: write_unlock_irqrestore(&kmemleak_lock, flags); @@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object) unsigned long flags; write_lock_irqsave(&kmemleak_lock, flags); - prio_tree_remove(&object_tree_root, &object->tree_node); + rb_erase(&object->rb_node, &object_tree_root); list_del_rcu(&object->object_list); write_unlock_irqrestore(&kmemleak_lock, flags); @@ -1766,7 +1769,6 @@ void __init kmemleak_init(void) object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); - INIT_PRIO_TREE_ROOT(&object_tree_root); if (crt_early_log >= ARRAY_SIZE(early_log)) pr_warning("Early log buffer exceeded (%d), please increase " From 147e615f83c2c36caf89e7a3bf78090ade6f266c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:30 -0700 Subject: [PATCH 082/160] prio_tree: remove After both prio_tree users have been converted to use red-black trees, there is no need to keep around the prio tree library anymore. 
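As a point of reference for the conversion above, here is a minimal sketch of how the generic interval tree API generated by lib/interval_tree.c is used on its own. The structure and function names (struct interval_tree_node with start/last, interval_tree_insert/remove, interval_tree_iter_first/next on a struct rb_root) come from the code shown earlier in this series; the header path, the tree-root variable, and the wrapper function are illustrative assumptions, not part of any patch here.

#include <linux/interval_tree.h>	/* assumed location of the declarations above */
#include <linux/rbtree.h>

static struct rb_root example_tree = RB_ROOT;	/* hypothetical tree root */

static void example_usage(void)
{
	struct interval_tree_node node, *it;

	node.start = 10;		/* store the closed interval [10;19] */
	node.last  = 19;
	interval_tree_insert(&node, &example_tree);

	/* visit every stored interval intersecting [15;30] */
	for (it = interval_tree_iter_first(&example_tree, 15, 30); it;
	     it = interval_tree_iter_next(it, 15, 30)) {
		/* container_of(it, ...) reaches the enclosing object */
	}

	interval_tree_remove(&node, &example_tree);
}

The same [start;last] closed-interval convention is what the vma flavour encodes through ITSTART()/ITLAST() in mm/interval_tree.c.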
Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/00-INDEX | 2 - Documentation/prio_tree.txt | 107 --------- include/linux/prio_tree.h | 120 ---------- init/main.c | 2 - lib/Kconfig.debug | 6 - lib/Makefile | 3 +- lib/prio_tree.c | 455 ------------------------------------ lib/prio_tree_test.c | 106 --------- 8 files changed, 1 insertion(+), 800 deletions(-) delete mode 100644 Documentation/prio_tree.txt delete mode 100644 include/linux/prio_tree.h delete mode 100644 lib/prio_tree.c delete mode 100644 lib/prio_tree_test.c diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 49c051380daf..f54273e2ac97 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -270,8 +270,6 @@ preempt-locking.txt - info on locking under a preemptive kernel. printk-formats.txt - how to get printk format specifiers right -prio_tree.txt - - info on radix-priority-search-tree use for indexing vmas. ramoops.txt - documentation of the ramoops oops/panic logging module. rbtree.txt diff --git a/Documentation/prio_tree.txt b/Documentation/prio_tree.txt deleted file mode 100644 index 3aa68f9a117b..000000000000 --- a/Documentation/prio_tree.txt +++ /dev/null @@ -1,107 +0,0 @@ -The prio_tree.c code indexes vmas using 3 different indexes: - * heap_index = vm_pgoff + vm_size_in_pages : end_vm_pgoff - * radix_index = vm_pgoff : start_vm_pgoff - * size_index = vm_size_in_pages - -A regular radix-priority-search-tree indexes vmas using only heap_index and -radix_index. The conditions for indexing are: - * ->heap_index >= ->left->heap_index && - ->heap_index >= ->right->heap_index - * if (->heap_index == ->left->heap_index) - then ->radix_index < ->left->radix_index; - * if (->heap_index == ->right->heap_index) - then ->radix_index < ->right->radix_index; - * nodes are hashed to left or right subtree using radix_index - similar to a pure binary radix tree. - -A regular radix-priority-search-tree helps to store and query -intervals (vmas). However, a regular radix-priority-search-tree is only -suitable for storing vmas with different radix indices (vm_pgoff). - -Therefore, the prio_tree.c extends the regular radix-priority-search-tree -to handle many vmas with the same vm_pgoff. Such vmas are handled in -2 different ways: 1) All vmas with the same radix _and_ heap indices are -linked using vm_set.list, 2) if there are many vmas with the same radix -index, but different heap indices and if the regular radix-priority-search -tree cannot index them all, we build an overflow-sub-tree that indexes such -vmas using heap and size indices instead of heap and radix indices. For -example, in the figure below some vmas with vm_pgoff = 0 (zero) are -indexed by regular radix-priority-search-tree whereas others are pushed -into an overflow-subtree. Note that all vmas in an overflow-sub-tree have -the same vm_pgoff (radix_index) and if necessary we build different -overflow-sub-trees to handle each possible radix_index. For example, -in figure we have 3 overflow-sub-trees corresponding to radix indices -0, 2, and 4. - -In the final tree the first few (prio_tree_root->index_bits) levels -are indexed using heap and radix indices whereas the overflow-sub-trees below -those levels (i.e. levels prio_tree_root->index_bits + 1 and higher) are -indexed using heap and size indices. 
In overflow-sub-trees the size_index -is used for hashing the nodes to appropriate places. - -Now, an example prio_tree: - - vmas are represented [radix_index, size_index, heap_index] - i.e., [start_vm_pgoff, vm_size_in_pages, end_vm_pgoff] - -level prio_tree_root->index_bits = 3 ------ - _ - 0 [0,7,7] | - / \ | - ------------------ ------------ | Regular - / \ | radix priority - 1 [1,6,7] [4,3,7] | search tree - / \ / \ | - ------- ----- ------ ----- | heap-and-radix - / \ / \ | indexed - 2 [0,6,6] [2,5,7] [5,2,7] [6,1,7] | - / \ / \ / \ / \ | - 3 [0,5,5] [1,5,6] [2,4,6] [3,4,7] [4,2,6] [5,1,6] [6,0,6] [7,0,7] | - / / / _ - / / / _ - 4 [0,4,4] [2,3,5] [4,1,5] | - / / / | - 5 [0,3,3] [2,2,4] [4,0,4] | Overflow-sub-trees - / / | - 6 [0,2,2] [2,1,3] | heap-and-size - / / | indexed - 7 [0,1,1] [2,0,2] | - / | - 8 [0,0,0] | - _ - -Note that we use prio_tree_root->index_bits to optimize the height -of the heap-and-radix indexed tree. Since prio_tree_root->index_bits is -set according to the maximum end_vm_pgoff mapped, we are sure that all -bits (in vm_pgoff) above prio_tree_root->index_bits are 0 (zero). Therefore, -we only use the first prio_tree_root->index_bits as radix_index. -Whenever index_bits is increased in prio_tree_expand, we shuffle the tree -to make sure that the first prio_tree_root->index_bits levels of the tree -is indexed properly using heap and radix indices. - -We do not optimize the height of overflow-sub-trees using index_bits. -The reason is: there can be many such overflow-sub-trees and all of -them have to be suffled whenever the index_bits increases. This may involve -walking the whole prio_tree in prio_tree_insert->prio_tree_expand code -path which is not desirable. Hence, we do not optimize the height of the -heap-and-size indexed overflow-sub-trees using prio_tree->index_bits. -Instead the overflow sub-trees are indexed using full BITS_PER_LONG bits -of size_index. This may lead to skewed sub-trees because most of the -higher significant bits of the size_index are likely to be 0 (zero). In -the example above, all 3 overflow-sub-trees are skewed. This may marginally -affect the performance. However, processes rarely map many vmas with the -same start_vm_pgoff but different end_vm_pgoffs. Therefore, we normally -do not require overflow-sub-trees to index all vmas. - -From the above discussion it is clear that the maximum height of -a prio_tree can be prio_tree_root->index_bits + BITS_PER_LONG. -However, in most of the common cases we do not need overflow-sub-trees, -so the tree height in the common cases will be prio_tree_root->index_bits. - -It is fair to mention here that the prio_tree_root->index_bits -is increased on demand, however, the index_bits is not decreased when -vmas are removed from the prio_tree. That's tricky to do. Hence, it's -left as a home work problem. - - diff --git a/include/linux/prio_tree.h b/include/linux/prio_tree.h deleted file mode 100644 index db04abb557e0..000000000000 --- a/include/linux/prio_tree.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef _LINUX_PRIO_TREE_H -#define _LINUX_PRIO_TREE_H - -/* - * K&R 2nd ed. A8.3 somewhat obliquely hints that initial sequences of struct - * fields with identical types should end up at the same location. We'll use - * this until we can scrap struct raw_prio_tree_node. - * - * Note: all this could be done more elegantly by using unnamed union/struct - * fields. However, gcc 2.95.3 and apparently also gcc 3.0.4 don't support this - * language extension. 
- */ - -struct raw_prio_tree_node { - struct prio_tree_node *left; - struct prio_tree_node *right; - struct prio_tree_node *parent; -}; - -struct prio_tree_node { - struct prio_tree_node *left; - struct prio_tree_node *right; - struct prio_tree_node *parent; - unsigned long start; - unsigned long last; /* last location _in_ interval */ -}; - -struct prio_tree_root { - struct prio_tree_node *prio_tree_node; - unsigned short index_bits; - unsigned short raw; - /* - * 0: nodes are of type struct prio_tree_node - * 1: nodes are of type raw_prio_tree_node - */ -}; - -struct prio_tree_iter { - struct prio_tree_node *cur; - unsigned long mask; - unsigned long value; - int size_level; - - struct prio_tree_root *root; - pgoff_t r_index; - pgoff_t h_index; -}; - -static inline void prio_tree_iter_init(struct prio_tree_iter *iter, - struct prio_tree_root *root, pgoff_t r_index, pgoff_t h_index) -{ - iter->root = root; - iter->r_index = r_index; - iter->h_index = h_index; - iter->cur = NULL; -} - -#define __INIT_PRIO_TREE_ROOT(ptr, _raw) \ -do { \ - (ptr)->prio_tree_node = NULL; \ - (ptr)->index_bits = 1; \ - (ptr)->raw = (_raw); \ -} while (0) - -#define INIT_PRIO_TREE_ROOT(ptr) __INIT_PRIO_TREE_ROOT(ptr, 0) -#define INIT_RAW_PRIO_TREE_ROOT(ptr) __INIT_PRIO_TREE_ROOT(ptr, 1) - -#define INIT_PRIO_TREE_NODE(ptr) \ -do { \ - (ptr)->left = (ptr)->right = (ptr)->parent = (ptr); \ -} while (0) - -#define INIT_PRIO_TREE_ITER(ptr) \ -do { \ - (ptr)->cur = NULL; \ - (ptr)->mask = 0UL; \ - (ptr)->value = 0UL; \ - (ptr)->size_level = 0; \ -} while (0) - -#define prio_tree_entry(ptr, type, member) \ - ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) - -static inline int prio_tree_empty(const struct prio_tree_root *root) -{ - return root->prio_tree_node == NULL; -} - -static inline int prio_tree_root(const struct prio_tree_node *node) -{ - return node->parent == node; -} - -static inline int prio_tree_left_empty(const struct prio_tree_node *node) -{ - return node->left == node; -} - -static inline int prio_tree_right_empty(const struct prio_tree_node *node) -{ - return node->right == node; -} - - -struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root, - struct prio_tree_node *old, struct prio_tree_node *node); -struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root, - struct prio_tree_node *node); -void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node); -struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter); - -#define raw_prio_tree_replace(root, old, node) \ - prio_tree_replace(root, (struct prio_tree_node *) (old), \ - (struct prio_tree_node *) (node)) -#define raw_prio_tree_insert(root, node) \ - prio_tree_insert(root, (struct prio_tree_node *) (node)) -#define raw_prio_tree_remove(root, node) \ - prio_tree_remove(root, (struct prio_tree_node *) (node)) - -#endif /* _LINUX_PRIO_TREE_H */ diff --git a/init/main.c b/init/main.c index db34c0ec4711..313360fe1118 100644 --- a/init/main.c +++ b/init/main.c @@ -86,7 +86,6 @@ extern void init_IRQ(void); extern void fork_init(unsigned long); extern void mca_init(void); extern void sbus_init(void); -extern void prio_tree_init(void); extern void radix_tree_init(void); #ifndef CONFIG_DEBUG_RODATA static inline void mark_rodata_ro(void) { } @@ -547,7 +546,6 @@ asmlinkage void __init start_kernel(void) /* init some links before init_ISA_irqs() */ early_irq_init(); init_IRQ(); - prio_tree_init(); init_timers(); hrtimers_init(); softirq_init(); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug 
index ee9f030b6951..a6e7e7741523 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1289,12 +1289,6 @@ config RBTREE_TEST A benchmark measuring the performance of the rbtree library. Also includes rbtree invariant checks. -config PRIO_TREE_TEST - tristate "Prio tree test" - depends on m && DEBUG_KERNEL - help - A benchmark measuring the performance of the prio tree library - config INTERVAL_TREE_TEST tristate "Interval tree test" depends on m && DEBUG_KERNEL diff --git a/lib/Makefile b/lib/Makefile index 26f578bf616a..3128e357e286 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -9,7 +9,7 @@ endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ - idr.o int_sqrt.o extable.o prio_tree.o \ + idr.o int_sqrt.o extable.o \ sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o @@ -141,7 +141,6 @@ $(foreach file, $(libfdt_files), \ lib-$(CONFIG_LIBFDT) += $(libfdt_files) obj-$(CONFIG_RBTREE_TEST) += rbtree_test.o -obj-$(CONFIG_PRIO_TREE_TEST) += prio_tree_test.o obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o interval_tree_test-objs := interval_tree_test_main.o interval_tree.o diff --git a/lib/prio_tree.c b/lib/prio_tree.c deleted file mode 100644 index bba37148c15e..000000000000 --- a/lib/prio_tree.c +++ /dev/null @@ -1,455 +0,0 @@ -/* - * lib/prio_tree.c - priority search tree - * - * Copyright (C) 2004, Rajesh Venkatasubramanian - * - * This file is released under the GPL v2. - * - * Based on the radix priority search tree proposed by Edward M. McCreight - * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 - * - * 02Feb2004 Initial version - */ - -#include -#include -#include -#include - -/* - * A clever mix of heap and radix trees forms a radix priority search tree (PST) - * which is useful for storing intervals, e.g, we can consider a vma as a closed - * interval of file pages [offset_begin, offset_end], and store all vmas that - * map a file in a PST. Then, using the PST, we can answer a stabbing query, - * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a - * given input interval X (a set of consecutive file pages), in "O(log n + m)" - * time where 'log n' is the height of the PST, and 'm' is the number of stored - * intervals (vmas) that overlap (map) with the input interval X (the set of - * consecutive file pages). - * - * In our implementation, we store closed intervals of the form [radix_index, - * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST - * is designed for storing intervals with unique radix indices, i.e., each - * interval have different radix_index. However, this limitation can be easily - * overcome by using the size, i.e., heap_index - radix_index, as part of the - * index, so we index the tree using [(radix_index,size), heap_index]. - * - * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit - * machine, the maximum height of a PST can be 64. We can use a balanced version - * of the priority search tree to optimize the tree height, but the balanced - * tree proposed by McCreight is too complex and memory-hungry for our purpose. 
- */ - -/* - * The following macros are used for implementing prio_tree for i_mmap - */ - -static void get_index(const struct prio_tree_root *root, - const struct prio_tree_node *node, - unsigned long *radix, unsigned long *heap) -{ - *radix = node->start; - *heap = node->last; -} - -static unsigned long index_bits_to_maxindex[BITS_PER_LONG]; - -void __init prio_tree_init(void) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++) - index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1; - index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL; -} - -/* - * Maximum heap_index that can be stored in a PST with index_bits bits - */ -static inline unsigned long prio_tree_maxindex(unsigned int bits) -{ - return index_bits_to_maxindex[bits - 1]; -} - -static void prio_set_parent(struct prio_tree_node *parent, - struct prio_tree_node *child, bool left) -{ - if (left) - parent->left = child; - else - parent->right = child; - - child->parent = parent; -} - -/* - * Extend a priority search tree so that it can store a node with heap_index - * max_heap_index. In the worst case, this algorithm takes O((log n)^2). - * However, this function is used rarely and the common case performance is - * not bad. - */ -static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root, - struct prio_tree_node *node, unsigned long max_heap_index) -{ - struct prio_tree_node *prev; - - if (max_heap_index > prio_tree_maxindex(root->index_bits)) - root->index_bits++; - - prev = node; - INIT_PRIO_TREE_NODE(node); - - while (max_heap_index > prio_tree_maxindex(root->index_bits)) { - struct prio_tree_node *tmp = root->prio_tree_node; - - root->index_bits++; - - if (prio_tree_empty(root)) - continue; - - prio_tree_remove(root, root->prio_tree_node); - INIT_PRIO_TREE_NODE(tmp); - - prio_set_parent(prev, tmp, true); - prev = tmp; - } - - if (!prio_tree_empty(root)) - prio_set_parent(prev, root->prio_tree_node, true); - - root->prio_tree_node = node; - return node; -} - -/* - * Replace a prio_tree_node with a new node and return the old node - */ -struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root, - struct prio_tree_node *old, struct prio_tree_node *node) -{ - INIT_PRIO_TREE_NODE(node); - - if (prio_tree_root(old)) { - BUG_ON(root->prio_tree_node != old); - /* - * We can reduce root->index_bits here. However, it is complex - * and does not help much to improve performance (IMO). - */ - root->prio_tree_node = node; - } else - prio_set_parent(old->parent, node, old->parent->left == old); - - if (!prio_tree_left_empty(old)) - prio_set_parent(node, old->left, true); - - if (!prio_tree_right_empty(old)) - prio_set_parent(node, old->right, false); - - return old; -} - -/* - * Insert a prio_tree_node @node into a radix priority search tree @root. The - * algorithm typically takes O(log n) time where 'log n' is the number of bits - * required to represent the maximum heap_index. In the worst case, the algo - * can take O((log n)^2) - check prio_tree_expand. - * - * If a prior node with same radix_index and heap_index is already found in - * the tree, then returns the address of the prior node. Otherwise, inserts - * @node into the tree and returns @node. 
- */ -struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root, - struct prio_tree_node *node) -{ - struct prio_tree_node *cur, *res = node; - unsigned long radix_index, heap_index; - unsigned long r_index, h_index, index, mask; - int size_flag = 0; - - get_index(root, node, &radix_index, &heap_index); - - if (prio_tree_empty(root) || - heap_index > prio_tree_maxindex(root->index_bits)) - return prio_tree_expand(root, node, heap_index); - - cur = root->prio_tree_node; - mask = 1UL << (root->index_bits - 1); - - while (mask) { - get_index(root, cur, &r_index, &h_index); - - if (r_index == radix_index && h_index == heap_index) - return cur; - - if (h_index < heap_index || - (h_index == heap_index && r_index > radix_index)) { - struct prio_tree_node *tmp = node; - node = prio_tree_replace(root, cur, node); - cur = tmp; - /* swap indices */ - index = r_index; - r_index = radix_index; - radix_index = index; - index = h_index; - h_index = heap_index; - heap_index = index; - } - - if (size_flag) - index = heap_index - radix_index; - else - index = radix_index; - - if (index & mask) { - if (prio_tree_right_empty(cur)) { - INIT_PRIO_TREE_NODE(node); - prio_set_parent(cur, node, false); - return res; - } else - cur = cur->right; - } else { - if (prio_tree_left_empty(cur)) { - INIT_PRIO_TREE_NODE(node); - prio_set_parent(cur, node, true); - return res; - } else - cur = cur->left; - } - - mask >>= 1; - - if (!mask) { - mask = 1UL << (BITS_PER_LONG - 1); - size_flag = 1; - } - } - /* Should not reach here */ - BUG(); - return NULL; -} -EXPORT_SYMBOL(prio_tree_insert); - -/* - * Remove a prio_tree_node @node from a radix priority search tree @root. The - * algorithm takes O(log n) time where 'log n' is the number of bits required - * to represent the maximum heap_index. 
- */ -void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node) -{ - struct prio_tree_node *cur; - unsigned long r_index, h_index_right, h_index_left; - - cur = node; - - while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) { - if (!prio_tree_left_empty(cur)) - get_index(root, cur->left, &r_index, &h_index_left); - else { - cur = cur->right; - continue; - } - - if (!prio_tree_right_empty(cur)) - get_index(root, cur->right, &r_index, &h_index_right); - else { - cur = cur->left; - continue; - } - - /* both h_index_left and h_index_right cannot be 0 */ - if (h_index_left >= h_index_right) - cur = cur->left; - else - cur = cur->right; - } - - if (prio_tree_root(cur)) { - BUG_ON(root->prio_tree_node != cur); - __INIT_PRIO_TREE_ROOT(root, root->raw); - return; - } - - if (cur->parent->right == cur) - cur->parent->right = cur->parent; - else - cur->parent->left = cur->parent; - - while (cur != node) - cur = prio_tree_replace(root, cur->parent, cur); -} -EXPORT_SYMBOL(prio_tree_remove); - -static void iter_walk_down(struct prio_tree_iter *iter) -{ - iter->mask >>= 1; - if (iter->mask) { - if (iter->size_level) - iter->size_level++; - return; - } - - if (iter->size_level) { - BUG_ON(!prio_tree_left_empty(iter->cur)); - BUG_ON(!prio_tree_right_empty(iter->cur)); - iter->size_level++; - iter->mask = ULONG_MAX; - } else { - iter->size_level = 1; - iter->mask = 1UL << (BITS_PER_LONG - 1); - } -} - -static void iter_walk_up(struct prio_tree_iter *iter) -{ - if (iter->mask == ULONG_MAX) - iter->mask = 1UL; - else if (iter->size_level == 1) - iter->mask = 1UL; - else - iter->mask <<= 1; - if (iter->size_level) - iter->size_level--; - if (!iter->size_level && (iter->value & iter->mask)) - iter->value ^= iter->mask; -} - -/* - * Following functions help to enumerate all prio_tree_nodes in the tree that - * overlap with the input interval X [radix_index, heap_index]. The enumeration - * takes O(log n + m) time where 'log n' is the height of the tree (which is - * proportional to # of bits required to represent the maximum heap_index) and - * 'm' is the number of prio_tree_nodes that overlap the interval X. 
- */ - -static struct prio_tree_node *prio_tree_left(struct prio_tree_iter *iter, - unsigned long *r_index, unsigned long *h_index) -{ - if (prio_tree_left_empty(iter->cur)) - return NULL; - - get_index(iter->root, iter->cur->left, r_index, h_index); - - if (iter->r_index <= *h_index) { - iter->cur = iter->cur->left; - iter_walk_down(iter); - return iter->cur; - } - - return NULL; -} - -static struct prio_tree_node *prio_tree_right(struct prio_tree_iter *iter, - unsigned long *r_index, unsigned long *h_index) -{ - unsigned long value; - - if (prio_tree_right_empty(iter->cur)) - return NULL; - - if (iter->size_level) - value = iter->value; - else - value = iter->value | iter->mask; - - if (iter->h_index < value) - return NULL; - - get_index(iter->root, iter->cur->right, r_index, h_index); - - if (iter->r_index <= *h_index) { - iter->cur = iter->cur->right; - iter_walk_down(iter); - return iter->cur; - } - - return NULL; -} - -static struct prio_tree_node *prio_tree_parent(struct prio_tree_iter *iter) -{ - iter->cur = iter->cur->parent; - iter_walk_up(iter); - return iter->cur; -} - -static inline int overlap(struct prio_tree_iter *iter, - unsigned long r_index, unsigned long h_index) -{ - return iter->h_index >= r_index && iter->r_index <= h_index; -} - -/* - * prio_tree_first: - * - * Get the first prio_tree_node that overlaps with the interval [radix_index, - * heap_index]. Note that always radix_index <= heap_index. We do a pre-order - * traversal of the tree. - */ -static struct prio_tree_node *prio_tree_first(struct prio_tree_iter *iter) -{ - struct prio_tree_root *root; - unsigned long r_index, h_index; - - INIT_PRIO_TREE_ITER(iter); - - root = iter->root; - if (prio_tree_empty(root)) - return NULL; - - get_index(root, root->prio_tree_node, &r_index, &h_index); - - if (iter->r_index > h_index) - return NULL; - - iter->mask = 1UL << (root->index_bits - 1); - iter->cur = root->prio_tree_node; - - while (1) { - if (overlap(iter, r_index, h_index)) - return iter->cur; - - if (prio_tree_left(iter, &r_index, &h_index)) - continue; - - if (prio_tree_right(iter, &r_index, &h_index)) - continue; - - break; - } - return NULL; -} - -/* - * prio_tree_next: - * - * Get the next prio_tree_node that overlaps with the input interval in iter - */ -struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter) -{ - unsigned long r_index, h_index; - - if (iter->cur == NULL) - return prio_tree_first(iter); - -repeat: - while (prio_tree_left(iter, &r_index, &h_index)) - if (overlap(iter, r_index, h_index)) - return iter->cur; - - while (!prio_tree_right(iter, &r_index, &h_index)) { - while (!prio_tree_root(iter->cur) && - iter->cur->parent->right == iter->cur) - prio_tree_parent(iter); - - if (prio_tree_root(iter->cur)) - return NULL; - - prio_tree_parent(iter); - } - - if (overlap(iter, r_index, h_index)) - return iter->cur; - - goto repeat; -} -EXPORT_SYMBOL(prio_tree_next); diff --git a/lib/prio_tree_test.c b/lib/prio_tree_test.c deleted file mode 100644 index c26084ddc6a4..000000000000 --- a/lib/prio_tree_test.c +++ /dev/null @@ -1,106 +0,0 @@ -#include -#include -#include -#include - -#define NODES 100 -#define PERF_LOOPS 100000 -#define SEARCHES 100 -#define SEARCH_LOOPS 10000 - -static struct prio_tree_root root; -static struct prio_tree_node nodes[NODES]; -static u32 queries[SEARCHES]; - -static struct rnd_state rnd; - -static inline unsigned long -search(unsigned long query, struct prio_tree_root *root) -{ - struct prio_tree_iter iter; - unsigned long results = 0; - - 
prio_tree_iter_init(&iter, root, query, query); - while (prio_tree_next(&iter)) - results++; - return results; -} - -static void init(void) -{ - int i; - for (i = 0; i < NODES; i++) { - u32 a = prandom32(&rnd), b = prandom32(&rnd); - if (a <= b) { - nodes[i].start = a; - nodes[i].last = b; - } else { - nodes[i].start = b; - nodes[i].last = a; - } - } - for (i = 0; i < SEARCHES; i++) - queries[i] = prandom32(&rnd); -} - -static int prio_tree_test_init(void) -{ - int i, j; - unsigned long results; - cycles_t time1, time2, time; - - printk(KERN_ALERT "prio tree insert/remove"); - - prandom32_seed(&rnd, 3141592653589793238ULL); - INIT_PRIO_TREE_ROOT(&root); - init(); - - time1 = get_cycles(); - - for (i = 0; i < PERF_LOOPS; i++) { - for (j = 0; j < NODES; j++) - prio_tree_insert(&root, nodes + j); - for (j = 0; j < NODES; j++) - prio_tree_remove(&root, nodes + j); - } - - time2 = get_cycles(); - time = time2 - time1; - - time = div_u64(time, PERF_LOOPS); - printk(" -> %llu cycles\n", (unsigned long long)time); - - printk(KERN_ALERT "prio tree search"); - - for (j = 0; j < NODES; j++) - prio_tree_insert(&root, nodes + j); - - time1 = get_cycles(); - - results = 0; - for (i = 0; i < SEARCH_LOOPS; i++) - for (j = 0; j < SEARCHES; j++) - results += search(queries[j], &root); - - time2 = get_cycles(); - time = time2 - time1; - - time = div_u64(time, SEARCH_LOOPS); - results = div_u64(results, SEARCH_LOOPS); - printk(" -> %llu cycles (%lu results)\n", - (unsigned long long)time, results); - - return -EAGAIN; /* Fail will directly unload the module */ -} - -static void prio_tree_test_exit(void) -{ - printk(KERN_ALERT "test exit\n"); -} - -module_init(prio_tree_test_init) -module_exit(prio_tree_test_exit) - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Michel Lespinasse"); -MODULE_DESCRIPTION("Prio Tree test"); From 9c079add0d0f45220f4bb37febf0621137ec2d38 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:33 -0700 Subject: [PATCH 083/160] rbtree: move augmented rbtree functionality to rbtree_augmented.h Provide rb_insert_augmented() and rb_erase_augmented() through a new rbtree_augmented.h include file. rb_erase_augmented() is defined there as an __always_inline function, in order to allow inlining of augmented rbtree callbacks into it. Since this generates a relatively large function, each augmented rbtree user should make sure to have a single call site. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/rbtree.txt | 13 ++ arch/x86/mm/pat_rbtree.c | 2 +- include/linux/interval_tree_tmpl.h | 8 +- include/linux/rbtree.h | 48 ------- include/linux/rbtree_augmented.h | 223 +++++++++++++++++++++++++++++ lib/rbtree.c | 162 ++------------------- lib/rbtree_test.c | 2 +- 7 files changed, 255 insertions(+), 203 deletions(-) create mode 100644 include/linux/rbtree_augmented.h diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt index 0a0b6dce3e08..61b6c48871a0 100644 --- a/Documentation/rbtree.txt +++ b/Documentation/rbtree.txt @@ -202,6 +202,14 @@ An rbtree user who wants this feature will have to call the augmentation functions with the user provided augmentation callback when inserting and erasing nodes. +C files implementing augmented rbtree manipulation must include + instead of . 
Note that +linux/rbtree_augmented.h exposes some rbtree implementations details +you are not expected to rely on; please stick to the documented APIs +there and do not include from header files +either so as to minimize chances of your users accidentally relying on +such implementation details. + On insertion, the user must update the augmented information on the path leading to the inserted node, then call rb_link_node() as usual and rb_augment_inserted() instead of the usual rb_insert_color() call. @@ -227,6 +235,11 @@ In both cases, the callbacks are provided through struct rb_augment_callbacks. subtree to a newly assigned subtree root AND recomputes the augmented information for the former subtree root. +The compiled code for rb_erase_augmented() may inline the propagation and +copy callbacks, which results in a large function, so each augmented rbtree +user should have a single rb_erase_augmented() call site in order to limit +compiled code size. + Sample usage: diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 4d116959075d..415f6c4ced36 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/interval_tree_tmpl.h b/include/linux/interval_tree_tmpl.h index c65deda31413..c1aeb922d65f 100644 --- a/include/linux/interval_tree_tmpl.h +++ b/include/linux/interval_tree_tmpl.h @@ -19,6 +19,8 @@ include/linux/interval_tree_tmpl.h */ +#include + /* * Template for implementing interval trees * @@ -57,7 +59,8 @@ static inline ITTYPE IT(compute_subtree_last)(ITSTRUCT *node) return max; } -static void IT(augment_propagate)(struct rb_node *rb, struct rb_node *stop) +static inline void +IT(augment_propagate)(struct rb_node *rb, struct rb_node *stop) { while (rb != stop) { ITSTRUCT *node = rb_entry(rb, ITSTRUCT, ITRB); @@ -69,7 +72,8 @@ static void IT(augment_propagate)(struct rb_node *rb, struct rb_node *stop) } } -static void IT(augment_copy)(struct rb_node *rb_old, struct rb_node *rb_new) +static inline void +IT(augment_copy)(struct rb_node *rb_old, struct rb_node *rb_new) { ITSTRUCT *old = rb_entry(rb_old, ITSTRUCT, ITRB); ITSTRUCT *new = rb_entry(rb_new, ITSTRUCT, ITRB); diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 8d1e83b1c87b..0022c1bb1e26 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -62,54 +62,6 @@ extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); -struct rb_augment_callbacks { - void (*propagate)(struct rb_node *node, struct rb_node *stop); - void (*copy)(struct rb_node *old, struct rb_node *new); - void (*rotate)(struct rb_node *old, struct rb_node *new); -}; - -extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, - void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); -extern void rb_erase_augmented(struct rb_node *node, struct rb_root *root, - const struct rb_augment_callbacks *augment); -static inline void -rb_insert_augmented(struct rb_node *node, struct rb_root *root, - const struct rb_augment_callbacks *augment) -{ - __rb_insert_augmented(node, root, augment->rotate); -} - -#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ - rbtype, rbaugmented, rbcompute) \ -static void rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ -{ \ - while (rb != stop) { \ - rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ - rbtype augmented = rbcompute(node); \ - if 
(node->rbaugmented == augmented) \ - break; \ - node->rbaugmented = augmented; \ - rb = rb_parent(&node->rbfield); \ - } \ -} \ -static void rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ -{ \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ -} \ -static void rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ -{ \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ - old->rbaugmented = rbcompute(old); \ -} \ -rbstatic const struct rb_augment_callbacks rbname = { \ - rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ -}; - - /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h new file mode 100644 index 000000000000..214caa33433b --- /dev/null +++ b/include/linux/rbtree_augmented.h @@ -0,0 +1,223 @@ +/* + Red Black Trees + (C) 1999 Andrea Arcangeli + (C) 2002 David Woodhouse + (C) 2012 Michel Lespinasse + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + linux/include/linux/rbtree_augmented.h +*/ + +#ifndef _LINUX_RBTREE_AUGMENTED_H +#define _LINUX_RBTREE_AUGMENTED_H + +#include + +/* + * Please note - only struct rb_augment_callbacks and the prototypes for + * rb_insert_augmented() and rb_erase_augmented() are intended to be public. + * The rest are implementation details you are not expected to depend on. + * + * See Documentation/rbtree.txt for documentation and samples. 
+ */ + +struct rb_augment_callbacks { + void (*propagate)(struct rb_node *node, struct rb_node *stop); + void (*copy)(struct rb_node *old, struct rb_node *new); + void (*rotate)(struct rb_node *old, struct rb_node *new); +}; + +extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); +static inline void +rb_insert_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + __rb_insert_augmented(node, root, augment->rotate); +} + +#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ + rbtype, rbaugmented, rbcompute) \ +static inline void \ +rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +{ \ + while (rb != stop) { \ + rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ + rbtype augmented = rbcompute(node); \ + if (node->rbaugmented == augmented) \ + break; \ + node->rbaugmented = augmented; \ + rb = rb_parent(&node->rbfield); \ + } \ +} \ +static inline void \ +rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + new->rbaugmented = old->rbaugmented; \ +} \ +static void \ +rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + new->rbaugmented = old->rbaugmented; \ + old->rbaugmented = rbcompute(old); \ +} \ +rbstatic const struct rb_augment_callbacks rbname = { \ + rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ +}; + + +#define RB_RED 0 +#define RB_BLACK 1 + +#define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) + +#define __rb_color(pc) ((pc) & 1) +#define __rb_is_black(pc) __rb_color(pc) +#define __rb_is_red(pc) (!__rb_color(pc)) +#define rb_color(rb) __rb_color((rb)->__rb_parent_color) +#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) +#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) + +static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) +{ + rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; +} + +static inline void rb_set_parent_color(struct rb_node *rb, + struct rb_node *p, int color) +{ + rb->__rb_parent_color = (unsigned long)p | color; +} + +static inline void +__rb_change_child(struct rb_node *old, struct rb_node *new, + struct rb_node *parent, struct rb_root *root) +{ + if (parent) { + if (parent->rb_left == old) + parent->rb_left = new; + else + parent->rb_right = new; + } else + root->rb_node = new; +} + +extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); + +static __always_inline void +rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + struct rb_node *child = node->rb_right, *tmp = node->rb_left; + struct rb_node *parent, *rebalance; + unsigned long pc; + + if (!tmp) { + /* + * Case 1: node to erase has no more than 1 child (easy!) + * + * Note that if there is one child it must be red due to 5) + * and node must be black due to 4). We adjust colors locally + * so as to bypass __rb_erase_color() later on. + */ + pc = node->__rb_parent_color; + parent = __rb_parent(pc); + __rb_change_child(node, child, parent, root); + if (child) { + child->__rb_parent_color = pc; + rebalance = NULL; + } else + rebalance = __rb_is_black(pc) ? 
parent : NULL; + tmp = parent; + } else if (!child) { + /* Still case 1, but this time the child is node->rb_left */ + tmp->__rb_parent_color = pc = node->__rb_parent_color; + parent = __rb_parent(pc); + __rb_change_child(node, tmp, parent, root); + rebalance = NULL; + tmp = parent; + } else { + struct rb_node *successor = child, *child2; + tmp = child->rb_left; + if (!tmp) { + /* + * Case 2: node's successor is its right child + * + * (n) (s) + * / \ / \ + * (x) (s) -> (x) (c) + * \ + * (c) + */ + parent = successor; + child2 = successor->rb_right; + augment->copy(node, successor); + } else { + /* + * Case 3: node's successor is leftmost under + * node's right child subtree + * + * (n) (s) + * / \ / \ + * (x) (y) -> (x) (y) + * / / + * (p) (p) + * / / + * (s) (c) + * \ + * (c) + */ + do { + parent = successor; + successor = tmp; + tmp = tmp->rb_left; + } while (tmp); + parent->rb_left = child2 = successor->rb_right; + successor->rb_right = child; + rb_set_parent(child, successor); + augment->copy(node, successor); + augment->propagate(parent, successor); + } + + successor->rb_left = tmp = node->rb_left; + rb_set_parent(tmp, successor); + + pc = node->__rb_parent_color; + tmp = __rb_parent(pc); + __rb_change_child(node, successor, tmp, root); + if (child2) { + successor->__rb_parent_color = pc; + rb_set_parent_color(child2, parent, RB_BLACK); + rebalance = NULL; + } else { + unsigned long pc2 = successor->__rb_parent_color; + successor->__rb_parent_color = pc; + rebalance = __rb_is_black(pc2) ? parent : NULL; + } + tmp = successor; + } + + augment->propagate(tmp, NULL); + if (rebalance) + __rb_erase_color(rebalance, root, augment->rotate); +} + +#endif /* _LINUX_RBTREE_AUGMENTED_H */ diff --git a/lib/rbtree.c b/lib/rbtree.c index c0088ca345f9..4f56a11d67fa 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -21,7 +21,7 @@ linux/lib/rbtree.c */ -#include +#include #include /* @@ -44,52 +44,16 @@ * parentheses and have some accompanying text comment. 
*/ -#define RB_RED 0 -#define RB_BLACK 1 - -#define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) - -#define __rb_color(pc) ((pc) & 1) -#define __rb_is_black(pc) __rb_color(pc) -#define __rb_is_red(pc) (!__rb_color(pc)) -#define rb_color(rb) __rb_color((rb)->__rb_parent_color) -#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) -#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) - static inline void rb_set_black(struct rb_node *rb) { rb->__rb_parent_color |= RB_BLACK; } -static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) -{ - rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; -} - -static inline void rb_set_parent_color(struct rb_node *rb, - struct rb_node *p, int color) -{ - rb->__rb_parent_color = (unsigned long)p | color; -} - static inline struct rb_node *rb_red_parent(struct rb_node *red) { return (struct rb_node *)red->__rb_parent_color; } -static inline void -__rb_change_child(struct rb_node *old, struct rb_node *new, - struct rb_node *parent, struct rb_root *root) -{ - if (parent) { - if (parent->rb_left == old) - parent->rb_left = new; - else - parent->rb_right = new; - } else - root->rb_node = new; -} - /* * Helper function for rotations: * - old's parent and color get assigned to new @@ -230,9 +194,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root, } } -static __always_inline void +__always_inline void __rb_erase_color(struct rb_node *parent, struct rb_root *root, - const struct rb_augment_callbacks *augment) + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; @@ -261,7 +225,7 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); - augment->rotate(parent, sibling); + augment_rotate(parent, sibling); sibling = tmp1; } tmp1 = sibling->rb_right; @@ -313,7 +277,7 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); - augment->rotate(sibling, tmp2); + augment_rotate(sibling, tmp2); tmp1 = sibling; sibling = tmp2; } @@ -336,7 +300,7 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, rb_set_parent(tmp2, parent); __rb_rotate_set_parents(parent, sibling, root, RB_BLACK); - augment->rotate(parent, sibling); + augment_rotate(parent, sibling); break; } else { sibling = parent->rb_left; @@ -347,7 +311,7 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); - augment->rotate(parent, sibling); + augment_rotate(parent, sibling); sibling = tmp1; } tmp1 = sibling->rb_left; @@ -374,7 +338,7 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); - augment->rotate(sibling, tmp2); + augment_rotate(sibling, tmp2); tmp1 = sibling; sibling = tmp2; } @@ -386,109 +350,12 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, rb_set_parent(tmp2, parent); __rb_rotate_set_parents(parent, sibling, root, RB_BLACK); - augment->rotate(parent, sibling); + augment_rotate(parent, sibling); break; } } } - -static __always_inline void -__rb_erase(struct rb_node *node, struct rb_root *root, - const struct rb_augment_callbacks *augment) -{ - struct rb_node *child = node->rb_right, *tmp = node->rb_left; - struct rb_node *parent, *rebalance; - unsigned long pc; - - if (!tmp) { - /* - * Case 1: node to erase has no 
more than 1 child (easy!) - * - * Note that if there is one child it must be red due to 5) - * and node must be black due to 4). We adjust colors locally - * so as to bypass __rb_erase_color() later on. - */ - pc = node->__rb_parent_color; - parent = __rb_parent(pc); - __rb_change_child(node, child, parent, root); - if (child) { - child->__rb_parent_color = pc; - rebalance = NULL; - } else - rebalance = __rb_is_black(pc) ? parent : NULL; - tmp = parent; - } else if (!child) { - /* Still case 1, but this time the child is node->rb_left */ - tmp->__rb_parent_color = pc = node->__rb_parent_color; - parent = __rb_parent(pc); - __rb_change_child(node, tmp, parent, root); - rebalance = NULL; - tmp = parent; - } else { - struct rb_node *successor = child, *child2; - tmp = child->rb_left; - if (!tmp) { - /* - * Case 2: node's successor is its right child - * - * (n) (s) - * / \ / \ - * (x) (s) -> (x) (c) - * \ - * (c) - */ - parent = successor; - child2 = successor->rb_right; - augment->copy(node, successor); - } else { - /* - * Case 3: node's successor is leftmost under - * node's right child subtree - * - * (n) (s) - * / \ / \ - * (x) (y) -> (x) (y) - * / / - * (p) (p) - * / / - * (s) (c) - * \ - * (c) - */ - do { - parent = successor; - successor = tmp; - tmp = tmp->rb_left; - } while (tmp); - parent->rb_left = child2 = successor->rb_right; - successor->rb_right = child; - rb_set_parent(child, successor); - augment->copy(node, successor); - augment->propagate(parent, successor); - } - - successor->rb_left = tmp = node->rb_left; - rb_set_parent(tmp, successor); - - pc = node->__rb_parent_color; - tmp = __rb_parent(pc); - __rb_change_child(node, successor, tmp, root); - if (child2) { - successor->__rb_parent_color = pc; - rb_set_parent_color(child2, parent, RB_BLACK); - rebalance = NULL; - } else { - unsigned long pc2 = successor->__rb_parent_color; - successor->__rb_parent_color = pc; - rebalance = __rb_is_black(pc2) ? parent : NULL; - } - tmp = successor; - } - - augment->propagate(tmp, NULL); - if (rebalance) - __rb_erase_color(rebalance, root, augment); -} +EXPORT_SYMBOL(__rb_erase_color); /* * Non-augmented rbtree manipulation functions. @@ -513,7 +380,7 @@ EXPORT_SYMBOL(rb_insert_color); void rb_erase(struct rb_node *node, struct rb_root *root) { - __rb_erase(node, root, &dummy_callbacks); + rb_erase_augmented(node, root, &dummy_callbacks); } EXPORT_SYMBOL(rb_erase); @@ -531,13 +398,6 @@ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, } EXPORT_SYMBOL(__rb_insert_augmented); -void rb_erase_augmented(struct rb_node *node, struct rb_root *root, - const struct rb_augment_callbacks *augment) -{ - __rb_erase(node, root, augment); -} -EXPORT_SYMBOL(rb_erase_augmented); - /* * This function returns the first node (in sort order) of the tree. */ diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index b20e99969b0f..268b23951fec 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -1,5 +1,5 @@ #include -#include +#include #include #include From 9826a516ff77c5820e591211e4f3e58ff36f46be Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:35 -0700 Subject: [PATCH 084/160] mm: interval tree updates Update the generic interval tree code that was introduced in "mm: replace vma prio_tree with an interval tree". 
Changes: - fixed 'endpoing' typo noticed by Andrew Morton - replaced include/linux/interval_tree_tmpl.h, which was used as a template (including it automatically defined the interval tree functions) with include/linux/interval_tree_generic.h, which only defines a preprocessor macro INTERVAL_TREE_DEFINE(), which itself defines the interval tree functions when invoked. Now that is a very long macro which is unfortunate, but it does make the usage sites (lib/interval_tree.c and mm/interval_tree.c) a bit nicer than previously. - make use of RB_DECLARE_CALLBACKS() in the INTERVAL_TREE_DEFINE() macro, instead of duplicating that code in the interval tree template. - replaced vma_interval_tree_add(), which was actually handling the nonlinear and interval tree cases, with vma_interval_tree_insert_after() which handles only the interval tree case and has an API that is more consistent with the other interval tree handling functions. The nonlinear case is now handled explicitly in kernel/fork.c dup_mmap(). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/interval_tree_generic.h | 191 ++++++++++++++++++++++ include/linux/interval_tree_tmpl.h | 219 -------------------------- include/linux/mm.h | 6 +- kernel/fork.c | 7 +- lib/interval_tree.c | 15 +- mm/interval_tree.c | 56 ++++--- 6 files changed, 233 insertions(+), 261 deletions(-) create mode 100644 include/linux/interval_tree_generic.h delete mode 100644 include/linux/interval_tree_tmpl.h diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h new file mode 100644 index 000000000000..58370e1862ad --- /dev/null +++ b/include/linux/interval_tree_generic.h @@ -0,0 +1,191 @@ +/* + Interval Trees + (C) 2012 Michel Lespinasse + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + include/linux/interval_tree_generic.h +*/ + +#include + +/* + * Template for implementing interval trees + * + * ITSTRUCT: struct type of the interval tree nodes + * ITRB: name of struct rb_node field within ITSTRUCT + * ITTYPE: type of the interval endpoints + * ITSUBTREE: name of ITTYPE field within ITSTRUCT holding last-in-subtree + * ITSTART(n): start endpoint of ITSTRUCT node n + * ITLAST(n): last endpoint of ITSTRUCT node n + * ITSTATIC: 'static' or empty + * ITPREFIX: prefix to use for the inline tree definitions + * + * Note - before using this, please consider if non-generic version + * (interval_tree.h) would work for you... 
+ */ + +#define INTERVAL_TREE_DEFINE(ITSTRUCT, ITRB, ITTYPE, ITSUBTREE, \ + ITSTART, ITLAST, ITSTATIC, ITPREFIX) \ + \ +/* Callbacks for augmented rbtree insert and remove */ \ + \ +static inline ITTYPE ITPREFIX ## _compute_subtree_last(ITSTRUCT *node) \ +{ \ + ITTYPE max = ITLAST(node), subtree_last; \ + if (node->ITRB.rb_left) { \ + subtree_last = rb_entry(node->ITRB.rb_left, \ + ITSTRUCT, ITRB)->ITSUBTREE; \ + if (max < subtree_last) \ + max = subtree_last; \ + } \ + if (node->ITRB.rb_right) { \ + subtree_last = rb_entry(node->ITRB.rb_right, \ + ITSTRUCT, ITRB)->ITSUBTREE; \ + if (max < subtree_last) \ + max = subtree_last; \ + } \ + return max; \ +} \ + \ +RB_DECLARE_CALLBACKS(static, ITPREFIX ## _augment, ITSTRUCT, ITRB, \ + ITTYPE, ITSUBTREE, ITPREFIX ## _compute_subtree_last) \ + \ +/* Insert / remove interval nodes from the tree */ \ + \ +ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, struct rb_root *root) \ +{ \ + struct rb_node **link = &root->rb_node, *rb_parent = NULL; \ + ITTYPE start = ITSTART(node), last = ITLAST(node); \ + ITSTRUCT *parent; \ + \ + while (*link) { \ + rb_parent = *link; \ + parent = rb_entry(rb_parent, ITSTRUCT, ITRB); \ + if (parent->ITSUBTREE < last) \ + parent->ITSUBTREE = last; \ + if (start < ITSTART(parent)) \ + link = &parent->ITRB.rb_left; \ + else \ + link = &parent->ITRB.rb_right; \ + } \ + \ + node->ITSUBTREE = last; \ + rb_link_node(&node->ITRB, rb_parent, link); \ + rb_insert_augmented(&node->ITRB, root, &ITPREFIX ## _augment); \ +} \ + \ +ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, struct rb_root *root) \ +{ \ + rb_erase_augmented(&node->ITRB, root, &ITPREFIX ## _augment); \ +} \ + \ +/* \ + * Iterate over intervals intersecting [start;last] \ + * \ + * Note that a node's interval intersects [start;last] iff: \ + * Cond1: ITSTART(node) <= last \ + * and \ + * Cond2: start <= ITLAST(node) \ + */ \ + \ +static ITSTRUCT * \ +ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ +{ \ + while (true) { \ + /* \ + * Loop invariant: start <= node->ITSUBTREE \ + * (Cond2 is satisfied by one of the subtree nodes) \ + */ \ + if (node->ITRB.rb_left) { \ + ITSTRUCT *left = rb_entry(node->ITRB.rb_left, \ + ITSTRUCT, ITRB); \ + if (start <= left->ITSUBTREE) { \ + /* \ + * Some nodes in left subtree satisfy Cond2. \ + * Iterate to find the leftmost such node N. \ + * If it also satisfies Cond1, that's the \ + * match we are looking for. Otherwise, there \ + * is no matching interval as nodes to the \ + * right of N can't satisfy Cond1 either. 
\ + */ \ + node = left; \ + continue; \ + } \ + } \ + if (ITSTART(node) <= last) { /* Cond1 */ \ + if (start <= ITLAST(node)) /* Cond2 */ \ + return node; /* node is leftmost match */ \ + if (node->ITRB.rb_right) { \ + node = rb_entry(node->ITRB.rb_right, \ + ITSTRUCT, ITRB); \ + if (start <= node->ITSUBTREE) \ + continue; \ + } \ + } \ + return NULL; /* No match */ \ + } \ +} \ + \ +ITSTATIC ITSTRUCT * \ +ITPREFIX ## _iter_first(struct rb_root *root, ITTYPE start, ITTYPE last) \ +{ \ + ITSTRUCT *node; \ + \ + if (!root->rb_node) \ + return NULL; \ + node = rb_entry(root->rb_node, ITSTRUCT, ITRB); \ + if (node->ITSUBTREE < start) \ + return NULL; \ + return ITPREFIX ## _subtree_search(node, start, last); \ +} \ + \ +ITSTATIC ITSTRUCT * \ +ITPREFIX ## _iter_next(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ +{ \ + struct rb_node *rb = node->ITRB.rb_right, *prev; \ + \ + while (true) { \ + /* \ + * Loop invariants: \ + * Cond1: ITSTART(node) <= last \ + * rb == node->ITRB.rb_right \ + * \ + * First, search right subtree if suitable \ + */ \ + if (rb) { \ + ITSTRUCT *right = rb_entry(rb, ITSTRUCT, ITRB); \ + if (start <= right->ITSUBTREE) \ + return ITPREFIX ## _subtree_search(right, \ + start, last); \ + } \ + \ + /* Move up the tree until we come from a node's left child */ \ + do { \ + rb = rb_parent(&node->ITRB); \ + if (!rb) \ + return NULL; \ + prev = &node->ITRB; \ + node = rb_entry(rb, ITSTRUCT, ITRB); \ + rb = node->ITRB.rb_right; \ + } while (prev == rb); \ + \ + /* Check if the node intersects [start;last] */ \ + if (last < ITSTART(node)) /* !Cond1 */ \ + return NULL; \ + else if (start <= ITLAST(node)) /* Cond2 */ \ + return node; \ + } \ +} diff --git a/include/linux/interval_tree_tmpl.h b/include/linux/interval_tree_tmpl.h deleted file mode 100644 index c1aeb922d65f..000000000000 --- a/include/linux/interval_tree_tmpl.h +++ /dev/null @@ -1,219 +0,0 @@ -/* - Interval Trees - (C) 2012 Michel Lespinasse - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - include/linux/interval_tree_tmpl.h -*/ - -#include - -/* - * Template for implementing interval trees - * - * ITSTRUCT: struct type of the interval tree nodes - * ITRB: name of struct rb_node field within ITSTRUCT - * ITTYPE: type of the interval endpoints - * ITSUBTREE: name of ITTYPE field within ITSTRUCT holding last-in-subtree - * ITSTART(n): start endpoint of ITSTRUCT node n - * ITLAST(n): last endpoing of ITSTRUCT node n - * ITSTATIC: 'static' or empty - * ITPREFIX: prefix to use for the inline tree definitions - */ - -/* IT(name) -> ITPREFIX_name */ -#define _ITNAME(prefix, name) prefix ## _ ## name -#define ITNAME(prefix, name) _ITNAME(prefix, name) -#define IT(name) ITNAME(ITPREFIX, name) - -/* Callbacks for augmented rbtree insert and remove */ - -static inline ITTYPE IT(compute_subtree_last)(ITSTRUCT *node) -{ - ITTYPE max = ITLAST(node), subtree_last; - if (node->ITRB.rb_left) { - subtree_last = rb_entry(node->ITRB.rb_left, - ITSTRUCT, ITRB)->ITSUBTREE; - if (max < subtree_last) - max = subtree_last; - } - if (node->ITRB.rb_right) { - subtree_last = rb_entry(node->ITRB.rb_right, - ITSTRUCT, ITRB)->ITSUBTREE; - if (max < subtree_last) - max = subtree_last; - } - return max; -} - -static inline void -IT(augment_propagate)(struct rb_node *rb, struct rb_node *stop) -{ - while (rb != stop) { - ITSTRUCT *node = rb_entry(rb, ITSTRUCT, ITRB); - ITTYPE subtree_last = IT(compute_subtree_last)(node); - if (node->ITSUBTREE == subtree_last) - break; - node->ITSUBTREE = subtree_last; - rb = rb_parent(&node->ITRB); - } -} - -static inline void -IT(augment_copy)(struct rb_node *rb_old, struct rb_node *rb_new) -{ - ITSTRUCT *old = rb_entry(rb_old, ITSTRUCT, ITRB); - ITSTRUCT *new = rb_entry(rb_new, ITSTRUCT, ITRB); - - new->ITSUBTREE = old->ITSUBTREE; -} - -static void IT(augment_rotate)(struct rb_node *rb_old, struct rb_node *rb_new) -{ - ITSTRUCT *old = rb_entry(rb_old, ITSTRUCT, ITRB); - ITSTRUCT *new = rb_entry(rb_new, ITSTRUCT, ITRB); - - new->ITSUBTREE = old->ITSUBTREE; - old->ITSUBTREE = IT(compute_subtree_last)(old); -} - -static const struct rb_augment_callbacks IT(augment_callbacks) = { - IT(augment_propagate), IT(augment_copy), IT(augment_rotate) -}; - -/* Insert / remove interval nodes from the tree */ - -ITSTATIC void IT(insert)(ITSTRUCT *node, struct rb_root *root) -{ - struct rb_node **link = &root->rb_node, *rb_parent = NULL; - ITTYPE start = ITSTART(node), last = ITLAST(node); - ITSTRUCT *parent; - - while (*link) { - rb_parent = *link; - parent = rb_entry(rb_parent, ITSTRUCT, ITRB); - if (parent->ITSUBTREE < last) - parent->ITSUBTREE = last; - if (start < ITSTART(parent)) - link = &parent->ITRB.rb_left; - else - link = &parent->ITRB.rb_right; - } - - node->ITSUBTREE = last; - rb_link_node(&node->ITRB, rb_parent, link); - rb_insert_augmented(&node->ITRB, root, &IT(augment_callbacks)); -} - -ITSTATIC void IT(remove)(ITSTRUCT *node, struct rb_root *root) -{ - rb_erase_augmented(&node->ITRB, root, &IT(augment_callbacks)); -} - -/* - * Iterate over intervals intersecting [start;last] - * - * Note that a node's interval intersects [start;last] iff: - * Cond1: ITSTART(node) <= last - * and - * Cond2: start <= ITLAST(node) - */ - -static ITSTRUCT *IT(subtree_search)(ITSTRUCT *node, ITTYPE start, ITTYPE last) -{ - while (true) { - /* - * Loop invariant: start <= node->ITSUBTREE - 
* (Cond2 is satisfied by one of the subtree nodes) - */ - if (node->ITRB.rb_left) { - ITSTRUCT *left = rb_entry(node->ITRB.rb_left, - ITSTRUCT, ITRB); - if (start <= left->ITSUBTREE) { - /* - * Some nodes in left subtree satisfy Cond2. - * Iterate to find the leftmost such node N. - * If it also satisfies Cond1, that's the match - * we are looking for. Otherwise, there is no - * matching interval as nodes to the right of N - * can't satisfy Cond1 either. - */ - node = left; - continue; - } - } - if (ITSTART(node) <= last) { /* Cond1 */ - if (start <= ITLAST(node)) /* Cond2 */ - return node; /* node is leftmost match */ - if (node->ITRB.rb_right) { - node = rb_entry(node->ITRB.rb_right, - ITSTRUCT, ITRB); - if (start <= node->ITSUBTREE) - continue; - } - } - return NULL; /* No match */ - } -} - -ITSTATIC ITSTRUCT *IT(iter_first)(struct rb_root *root, - ITTYPE start, ITTYPE last) -{ - ITSTRUCT *node; - - if (!root->rb_node) - return NULL; - node = rb_entry(root->rb_node, ITSTRUCT, ITRB); - if (node->ITSUBTREE < start) - return NULL; - return IT(subtree_search)(node, start, last); -} - -ITSTATIC ITSTRUCT *IT(iter_next)(ITSTRUCT *node, ITTYPE start, ITTYPE last) -{ - struct rb_node *rb = node->ITRB.rb_right, *prev; - - while (true) { - /* - * Loop invariants: - * Cond1: ITSTART(node) <= last - * rb == node->ITRB.rb_right - * - * First, search right subtree if suitable - */ - if (rb) { - ITSTRUCT *right = rb_entry(rb, ITSTRUCT, ITRB); - if (start <= right->ITSUBTREE) - return IT(subtree_search)(right, start, last); - } - - /* Move up the tree until we come from a node's left child */ - do { - rb = rb_parent(&node->ITRB); - if (!rb) - return NULL; - prev = &node->ITRB; - node = rb_entry(rb, ITSTRUCT, ITRB); - rb = node->ITRB.rb_right; - } while (prev == rb); - - /* Check if the node intersects [start;last] */ - if (last < ITSTART(node)) /* !Cond1 */ - return NULL; - else if (start <= ITLAST(node)) /* Cond2 */ - return node; - } -} diff --git a/include/linux/mm.h b/include/linux/mm.h index 0f671ef09eba..f1d9aaadb566 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1355,11 +1355,11 @@ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); /* interval_tree.c */ -void vma_interval_tree_add(struct vm_area_struct *vma, - struct vm_area_struct *old, - struct address_space *mapping); void vma_interval_tree_insert(struct vm_area_struct *node, struct rb_root *root); +void vma_interval_tree_insert_after(struct vm_area_struct *node, + struct vm_area_struct *prev, + struct rb_root *root); void vma_interval_tree_remove(struct vm_area_struct *node, struct rb_root *root); struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root, diff --git a/kernel/fork.c b/kernel/fork.c index 90dace52715e..1cd7d581b3b2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -423,7 +423,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_add(tmp, mpnt, mapping); + if (unlikely(tmp->vm_flags & VM_NONLINEAR)) + vma_nonlinear_insert(tmp, + &mapping->i_mmap_nonlinear); + else + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } diff --git a/lib/interval_tree.c b/lib/interval_tree.c index 77a793e0644b..e6eb406f2d65 100644 --- a/lib/interval_tree.c +++ b/lib/interval_tree.c @@ -1,13 +1,10 @@ #include #include 
+#include -#define ITSTRUCT struct interval_tree_node -#define ITRB rb -#define ITTYPE unsigned long -#define ITSUBTREE __subtree_last -#define ITSTART(n) ((n)->start) -#define ITLAST(n) ((n)->last) -#define ITSTATIC -#define ITPREFIX interval_tree +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) -#include +INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, + unsigned long, __subtree_last, + START, LAST,, interval_tree) diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 7dc565660e56..4ab7b9ec3a56 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -8,40 +8,38 @@ #include #include +#include -#define ITSTRUCT struct vm_area_struct -#define ITRB shared.linear.rb -#define ITTYPE unsigned long -#define ITSUBTREE shared.linear.rb_subtree_last -#define ITSTART(n) ((n)->vm_pgoff) -#define ITLAST(n) ((n)->vm_pgoff + \ - (((n)->vm_end - (n)->vm_start) >> PAGE_SHIFT) - 1) -#define ITSTATIC -#define ITPREFIX vma_interval_tree +static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff; +} -#include +static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; +} -/* Insert old immediately after vma in the interval tree */ -void vma_interval_tree_add(struct vm_area_struct *vma, - struct vm_area_struct *old, - struct address_space *mapping) +INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, + unsigned long, shared.linear.rb_subtree_last, + vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) + +/* Insert node immediately after prev in the interval tree */ +void vma_interval_tree_insert_after(struct vm_area_struct *node, + struct vm_area_struct *prev, + struct rb_root *root) { struct rb_node **link; struct vm_area_struct *parent; - unsigned long last; + unsigned long last = vma_last_pgoff(node); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) { - list_add(&vma->shared.nonlinear, &old->shared.nonlinear); - return; - } + VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); - last = ITLAST(vma); - - if (!old->shared.linear.rb.rb_right) { - parent = old; - link = &old->shared.linear.rb.rb_right; + if (!prev->shared.linear.rb.rb_right) { + parent = prev; + link = &prev->shared.linear.rb.rb_right; } else { - parent = rb_entry(old->shared.linear.rb.rb_right, + parent = rb_entry(prev->shared.linear.rb.rb_right, struct vm_area_struct, shared.linear.rb); if (parent->shared.linear.rb_subtree_last < last) parent->shared.linear.rb_subtree_last = last; @@ -54,8 +52,8 @@ void vma_interval_tree_add(struct vm_area_struct *vma, link = &parent->shared.linear.rb.rb_left; } - vma->shared.linear.rb_subtree_last = last; - rb_link_node(&vma->shared.linear.rb, &parent->shared.linear.rb, link); - rb_insert_augmented(&vma->shared.linear.rb, &mapping->i_mmap, - &vma_interval_tree_augment_callbacks); + node->shared.linear.rb_subtree_last = last; + rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); + rb_insert_augmented(&node->shared.linear.rb, root, + &vma_interval_tree_augment); } From 108d6642ad81bb1d62b401490a334d2c12397517 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:36 -0700 Subject: [PATCH 085/160] mm anon rmap: remove anon_vma_moveto_tail mremap() had a clever optimization where move_ptes() did not take the anon_vma lock to avoid a race with anon rmap users such as page migration. Instead, the avc's were ordered in such a way that the origin vma was always visited by rmap before the destination. 
This ordering and the use of page table locks made rmap usage safe. However, we want to replace the use of linked lists in anon rmap with an interval tree, and this will make it harder to impose such ordering as the interval tree will always be sorted by the avc->vma->vm_pgoff value. For now, let's replace the anon_vma_moveto_tail() ordering function with proper anon_vma locking in move_ptes(). Once we have the anon interval tree in place, we will re-introduce an optimization to avoid taking these locks in the most common cases. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 - mm/mmap.c | 3 +-- mm/mremap.c | 14 +++++--------- mm/rmap.c | 45 -------------------------------------------- 4 files changed, 6 insertions(+), 57 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3fce545df394..7f32cec57e67 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -120,7 +120,6 @@ void anon_vma_init(void); /* create anon_vma_cachep */ int anon_vma_prepare(struct vm_area_struct *); void unlink_anon_vmas(struct vm_area_struct *); int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); -void anon_vma_moveto_tail(struct vm_area_struct *); int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); static inline void anon_vma_merge(struct vm_area_struct *vma, diff --git a/mm/mmap.c b/mm/mmap.c index 5ac533f88e99..66984aab7915 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2378,8 +2378,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, */ VM_BUG_ON(faulted_in_anon_vma); *vmap = new_vma; - } else - anon_vma_moveto_tail(new_vma); + } } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { diff --git a/mm/mremap.c b/mm/mremap.c index cc06d0e48d05..5588bb6e9295 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -74,6 +74,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long new_addr) { struct address_space *mapping = NULL; + struct anon_vma *anon_vma = vma->anon_vma; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; @@ -88,6 +89,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, mapping = vma->vm_file->f_mapping; mutex_lock(&mapping->i_mmap_mutex); } + if (anon_vma) + anon_vma_lock(anon_vma); /* * We don't have to worry about the ordering of src and dst @@ -114,6 +117,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, spin_unlock(new_ptl); pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); + if (anon_vma) + anon_vma_unlock(anon_vma); if (mapping) mutex_unlock(&mapping->i_mmap_mutex); } @@ -220,15 +225,6 @@ static unsigned long move_vma(struct vm_area_struct *vma, moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); if (moved_len < old_len) { - /* - * Before moving the page tables from the new vma to - * the old vma, we need to be sure the old vma is - * queued after new vma in the same_anon_vma list to - * prevent SMP races with rmap_walk (that could lead - * rmap_walk to miss some page table).
- */ - anon_vma_moveto_tail(vma); - /* * On error, move entries back from new area to old, * which will succeed since page tables still there, diff --git a/mm/rmap.c b/mm/rmap.c index 7b5b51d25fc5..8cbd62fde0f1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -268,51 +268,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) return -ENOMEM; } -/* - * Some rmap walk that needs to find all ptes/hugepmds without false - * negatives (like migrate and split_huge_page) running concurrent - * with operations that copy or move pagetables (like mremap() and - * fork()) to be safe. They depend on the anon_vma "same_anon_vma" - * list to be in a certain order: the dst_vma must be placed after the - * src_vma in the list. This is always guaranteed by fork() but - * mremap() needs to call this function to enforce it in case the - * dst_vma isn't newly allocated and chained with the anon_vma_clone() - * function but just an extension of a pre-existing vma through - * vma_merge. - * - * NOTE: the same_anon_vma list can still be changed by other - * processes while mremap runs because mremap doesn't hold the - * anon_vma mutex to prevent modifications to the list while it - * runs. All we need to enforce is that the relative order of this - * process vmas isn't changing (we don't care about other vmas - * order). Each vma corresponds to an anon_vma_chain structure so - * there's no risk that other processes calling anon_vma_moveto_tail() - * and changing the same_anon_vma list under mremap() will screw with - * the relative order of this process vmas in the list, because we - * they can't alter the order of any vma that belongs to this - * process. And there can't be another anon_vma_moveto_tail() running - * concurrently with mremap() coming from this process because we hold - * the mmap_sem for the whole mremap(). fork() ordering dependency - * also shouldn't be affected because fork() only cares that the - * parent vmas are placed in the list before the child vmas and - * anon_vma_moveto_tail() won't reorder vmas from either the fork() - * parent or child. - */ -void anon_vma_moveto_tail(struct vm_area_struct *dst) -{ - struct anon_vma_chain *pavc; - struct anon_vma *root = NULL; - - list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { - struct anon_vma *anon_vma = pavc->anon_vma; - VM_BUG_ON(pavc->vma != dst); - root = lock_anon_vma_root(root, anon_vma); - list_del(&pavc->same_anon_vma); - list_add_tail(&pavc->same_anon_vma, &anon_vma->head); - } - unlock_anon_vma_root(root); -} - /* * Attach vma to its own anon_vma, as well as to the anon_vmas that * the corresponding VMA in the parent process is attached to. From bf181b9f9d8dfbba58b23441ad60d0bc33806d64 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:39 -0700 Subject: [PATCH 086/160] mm anon rmap: replace same_anon_vma linked list with an interval tree. When a large VMA (anon or private file mapping) is first touched, which will populate its anon_vma field, and then split into many regions through the use of mprotect(), the original anon_vma ends up linking all of the vmas on a linked list. This can cause rmap to become inefficient, as we have to walk potentially thousands of irrelevent vmas before finding the one a given anon page might fall into. By replacing the same_anon_vma linked list with an interval tree (where each avc's interval is determined by its vma's start and last pgoffs), we can make rmap efficient for this use case again. 
While the change is large, all of its pieces are fairly simple. Most places that were walking the same_anon_vma list were looking for a known pgoff, so they can just use the anon_vma_interval_tree_foreach() interval tree iterator instead. The exception here is ksm, where the page's index is not known. It would probably be possible to rework ksm so that the index would be known, but for now I have decided to keep things simple and just walk the entirety of the interval tree there. When updating vma's that already have an anon_vma assigned, we must take care to re-index the corresponding avc's on their interval tree. This is done through the use of anon_vma_interval_tree_pre_update_vma() and anon_vma_interval_tree_post_update_vma(), which remove the avc's from their interval tree before the update and re-insert them after the update. The anon_vma stays locked during the update, so there is no chance that rmap would miss the vmas that are being updated. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 +++++++++ include/linux/rmap.h | 11 ++++--- mm/huge_memory.c | 5 +-- mm/interval_tree.c | 14 +++++++++ mm/ksm.c | 9 ++++-- mm/memory-failure.c | 5 ++- mm/mmap.c | 73 +++++++++++++++++++++++++++++++++----------- mm/rmap.c | 24 +++++++-------- 8 files changed, 114 insertions(+), 41 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f1d9aaadb566..0cdab4e0f814 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -20,6 +20,7 @@ struct mempolicy; struct anon_vma; +struct anon_vma_chain; struct file_ra_state; struct user_struct; struct writeback_control; @@ -1377,6 +1378,19 @@ static inline void vma_nonlinear_insert(struct vm_area_struct *vma, list_add_tail(&vma->shared.nonlinear, list); } +void anon_vma_interval_tree_insert(struct anon_vma_chain *node, + struct rb_root *root); +void anon_vma_interval_tree_remove(struct anon_vma_chain *node, + struct rb_root *root); +struct anon_vma_chain *anon_vma_interval_tree_iter_first( + struct rb_root *root, unsigned long start, unsigned long last); +struct anon_vma_chain *anon_vma_interval_tree_iter_next( + struct anon_vma_chain *node, unsigned long start, unsigned long last); + +#define anon_vma_interval_tree_foreach(avc, root, start, last) \ + for (avc = anon_vma_interval_tree_iter_first(root, start, last); \ + avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) + /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); extern int vma_adjust(struct vm_area_struct *vma, unsigned long start, diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 7f32cec57e67..dce44f7d3ed8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -37,14 +37,14 @@ struct anon_vma { atomic_t refcount; /* - * NOTE: the LSB of the head.next is set by + * NOTE: the LSB of the rb_root.rb_node is set by * mm_take_all_locks() _after_ taking the above lock. So the - * head must only be read/written after taking the above lock + * rb_root must only be read/written after taking the above lock * to be sure to see a valid next pointer. The LSB bit itself * is serialized by a system wide lock only visible to * mm_take_all_locks() (mm_all_locks_mutex). 
*/ - struct list_head head; /* Chain of private "related" vmas */ + struct rb_root rb_root; /* Interval tree of private "related" vmas */ }; /* @@ -57,14 +57,15 @@ struct anon_vma { * with a VMA, or the VMAs associated with an anon_vma. * The "same_vma" list contains the anon_vma_chains linking * all the anon_vmas associated with this VMA. - * The "same_anon_vma" list contains the anon_vma_chains + * The "rb" field indexes on an interval tree the anon_vma_chains * which link all the VMAs associated with this anon_vma. */ struct anon_vma_chain { struct vm_area_struct *vma; struct anon_vma *anon_vma; struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ - struct list_head same_anon_vma; /* locked by anon_vma->mutex */ + struct rb_node rb; /* locked by anon_vma->mutex */ + unsigned long rb_subtree_last; }; #ifdef CONFIG_MMU diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 010d32944d14..ce59ada09462 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1375,13 +1375,14 @@ static void __split_huge_page(struct page *page, struct anon_vma *anon_vma) { int mapcount, mapcount2; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct anon_vma_chain *avc; BUG_ON(!PageHead(page)); BUG_ON(PageTail(page)); mapcount = 0; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); @@ -1407,7 +1408,7 @@ static void __split_huge_page(struct page *page, __split_huge_page_refcount(page); mapcount2 = 0; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 4ab7b9ec3a56..f7c72cd35e1d 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -8,6 +8,7 @@ #include #include +#include #include static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) @@ -57,3 +58,16 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, rb_insert_augmented(&node->shared.linear.rb, root, &vma_interval_tree_augment); } + +static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc) +{ + return vma_start_pgoff(avc->vma); +} + +static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) +{ + return vma_last_pgoff(avc->vma); +} + +INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, + avc_start_pgoff, avc_last_pgoff,, anon_vma_interval_tree) diff --git a/mm/ksm.c b/mm/ksm.c index 9638620a7530..14ee5cf8a513 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1618,7 +1618,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) @@ -1671,7 +1672,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) @@ -1723,7 +1725,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, 
&anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c38a6257d082..6c5899b9034a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct vm_area_struct *vma; struct task_struct *tsk; struct anon_vma *av; + pgoff_t pgoff; av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ return; + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; if (!task_early_kill(tsk)) continue; - list_for_each_entry(vmac, &av->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &av->rb_root, + pgoff, pgoff) { vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; diff --git a/mm/mmap.c b/mm/mmap.c index 66984aab7915..2e580ed79211 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -353,6 +353,38 @@ void validate_mm(struct mm_struct *mm) #define validate_mm(mm) do { } while (0) #endif +/* + * vma has some anon_vma assigned, and is already inserted on that + * anon_vma's interval trees. + * + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the + * vma must be removed from the anon_vma's interval trees using + * anon_vma_interval_tree_pre_update_vma(). + * + * After the update, the vma will be reinserted using + * anon_vma_interval_tree_post_update_vma(). + * + * The entire update must be protected by exclusive mmap_sem and by + * the root anon_vma's mutex. + */ +static inline void +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); +} + +static inline void +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); +} + static int find_vma_links(struct mm_struct *mm, unsigned long addr, unsigned long end, struct vm_area_struct **pprev, struct rb_node ***rb_link, struct rb_node **rb_parent) @@ -565,20 +597,17 @@ again: remove_next = 1 + (end > next->vm_end); vma_adjust_trans_huge(vma, start, end, adjust_next); - /* - * When changing only vma->vm_end, we don't really need anon_vma - * lock. This is a fairly rare case by itself, but the anon_vma - * lock may be shared between many sibling processes. Skipping - * the lock for brk adjustments makes a difference sometimes. 
- */ - if (vma->anon_vma && (importer || start != vma->vm_start)) { - anon_vma = vma->anon_vma; + anon_vma = vma->anon_vma; + if (!anon_vma && adjust_next) + anon_vma = next->anon_vma; + if (anon_vma) { VM_BUG_ON(adjust_next && next->anon_vma && anon_vma != next->anon_vma); - } else if (adjust_next && next->anon_vma) - anon_vma = next->anon_vma; - if (anon_vma) anon_vma_lock(anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + if (adjust_next) + anon_vma_interval_tree_pre_update_vma(next); + } if (root) { flush_dcache_mmap_lock(mapping); @@ -619,8 +648,12 @@ again: remove_next = 1 + (end > next->vm_end); __insert_vm_struct(mm, insert); } - if (anon_vma) + if (anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + if (adjust_next) + anon_vma_interval_tree_post_update_vma(next); anon_vma_unlock(anon_vma); + } if (mapping) mutex_unlock(&mapping->i_mmap_mutex); @@ -1748,7 +1781,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { + anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; + anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } @@ -1798,8 +1833,10 @@ int expand_downwards(struct vm_area_struct *vma, if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { + anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; + anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } @@ -2515,7 +2552,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { - if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. @@ -2531,7 +2568,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * anon_vma->root->mutex. */ if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->root->head.next)) + &anon_vma->root->rb_root.rb_node)) BUG(); } } @@ -2572,7 +2609,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * A single task can't take more than one mm_take_all_locks() in a row * or it would deadlock. * - * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in + * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in * mapping->flags avoid to take the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * @@ -2619,13 +2656,13 @@ out_unlock: static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { - if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. * * We must however clear the bitflag before unlocking - * the vma so the users using the anon_vma->head will + * the vma so the users using the anon_vma->rb_root will * never see our bitflag. * * No need of atomic instructions here, head.next @@ -2633,7 +2670,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * anon_vma->root->mutex. 
*/ if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->root->head.next)) + &anon_vma->root->rb_root.rb_node)) BUG(); anon_vma_unlock(anon_vma); } diff --git a/mm/rmap.c b/mm/rmap.c index 8cbd62fde0f1..9c61bf387fd1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -127,12 +127,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, avc->vma = vma; avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); - - /* - * It's critical to add new vmas to the tail of the anon_vma, - * see comment in huge_memory.c:__split_huge_page(). - */ - list_add_tail(&avc->same_anon_vma, &anon_vma->head); + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); } /** @@ -336,13 +331,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma) struct anon_vma *anon_vma = avc->anon_vma; root = lock_anon_vma_root(root, anon_vma); - list_del(&avc->same_anon_vma); + anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); /* * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ - if (list_empty(&anon_vma->head)) + if (RB_EMPTY_ROOT(&anon_vma->rb_root)) continue; list_del(&avc->same_vma); @@ -371,7 +366,7 @@ static void anon_vma_ctor(void *data) mutex_init(&anon_vma->mutex); atomic_set(&anon_vma->refcount, 0); - INIT_LIST_HEAD(&anon_vma->head); + anon_vma->rb_root = RB_ROOT; } void __init anon_vma_init(void) @@ -724,6 +719,7 @@ static int page_referenced_anon(struct page *page, { unsigned int mapcount; struct anon_vma *anon_vma; + pgoff_t pgoff; struct anon_vma_chain *avc; int referenced = 0; @@ -732,7 +728,8 @@ static int page_referenced_anon(struct page *page, return referenced; mapcount = page_mapcount(page); - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1445,6 +1442,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma) static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) { struct anon_vma *anon_vma; + pgoff_t pgoff; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1452,7 +1450,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) if (!anon_vma) return ret; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address; @@ -1668,6 +1667,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, struct vm_area_struct *, unsigned long, void *), void *arg) { struct anon_vma *anon_vma; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1681,7 +1681,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (!anon_vma) return ret; anon_vma_lock(anon_vma); - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); if (address == -EFAULT) From 86c2ad19956f84f2191e062fcb979367b6365871 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:42 -0700 Subject: [PATCH 087/160] mm rmap: remove vma_address check for address inside vma In file and anon rmap, we use interval trees to find 
potentially relevant vmas and then call vma_address() to find the virtual address the given page might be found at in these vmas. vma_address() used to include a check that the returned address falls within the limits of the vma, but this check isn't necessary now that we always use interval trees in rmap: the interval tree just doesn't return any vmas which this check would find to be irrelevant. As a result, we can replace the use of -EFAULT error code (which then needed to be checked in every call site) with a VM_BUG_ON(). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ---- mm/rmap.c | 48 +++++++++++++++++++++--------------------------- 2 files changed, 21 insertions(+), 31 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ce59ada09462..7cf8b0ec11ec 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1386,8 +1386,6 @@ static void __split_huge_page(struct page *page, struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); - if (addr == -EFAULT) - continue; mapcount += __split_huge_page_splitting(page, vma, addr); } /* @@ -1412,8 +1410,6 @@ static void __split_huge_page(struct page *page, struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); - if (addr == -EFAULT) - continue; mapcount2 += __split_huge_page_map(page, vma, addr); } if (mapcount != mapcount2) diff --git a/mm/rmap.c b/mm/rmap.c index 9c61bf387fd1..28777412de62 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -510,22 +510,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) /* * At what user virtual address is page expected in @vma? - * Returns virtual address or -EFAULT if page's index/offset is not - * within the range mapped the @vma. 
*/ -inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) +static inline unsigned long +__vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - unsigned long address; if (unlikely(is_vm_hugetlb_page(vma))) pgoff = page->index << huge_page_order(page_hstate(page)); - address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { - /* page should be within @vma mapping range */ - return -EFAULT; - } + + return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); +} + +inline unsigned long +vma_address(struct page *page, struct vm_area_struct *vma) +{ + unsigned long address = __vma_address(page, vma); + + /* page should be within @vma mapping range */ + VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + return address; } @@ -535,6 +539,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { + unsigned long address; if (PageAnon(page)) { struct anon_vma *page__anon_vma = page_anon_vma(page); /* @@ -550,7 +555,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return -EFAULT; } else return -EFAULT; - return vma_address(page, vma); + address = __vma_address(page, vma); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + return -EFAULT; + return address; } /* @@ -624,8 +632,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) pte_t *pte; spinlock_t *ptl; - address = vma_address(page, vma); - if (address == -EFAULT) /* out of vma range */ + address = __vma_address(page, vma); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) return 0; pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); if (!pte) /* the page is not in this mm */ @@ -732,8 +740,6 @@ static int page_referenced_anon(struct page *page, anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -799,8 +805,6 @@ static int page_referenced_file(struct page *page, vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -904,8 +908,6 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret += page_mkclean_one(page, vma, address); } } @@ -1468,8 +1470,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) continue; address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = try_to_unmap_one(page, vma, address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) break; @@ -1508,8 +1508,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = try_to_unmap_one(page, vma, 
address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) goto out; @@ -1684,8 +1682,6 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = rmap_one(page, vma, address, arg); if (ret != SWAP_AGAIN) break; @@ -1707,8 +1703,6 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = rmap_one(page, vma, address, arg); if (ret != SWAP_AGAIN) break; From ed8ea8150182f8d715fceb3b175ef0a9ebacd872 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:45 -0700 Subject: [PATCH 088/160] mm: add CONFIG_DEBUG_VM_RB build option Add a CONFIG_DEBUG_VM_RB build option for the previously existing DEBUG_MM_RB code. Now that Andi Kleen modified it to avoid using recursive algorithms, we can expose it a bit more. Also extend this code to validate_mm() after stack expansion, and to check that the vma's start and last pgoffs have not changed since the nodes were inserted on the anon vma interval tree (as it is important that the nodes be reindexed after each such update). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ include/linux/rmap.h | 3 +++ lib/Kconfig.debug | 9 +++++++++ mm/interval_tree.c | 41 ++++++++++++++++++++++++++++++++++++++++- mm/mmap.c | 19 +++++++++---------- 5 files changed, 64 insertions(+), 11 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0cdab4e0f814..0e6f9c9f2123 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1386,6 +1386,9 @@ struct anon_vma_chain *anon_vma_interval_tree_iter_first( struct rb_root *root, unsigned long start, unsigned long last); struct anon_vma_chain *anon_vma_interval_tree_iter_next( struct anon_vma_chain *node, unsigned long start, unsigned long last); +#ifdef CONFIG_DEBUG_VM_RB +void anon_vma_interval_tree_verify(struct anon_vma_chain *node); +#endif #define anon_vma_interval_tree_foreach(avc, root, start, last) \ for (avc = anon_vma_interval_tree_iter_first(root, start, last); \ diff --git a/include/linux/rmap.h b/include/linux/rmap.h index dce44f7d3ed8..b2cce644ffc7 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -66,6 +66,9 @@ struct anon_vma_chain { struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ struct rb_node rb; /* locked by anon_vma->mutex */ unsigned long rb_subtree_last; +#ifdef CONFIG_DEBUG_VM_RB + unsigned long cached_vma_start, cached_vma_last; +#endif }; #ifdef CONFIG_MMU diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a6e7e7741523..28e9d6c98941 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -798,6 +798,15 @@ config DEBUG_VM If unsure, say N. +config DEBUG_VM_RB + bool "Debug VM red-black trees" + depends on DEBUG_VM + help + Enable this to turn on more extended checks in the virtual-memory + system that may impact performance. + + If unsure, say N. 
+ config DEBUG_VIRTUAL bool "Debug VM translations" depends on DEBUG_KERNEL && X86 diff --git a/mm/interval_tree.c b/mm/interval_tree.c index f7c72cd35e1d..4a5822a586e6 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -70,4 +70,43 @@ static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) } INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, - avc_start_pgoff, avc_last_pgoff,, anon_vma_interval_tree) + avc_start_pgoff, avc_last_pgoff, + static inline, __anon_vma_interval_tree) + +void anon_vma_interval_tree_insert(struct anon_vma_chain *node, + struct rb_root *root) +{ +#ifdef CONFIG_DEBUG_VM_RB + node->cached_vma_start = avc_start_pgoff(node); + node->cached_vma_last = avc_last_pgoff(node); +#endif + __anon_vma_interval_tree_insert(node, root); +} + +void anon_vma_interval_tree_remove(struct anon_vma_chain *node, + struct rb_root *root) +{ + __anon_vma_interval_tree_remove(node, root); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_first(struct rb_root *root, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_first(root, first, last); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_next(struct anon_vma_chain *node, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_next(node, first, last); +} + +#ifdef CONFIG_DEBUG_VM_RB +void anon_vma_interval_tree_verify(struct anon_vma_chain *node) +{ + WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node)); + WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node)); +} +#endif diff --git a/mm/mmap.c b/mm/mmap.c index 2e580ed79211..deb422c39e21 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); -/* - * WARNING: the debugging will use recursive algorithms so never enable this - * unless you know what you are doing. - */ -#undef DEBUG_MM_RB - /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: @@ -303,7 +297,7 @@ out: return retval; } -#ifdef DEBUG_MM_RB +#ifdef CONFIG_DEBUG_VM_RB static int browse_rb(struct rb_root *root) { int i = 0, j; @@ -337,9 +331,12 @@ void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; - struct vm_area_struct *tmp = mm->mmap; - while (tmp) { - tmp = tmp->vm_next; + struct vm_area_struct *vma = mm->mmap; + while (vma) { + struct anon_vma_chain *avc; + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + vma = vma->vm_next; i++; } if (i != mm->map_count) @@ -1790,6 +1787,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma); + validate_mm(vma->vm_mm); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1843,6 +1841,7 @@ int expand_downwards(struct vm_area_struct *vma, } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma); + validate_mm(vma->vm_mm); return error; } From 523d4e2008fd4a68b1a164e63e8c75b7b20f07e0 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:48 -0700 Subject: [PATCH 089/160] mm anon rmap: in mremap, set the new vma's position before anon_vma_clone() anon_vma_clone() expects new_vma->vm_{start,end,pgoff} to be correctly set so that the new vma can be indexed on the anon interval tree. 
copy_vma() was failing to do that, which broke mremap(). Signed-off-by: Michel Lespinasse Cc: Jiri Slaby Cc: Hugh Dickins Tested-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index deb422c39e21..81248992120d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2419,16 +2419,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { *new_vma = *vma; - pol = mpol_dup(vma_policy(vma)); - if (IS_ERR(pol)) - goto out_free_vma; - INIT_LIST_HEAD(&new_vma->anon_vma_chain); - if (anon_vma_clone(new_vma, vma)) - goto out_free_mempol; - vma_set_policy(new_vma, pol); new_vma->vm_start = addr; new_vma->vm_end = addr + len; new_vma->vm_pgoff = pgoff; + pol = mpol_dup(vma_policy(vma)); + if (IS_ERR(pol)) + goto out_free_vma; + vma_set_policy(new_vma, pol); + INIT_LIST_HEAD(&new_vma->anon_vma_chain); + if (anon_vma_clone(new_vma, vma)) + goto out_free_mempol; if (new_vma->vm_file) get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) From 38a76013ad809beb0b52f60d365c960d035bd83c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:50 -0700 Subject: [PATCH 090/160] mm: avoid taking rmap locks in move_ptes() During mremap(), the destination VMA is generally placed after the original vma in rmap traversal order: in move_vma(), we always have new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >= vma->vm_pgoff unless vma_merge() merged the new vma with an adjacent one. When the destination VMA is placed after the original in rmap traversal order, we can avoid taking the rmap locks in move_ptes(). Essentially, this reintroduces the optimization that had been disabled in "mm anon rmap: remove anon_vma_moveto_tail". The difference is that we don't try to impose the rmap traversal order; instead we just rely on things being in the desired order in the common case and fall back to taking locks in the uncommon case. Also we skip the i_mmap_mutex in addition to the anon_vma lock: in both cases, the vmas are traversed in increasing vm_pgoff order with ties resolved in tree insertion order. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/mm.h | 6 +++-- mm/mmap.c | 7 ++++-- mm/mremap.c | 57 +++++++++++++++++++++++++++++++--------------- 4 files changed, 49 insertions(+), 23 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 19f4fb80cd17..4f2bebc276c5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -603,7 +603,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * process cleanup to remove whatever mess we made. 
*/ if (length != move_page_tables(vma, old_start, - vma, new_start, length)) + vma, new_start, length, false)) return -ENOMEM; lru_add_drain(); diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e6f9c9f2123..0d5f823ce3fc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1060,7 +1060,8 @@ vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len); + unsigned long new_addr, unsigned long len, + bool need_rmap_locks); extern unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr); @@ -1410,7 +1411,8 @@ extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, struct rb_node **, struct rb_node *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, - unsigned long addr, unsigned long len, pgoff_t pgoff); + unsigned long addr, unsigned long len, pgoff_t pgoff, + bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); extern int mm_take_all_locks(struct mm_struct *mm); diff --git a/mm/mmap.c b/mm/mmap.c index 81248992120d..2d942353d681 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2371,7 +2371,8 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) * prior to moving page table entries, to effect an mremap move. */ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, - unsigned long addr, unsigned long len, pgoff_t pgoff) + unsigned long addr, unsigned long len, pgoff_t pgoff, + bool *need_rmap_locks) { struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; @@ -2413,8 +2414,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, * linear if there are no pages mapped yet. */ VM_BUG_ON(faulted_in_anon_vma); - *vmap = new_vma; + *vmap = vma = new_vma; } + *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { @@ -2434,6 +2436,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); + *need_rmap_locks = false; } } return new_vma; diff --git a/mm/mremap.c b/mm/mremap.c index 5588bb6e9295..3b639a4b26bd 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -71,26 +71,42 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long old_addr, unsigned long old_end, struct vm_area_struct *new_vma, pmd_t *new_pmd, - unsigned long new_addr) + unsigned long new_addr, bool need_rmap_locks) { struct address_space *mapping = NULL; - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma *anon_vma = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; - if (vma->vm_file) { - /* - * Subtle point from Rajesh Venkatasubramanian: before - * moving file-based ptes, we must lock truncate_pagecache - * out, since it might clean the dst vma before the src vma, - * and we propagate stale pages into the dst afterward. 
- */ - mapping = vma->vm_file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + /* + * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma + * locks to ensure that rmap will always observe either the old or the + * new ptes. This is the easiest way to avoid races with + * truncate_pagecache(), page migration, etc... + * + * When need_rmap_locks is false, we use other ways to avoid + * such races: + * + * - During exec() shift_arg_pages(), we use a specially tagged vma + * which rmap call sites look for using is_vma_temporary_stack(). + * + * - During mremap(), new_vma is often known to be placed after vma + * in rmap traversal order. This ensures rmap will always observe + * either the old pte, or the new pte, or both (the page table locks + * serialize access to individual ptes, but only rmap traversal + * order guarantees that we won't miss both the old and new ptes). + */ + if (need_rmap_locks) { + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + mutex_lock(&mapping->i_mmap_mutex); + } + if (vma->anon_vma) { + anon_vma = vma->anon_vma; + anon_vma_lock(anon_vma); + } } - if (anon_vma) - anon_vma_lock(anon_vma); /* * We don't have to worry about the ordering of src and dst @@ -127,7 +143,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len) + unsigned long new_addr, unsigned long len, + bool need_rmap_locks) { unsigned long extent, next, old_end; pmd_t *old_pmd, *new_pmd; @@ -174,7 +191,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (extent > LATENCY_LIMIT) extent = LATENCY_LIMIT; move_ptes(vma, old_pmd, old_addr, old_addr + extent, - new_vma, new_pmd, new_addr); + new_vma, new_pmd, new_addr, need_rmap_locks); need_flush = true; } if (likely(need_flush)) @@ -198,6 +215,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long hiwater_vm; int split = 0; int err; + bool need_rmap_locks; /* * We'd prefer to avoid failure later on in do_munmap: @@ -219,18 +237,21 @@ static unsigned long move_vma(struct vm_area_struct *vma, return err; new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); - new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, + &need_rmap_locks); if (!new_vma) return -ENOMEM; - moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); + moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, + need_rmap_locks); if (moved_len < old_len) { /* * On error, move entries back from new area to old, * which will succeed since page tables still there, * and then proceed to unmap new area instead of old. */ - move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); + move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, + true); vma = new_vma; old_len = new_len; old_addr = new_addr; From 1e8537baacd59e96bbe5f8d3d32feafd11f509fe Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Mon, 8 Oct 2012 16:31:51 -0700 Subject: [PATCH 091/160] memory-hotplug: build zonelists when offlining pages online_pages() does build_all_zonelists() and zone_pcp_update(), I think offline_pages() should do it too. When the zone has no memory to allocate, remove it from other nodes' zonelists. zone_batchsize() depends on zone's present pages, if zone's present pages are changed, zone's pcp should be updated. 
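The offline path then gains logic of roughly this shape (mirroring the hunk below):

	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		/* zone is now empty: rebuild zonelists so other nodes stop seeing it */
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	} else
		zone_pcp_update(zone);	/* present pages changed, so recompute the pcp batch size */
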
Signed-off-by: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6a5b90d0cfd7..b35016156c19 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -970,8 +970,13 @@ repeat: init_per_zone_wmark_min(); - if (!populated_zone(zone)) + if (!populated_zone(zone)) { zone_pcp_reset(zone); + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL, NULL); + mutex_unlock(&zonelists_mutex); + } else + zone_pcp_update(zone); if (!node_present_pages(node)) { node_clear_state(node, N_HIGH_MEMORY); From 70400303ce0c4ced3139499c676d5c79636b0c72 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 8 Oct 2012 16:31:52 -0700 Subject: [PATCH 092/160] mm: mmu_notifier: make the mmu_notifier srcu static The variable must be static especially given the variable name. s/RCU/SRCU/ over a few comments. Signed-off-by: Andrea Arcangeli Cc: Xiao Guangrong Cc: Sagi Grimberg Cc: Peter Zijlstra Cc: Haggai Eran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 947df83dccb0..c297142f0fe6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -20,7 +20,7 @@ #include /* global SRCU for all MMs */ -struct srcu_struct srcu; +static struct srcu_struct srcu; /* * This function can't run concurrently against mmu_notifier_register @@ -41,7 +41,7 @@ void __mmu_notifier_release(struct mm_struct *mm) int id; /* - * RCU here will block mmu_notifier_unregister until + * SRCU here will block mmu_notifier_unregister until * ->release returns. */ id = srcu_read_lock(&srcu); @@ -302,7 +302,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) if (!hlist_unhashed(&mn->hlist)) { /* - * RCU here will force exit_mmap to wait ->release to finish + * SRCU here will force exit_mmap to wait ->release to finish * before freeing the pages. */ int id; From 02c6de8d757cb32c0829a45d81c3dfcbcafd998b Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:31:55 -0700 Subject: [PATCH 093/160] mm: cma: discard clean pages during contiguous allocation instead of migration Drop clean cache pages instead of migration during alloc_contig_range() to minimise allocation latency by reducing the amount of migration that is necessary. It's useful for CMA because latency of migration is more important than evicting the background process's working set. In addition, as pages are reclaimed then fewer free pages for migration targets are required so it avoids memory reclaiming to get free pages, which is a contributory factor to increased latency. I measured elapsed time of __alloc_contig_migrate_range() which migrates 10M in 40M movable zone in QEMU machine. 
Before - 146ms, After - 7ms [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Mel Gorman Signed-off-by: Minchan Kim Reviewed-by: Mel Gorman Cc: Marek Szyprowski Acked-by: Michal Nazarewicz Cc: Rik van Riel Tested-by: Kyungmin Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 21 +++++++++++---------- mm/internal.h | 3 ++- mm/page_alloc.c | 2 ++ mm/vmscan.c | 43 +++++++++++++++++++++++++++++++++++++------ 4 files changed, 52 insertions(+), 17 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b2cce644ffc7..bfe1f4780644 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -71,6 +71,17 @@ struct anon_vma_chain { #endif }; +enum ttu_flags { + TTU_UNMAP = 0, /* unmap mode */ + TTU_MIGRATION = 1, /* migration mode */ + TTU_MUNLOCK = 2, /* munlock mode */ + TTU_ACTION_MASK = 0xff, + + TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ + TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ + TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ +}; + #ifdef CONFIG_MMU static inline void get_anon_vma(struct anon_vma *anon_vma) { @@ -164,16 +175,6 @@ int page_referenced(struct page *, int is_locked, int page_referenced_one(struct page *, struct vm_area_struct *, unsigned long address, unsigned int *mapcount, unsigned long *vm_flags); -enum ttu_flags { - TTU_UNMAP = 0, /* unmap mode */ - TTU_MIGRATION = 1, /* migration mode */ - TTU_MUNLOCK = 2, /* munlock mode */ - TTU_ACTION_MASK = 0xff, - - TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ - TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ - TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ -}; #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) int try_to_unmap(struct page *, enum ttu_flags flags); diff --git a/mm/internal.h b/mm/internal.h index bbd7b34db4ea..8312d4fadf59 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -356,5 +356,6 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); - +unsigned long reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *page_list); #endif /* __MM_INTERNAL_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cfd565dbe124..cefd14e6dcf2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5700,6 +5700,8 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) break; } + reclaim_clean_pages_from_list(cc.zone, &cc.migratepages); + ret = migrate_pages(&cc.migratepages, __alloc_contig_migrate_alloc, 0, false, MIGRATE_SYNC); diff --git a/mm/vmscan.c b/mm/vmscan.c index d16bf5a53266..1ee4b69a28a5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page, static unsigned long shrink_page_list(struct list_head *page_list, struct zone *zone, struct scan_control *sc, + enum ttu_flags ttu_flags, unsigned long *ret_nr_dirty, - unsigned long *ret_nr_writeback) + unsigned long *ret_nr_writeback, + bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { - enum page_references references; struct address_space *mapping; struct page *page; int may_enter_fs; + enum page_references references = PAGEREF_RECLAIM_CLEAN; cond_resched(); @@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, wait_on_page_writeback(page); } - references = 
page_check_references(page, sc); + if (!force_reclaim) + references = page_check_references(page, sc); + switch (references) { case PAGEREF_ACTIVATE: goto activate_locked; @@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, TTU_UNMAP)) { + switch (try_to_unmap(page, ttu_flags)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -960,6 +964,33 @@ keep: return nr_reclaimed; } +unsigned long reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *page_list) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_unmap = 1, + }; + unsigned long ret, dummy1, dummy2; + struct page *page, *next; + LIST_HEAD(clean_pages); + + list_for_each_entry_safe(page, next, page_list, lru) { + if (page_is_file_cache(page) && !PageDirty(page)) { + ClearPageActive(page); + list_move(&page->lru, &clean_pages); + } + } + + ret = shrink_page_list(&clean_pages, zone, &sc, + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, true); + list_splice(&clean_pages, page_list); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + return ret; +} + /* * Attempt to remove the specified page from its LRU. Only take this page * if it is of the appropriate PageActive status. Pages which are being @@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, zone, sc, - &nr_dirty, &nr_writeback); + nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, + &nr_dirty, &nr_writeback, false); spin_lock_irq(&zone->lru_lock); From 770c8aaaf6f04a87e6765f24d497132de9152a46 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 8 Oct 2012 16:31:57 -0700 Subject: [PATCH 094/160] mm: fix tracing in free_pcppages_bulk() page->private gets re-used in __free_one_page() to store page order (so trace_mm_page_pcpu_drain() may print order instead of migratetype) thus migratetype value must be cached locally. Fixes regression introduced in commit a7016235a61d ("mm: fix migratetype bug which slowed swapping"). This caused incorrect data to be attached to the mm_page_pcpu_drain trace event. 
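The fix is to read the migratetype into a local variable before handing the page to __free_one_page() (a sketch matching the hunk below):

	int mt = page_private(page);	/* cache it: __free_one_page() reuses page->private for the order */

	__free_one_page(page, zone, 0, mt);
	trace_mm_page_pcpu_drain(page, 0, mt);	/* traces the real migratetype again */
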
[akpm@linux-foundation.org: add comment] Cc: Marek Szyprowski Cc: Michal Nazarewicz Acked-by: Minchan Kim Acked-by: Mel Gorman Cc: Hugh Dickins Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cefd14e6dcf2..3f18a14effb8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -668,12 +668,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free = to_free; do { + int mt; /* migratetype of the to-be-freed page */ + page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); + mt = page_private(page); /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, zone, 0, page_private(page)); - trace_mm_page_pcpu_drain(page, 0, page_private(page)); + __free_one_page(page, zone, 0, mt); + trace_mm_page_pcpu_drain(page, 0, mt); } while (--to_free && --batch_free && !list_empty(list)); } __mod_zone_page_state(zone, NR_FREE_PAGES, count); From 2139cbe627b8910ded55148f87ee10f7485408ed Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 8 Oct 2012 16:32:00 -0700 Subject: [PATCH 095/160] cma: fix counting of isolated pages Isolated free pages shouldn't be accounted to NR_FREE_PAGES counter. Fix it by properly decreasing/increasing NR_FREE_PAGES counter in set_migratetype_isolate()/unset_migratetype_isolate() and removing counter adjustment for isolated pages from free_one_page() and split_free_page(). Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 +++++++-- mm/page_isolation.c | 12 +++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3f18a14effb8..d259cc2b69c1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -691,7 +691,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order, zone->pages_scanned = 0; __free_one_page(page, zone, order, migratetype); - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + if (unlikely(migratetype != MIGRATE_ISOLATE)) + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); spin_unlock(&zone->lock); } @@ -1392,6 +1393,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) unsigned int order; unsigned long watermark; struct zone *zone; + int mt; BUG_ON(!PageBuddy(page)); @@ -1407,7 +1409,10 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) list_del(&page->lru); zone->free_area[order].nr_free--; rmv_page_order(page); - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); + + mt = get_pageblock_migratetype(page); + if (unlikely(mt != MIGRATE_ISOLATE)) + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); if (alloc_order != order) expand(zone, page, alloc_order, order, diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 247d1f175739..3ca1716471bc 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -76,8 +76,12 @@ int set_migratetype_isolate(struct page *page) out: if (!ret) { + unsigned long nr_pages; + set_pageblock_isolate(page); - move_freepages_block(zone, page, MIGRATE_ISOLATE); + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); + + 
__mod_zone_page_state(zone, NR_FREE_PAGES, -nr_pages); } spin_unlock_irqrestore(&zone->lock, flags); @@ -89,12 +93,14 @@ out: void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; - unsigned long flags; + unsigned long flags, nr_pages; + zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; - move_freepages_block(zone, page, migratetype); + nr_pages = move_freepages_block(zone, page, migratetype); + __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); restore_pageblock_isolate(page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); From d1ce749a0db12202b711d1aba1d29e823034648d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 8 Oct 2012 16:32:02 -0700 Subject: [PATCH 096/160] cma: count free CMA pages Add NR_FREE_CMA_PAGES counter to be later used for checking watermark in __zone_watermark_ok(). For simplicity and to avoid #ifdef hell make this counter always available (not only when CONFIG_CMA=y). [akpm@linux-foundation.org: use conventional migratetype naming] Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + include/linux/vmstat.h | 8 ++++++++ mm/page_alloc.c | 26 +++++++++++++++++++------- mm/page_isolation.c | 5 +++-- mm/vmstat.c | 1 + 5 files changed, 32 insertions(+), 9 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2daa54f55db7..85ac67aa5770 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -142,6 +142,7 @@ enum zone_stat_item { NUMA_OTHER, /* allocation from other node */ #endif NR_ANON_TRANSPARENT_HUGEPAGES, + NR_FREE_CMA_PAGES, NR_VM_ZONE_STAT_ITEMS }; /* diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ad2cfd53dadc..a5bb15018b5c 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -253,6 +253,14 @@ static inline void refresh_zone_stat_thresholds(void) { } #endif /* CONFIG_SMP */ +static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, + int migratetype) +{ + __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); + if (is_migrate_cma(migratetype)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); +} + extern const char * const vmstat_text[]; #endif /* _LINUX_VMSTAT_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d259cc2b69c1..6969a8abdba2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -558,7 +558,8 @@ static inline void __free_one_page(struct page *page, if (page_is_guard(buddy)) { clear_page_guard_flag(buddy); set_page_private(page, 0); - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + __mod_zone_freepage_state(zone, 1 << order, + migratetype); } else { list_del(&buddy->lru); zone->free_area[order].nr_free--; @@ -677,6 +678,8 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); + if (is_migrate_cma(mt)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); } while (--to_free && --batch_free && !list_empty(list)); } __mod_zone_page_state(zone, NR_FREE_PAGES, count); @@ -692,7 +695,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order, __free_one_page(page, zone, order, migratetype); if (unlikely(migratetype != MIGRATE_ISOLATE)) - 
__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); } @@ -815,7 +818,8 @@ static inline void expand(struct zone *zone, struct page *page, set_page_guard_flag(&page[size]); set_page_private(&page[size], high); /* Guard pages are not available for any usage */ - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); + __mod_zone_freepage_state(zone, -(1 << high), + migratetype); continue; } #endif @@ -1141,6 +1145,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, } set_page_private(page, mt); list = &page->lru; + if (is_migrate_cma(mt)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, + -(1 << order)); } __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); @@ -1412,7 +1419,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) mt = get_pageblock_migratetype(page); if (unlikely(mt != MIGRATE_ISOLATE)) - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); + __mod_zone_freepage_state(zone, -(1UL << order), mt); if (alloc_order != order) expand(zone, page, alloc_order, order, @@ -1516,7 +1523,8 @@ again: spin_unlock(&zone->lock); if (!page) goto failed; - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); + __mod_zone_freepage_state(zone, -(1 << order), + get_pageblock_migratetype(page)); } __count_zone_vm_events(PGALLOC, zone, 1 << order); @@ -2890,7 +2898,8 @@ void show_free_areas(unsigned int filter) " unevictable:%lu" " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" - " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", + " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" + " free_cma:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_ISOLATED_ANON), @@ -2907,7 +2916,8 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FILE_MAPPED), global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), - global_page_state(NR_BOUNCE)); + global_page_state(NR_BOUNCE), + global_page_state(NR_FREE_CMA_PAGES)); for_each_populated_zone(zone) { int i; @@ -2939,6 +2949,7 @@ void show_free_areas(unsigned int filter) " pagetables:%lukB" " unstable:%lukB" " bounce:%lukB" + " free_cma:%lukB" " writeback_tmp:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -2968,6 +2979,7 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), + K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, (zone->all_unreclaimable ? 
"yes" : "no") diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 3ca1716471bc..345643b85bd4 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -77,11 +77,12 @@ int set_migratetype_isolate(struct page *page) out: if (!ret) { unsigned long nr_pages; + int migratetype = get_pageblock_migratetype(page); set_pageblock_isolate(page); nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); - __mod_zone_page_state(zone, NR_FREE_PAGES, -nr_pages); + __mod_zone_freepage_state(zone, -nr_pages, migratetype); } spin_unlock_irqrestore(&zone->lock, flags); @@ -100,7 +101,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; nr_pages = move_freepages_block(zone, page, migratetype); - __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); + __mod_zone_freepage_state(zone, nr_pages, migratetype); restore_pageblock_isolate(page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); diff --git a/mm/vmstat.c b/mm/vmstat.c index b3e3b9d525d0..acbd85c983e6 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -722,6 +722,7 @@ const char * const vmstat_text[] = { "numa_other", #endif "nr_anon_transparent_hugepages", + "nr_free_cma", "nr_dirty_threshold", "nr_dirty_background_threshold", From d95ea5d18e699515468368415c93ed49b1a3221b Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 8 Oct 2012 16:32:05 -0700 Subject: [PATCH 097/160] cma: fix watermark checking * Add ALLOC_CMA alloc flag and pass it to [__]zone_watermark_ok() (from Minchan Kim). * During watermark check decrease available free pages number by free CMA pages number if necessary (unmovable allocations cannot use pages from CMA areas). Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 8 +++++++- mm/internal.h | 14 ++++++++++++++ mm/page_alloc.c | 31 +++++++++++++++---------------- 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 0fbc6b73a522..1f61bcbd6262 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -934,6 +934,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; + int alloc_flags = 0; /* Check if the GFP flags allow compaction */ if (!order || !may_enter_fs || !may_perform_io) @@ -941,6 +942,10 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, count_vm_event(COMPACTSTALL); +#ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { @@ -951,7 +956,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, + alloc_flags)) break; } diff --git a/mm/internal.h b/mm/internal.h index 8312d4fadf59..96cda4c6ac56 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -358,4 +358,18 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, extern void set_pageblock_order(void); unsigned long reclaim_clean_pages_from_list(struct zone *zone, struct list_head *page_list); +/* The ALLOC_WMARK bits are 
used as an index to zone->watermark */ +#define ALLOC_WMARK_MIN WMARK_MIN +#define ALLOC_WMARK_LOW WMARK_LOW +#define ALLOC_WMARK_HIGH WMARK_HIGH +#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ + +/* Mask to get the watermark bits */ +#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) + +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ +#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ + #endif /* __MM_INTERNAL_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6969a8abdba2..f2c7cc6a3039 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1541,19 +1541,6 @@ failed: return NULL; } -/* The ALLOC_WMARK bits are used as an index to zone->watermark */ -#define ALLOC_WMARK_MIN WMARK_MIN -#define ALLOC_WMARK_LOW WMARK_LOW -#define ALLOC_WMARK_HIGH WMARK_HIGH -#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ - -/* Mask to get the watermark bits */ -#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) - -#define ALLOC_HARDER 0x10 /* try to alloc harder */ -#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ -#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ - #ifdef CONFIG_FAIL_PAGE_ALLOC static struct { @@ -1648,7 +1635,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, min -= min / 2; if (alloc_flags & ALLOC_HARDER) min -= min / 4; - +#ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ + if (!(alloc_flags & ALLOC_CMA)) + free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); +#endif if (free_pages <= min + lowmem_reserve) return false; for (o = 0; o < order; o++) { @@ -2362,7 +2353,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask) unlikely(test_thread_flag(TIF_MEMDIE)))) alloc_flags |= ALLOC_NO_WATERMARKS; } - +#ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif return alloc_flags; } @@ -2587,6 +2581,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; + int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; gfp_mask &= gfp_allowed_mask; @@ -2615,9 +2610,13 @@ retry_cpuset: if (!preferred_zone) goto out; +#ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, - zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, + zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); if (unlikely(!page)) page = __alloc_pages_slowpath(gfp_mask, order, From b12c4ad14ee0232ad47c2bef404b6d42a3578332 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:08 -0700 Subject: [PATCH 098/160] mm: page_alloc: use get_freepage_migratetype() instead of page_private() The page allocator uses set_page_private and page_private for handling migratetype when it frees page. Let's replace them with [set|get] _freepage_migratetype to make it more clear. 
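The renaming is purely about readability, which a minimal userspace sketch makes concrete (struct page, the macros and main() below are simplified stand-ins, not the kernel definitions): the wrapper names state that the value stashed in the private field is a migratetype, and that it is only meaningful while the page sits on the free path or a free_list.

#include <stdio.h>

/* Simplified stand-in for struct page: only the field this patch touches. */
struct page {
	unsigned long private;
};

/* Raw accessors, as used before the rename. */
#define page_private(page)        ((page)->private)
#define set_page_private(page, v) ((page)->private = (v))

/*
 * Named wrappers mirroring the patch: while a page is on the free path or
 * a free_list, the value kept in ->private is its migratetype, so say so.
 */
static inline void set_freepage_migratetype(struct page *page, int migratetype)
{
	set_page_private(page, migratetype);
}

static inline int get_freepage_migratetype(struct page *page)
{
	return page_private(page);
}

int main(void)
{
	struct page p = { 0 };

	set_freepage_migratetype(&p, 2);	/* 2 standing in for MIGRATE_MOVABLE */
	printf("freepage migratetype: %d\n", get_freepage_migratetype(&p));
	return 0;
}
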
Signed-off-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Mel Gorman Cc: Xishi Qiu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ++++++++++++ mm/page_alloc.c | 6 +++--- mm/page_isolation.c | 2 +- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d5f823ce3fc..4ed5c7367b9b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -237,6 +237,18 @@ struct inode; #define page_private(page) ((page)->private) #define set_page_private(page, v) ((page)->private = (v)) +/* It's valid only if the page is free path or free_list */ +static inline void set_freepage_migratetype(struct page *page, int migratetype) +{ + set_page_private(page, migratetype); +} + +/* It's valid only if the page is free path or free_list */ +static inline int get_freepage_migratetype(struct page *page) +{ + return page_private(page); +} + /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f2c7cc6a3039..6aa0a8e89c5d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -674,7 +674,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - mt = page_private(page); + mt = get_freepage_migratetype(page); /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); @@ -1143,7 +1143,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) mt = migratetype; } - set_page_private(page, mt); + set_freepage_migratetype(page, mt); list = &page->lru; if (is_migrate_cma(mt)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, @@ -1313,7 +1313,7 @@ void free_hot_cold_page(struct page *page, int cold) return; migratetype = get_pageblock_migratetype(page); - set_page_private(page, migratetype); + set_freepage_migratetype(page, migratetype); local_irq_save(flags); if (unlikely(wasMlocked)) free_page_mlock(page); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 345643b85bd4..9c03dca8c2ad 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -203,7 +203,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) if (PageBuddy(page)) pfn += 1 << page_order(page); else if (page_count(page) == 0 && - page_private(page) == MIGRATE_ISOLATE) + get_freepage_migratetype(page) == MIGRATE_ISOLATE) pfn += 1; else break; From 95e3441248053fc06bbb1dbbd34409a84211619e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:11 -0700 Subject: [PATCH 099/160] mm: remain migratetype in freed page The page allocator caches the pageblock information in page->private while it is in the PCP freelists but this is overwritten with the order of the page when freed to the buddy allocator. This patch stores the migratetype of the page in the page->index field so that it is available at all times when the page remain in free_list. This patch adds a new call site in __free_pages_ok so it might be overhead a bit but it's for high order allocation. So I believe damage isn't hurt. 
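A rough model of the field juggling, with a two-field stand-in for struct page (the struct, helper names and values below are illustrative, not the kernel's): because the buddy allocator reuses ->private for the page order, the cached migratetype has to live in ->index to survive.

#include <stdio.h>

/* Toy model of the two struct page fields the patch juggles. */
struct page {
	unsigned long private;	/* reused by the buddy allocator for the order */
	unsigned long index;	/* after this patch: caches the migratetype    */
};

static void set_freepage_migratetype(struct page *page, int migratetype)
{
	page->index = migratetype;
}

static int get_freepage_migratetype(struct page *page)
{
	return page->index;
}

/* The buddy allocator records the order in ->private, clobbering it. */
static void buddy_record_order(struct page *page, unsigned int order)
{
	page->private = order;
}

int main(void)
{
	struct page p = { 0, 0 };

	set_freepage_migratetype(&p, 2);	/* 2 standing in for MIGRATE_MOVABLE */
	buddy_record_order(&p, 3);		/* ->private now holds the order     */

	printf("order=%lu migratetype=%d\n",	/* migratetype survives in ->index */
	       p.private, get_freepage_migratetype(&p));
	return 0;
}
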
Signed-off-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Mel Gorman Cc: Xishi Qiu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- mm/page_alloc.c | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 4ed5c7367b9b..b01e585ab4b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -240,13 +240,13 @@ struct inode; /* It's valid only if the page is free path or free_list */ static inline void set_freepage_migratetype(struct page *page, int migratetype) { - set_page_private(page, migratetype); + page->index = migratetype; } /* It's valid only if the page is free path or free_list */ static inline int get_freepage_migratetype(struct page *page) { - return page_private(page); + return page->index; } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6aa0a8e89c5d..94fd283dde98 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -729,6 +729,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int wasMlocked = __TestClearPageMlocked(page); + int migratetype; if (!free_pages_prepare(page, order)) return; @@ -737,8 +738,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) if (unlikely(wasMlocked)) free_page_mlock(page); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, order, - get_pageblock_migratetype(page)); + migratetype = get_pageblock_migratetype(page); + set_freepage_migratetype(page, migratetype); + free_one_page(page_zone(page), page, order, migratetype); local_irq_restore(flags); } @@ -959,6 +961,7 @@ static int move_freepages(struct zone *zone, order = page_order(page); list_move(&page->lru, &zone->free_area[order].free_list[migratetype]); + set_freepage_migratetype(page, migratetype); page += 1 << order; pages_moved += 1 << order; } From 41d575ad4a511b71a4a41c8313004212f5c229b1 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:14 -0700 Subject: [PATCH 100/160] memory-hotplug: bug fix race between isolation and allocation Like below, memory-hotplug makes race between page-isolation and page-allocation so it can hit BUG_ON in __offline_isolated_pages. CPU A CPU B start_isolate_page_range set_migratetype_isolate spin_lock_irqsave(zone->lock) free_hot_cold_page(Page A) /* without zone->lock */ migratetype = get_pageblock_migratetype(Page A); /* * Page could be moved into MIGRATE_MOVABLE * of per_cpu_pages */ list_add_tail(&page->lru, &pcp->lists[migratetype]); set_pageblock_isolate move_freepages_block drain_all_pages /* Page A could be in MIGRATE_MOVABLE of free_list. */ check_pages_isolated __test_page_isolated_in_pageblock /* * We can't catch freed page which * is free_list[MIGRATE_MOVABLE] */ if (PageBuddy(page A)) pfn += 1 << page_order(page A); /* So, Page A could be allocated */ __offline_isolated_pages /* * BUG_ON hit or offline page * which is used by someone */ BUG_ON(!PageBuddy(page A)); This patch checks page's migratetype in freelist in __test_page_isolated_in_pageblock. So now __test_page_isolated_in_pageblock can check the page caused by above race and can fail of memory offlining. 
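The added test reduces to: every buddy page found inside a supposedly isolated range must itself be tagged MIGRATE_ISOLATE, otherwise the race above happened. A self-contained sketch of that walk (struct freepage and the migratetype values are invented for illustration; they are not kernel definitions):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative values only; not the kernel's migratetype numbering. */
#define MIGRATE_MOVABLE 2
#define MIGRATE_ISOLATE 5

/* Toy free page: the cached migratetype plus the buddy order it covers. */
struct freepage {
	int  migratetype;
	int  order;		/* a buddy page spans 1 << order pfns */
	bool is_buddy;
};

/*
 * Sketch of the tightened test: a buddy page that is not MIGRATE_ISOLATE
 * means a freed page slipped onto the wrong free list, so offlining fails.
 */
static bool range_fully_isolated(const struct freepage *pages, int npages)
{
	int pfn = 0;

	while (pfn < npages) {
		const struct freepage *p = &pages[pfn];

		if (p->is_buddy) {
			if (p->migratetype != MIGRATE_ISOLATE)
				return false;	/* raced page on the wrong free list */
			pfn += 1 << p->order;
		} else {
			pfn += 1;
		}
	}
	return true;
}

int main(void)
{
	struct freepage block[8] = {
		[0] = { MIGRATE_ISOLATE, 2, true },	/* pfns 0-3, properly isolated     */
		[4] = { MIGRATE_MOVABLE, 2, true },	/* pfns 4-7, slipped past the race */
	};

	printf("range fully isolated? %s\n",
	       range_fully_isolated(block, 8) ? "yes" : "no");
	return 0;
}
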
Signed-off-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Mel Gorman Cc: Xishi Qiu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 9c03dca8c2ad..6744235d2d0e 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -200,8 +200,11 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) continue; } page = pfn_to_page(pfn); - if (PageBuddy(page)) + if (PageBuddy(page)) { + if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) + break; pfn += 1 << page_order(page); + } else if (page_count(page) == 0 && get_freepage_migratetype(page) == MIGRATE_ISOLATE) pfn += 1; From 435b405c06119d93333738172b8060b0ed12af41 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:16 -0700 Subject: [PATCH 101/160] memory-hotplug: fix pages missed by race rather than failing If race between allocation and isolation in memory-hotplug offline happens, some pages could be in MIGRATE_MOVABLE of free_list although the pageblock's migratetype of the page is MIGRATE_ISOLATE. The race could be detected by get_freepage_migratetype in __test_page_isolated_in_pageblock. If it is detected, now EBUSY gets bubbled all the way up and the hotplug operations fails. But better idea is instead of returning and failing memory-hotremove, move the free page to the correct list at the time it is detected. It could enhance memory-hotremove operation success ratio although the race is really rare. Suggested by Mel Gorman. [akpm@linux-foundation.org: small cleanup] Signed-off-by: Minchan Kim Cc: KAMEZAWA Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Mel Gorman Cc: Xishi Qiu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 4 ++++ mm/page_alloc.c | 2 +- mm/page_isolation.c | 16 ++++++++++++++-- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 105077aa7685..fca8c0a5c188 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -6,6 +6,10 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, int migratetype); +int move_freepages(struct zone *zone, + struct page *start_page, struct page *end_page, + int migratetype); + /* * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. * If specified range includes migrate types other than MOVABLE or CMA, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94fd283dde98..82f0b2f54f81 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -925,7 +925,7 @@ static int fallbacks[MIGRATE_TYPES][4] = { * Note that start_page and end_pages are not aligned on a pageblock * boundary. 
If alignment is required, use move_freepages_block() */ -static int move_freepages(struct zone *zone, +int move_freepages(struct zone *zone, struct page *start_page, struct page *end_page, int migratetype) { diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 6744235d2d0e..5f34a9053ce0 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -201,8 +201,20 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) } page = pfn_to_page(pfn); if (PageBuddy(page)) { - if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) - break; + /* + * If race between isolatation and allocation happens, + * some free pages could be in MIGRATE_MOVABLE list + * although pageblock's migratation type of the page + * is MIGRATE_ISOLATE. Catch it and move the page into + * MIGRATE_ISOLATE list. + */ + if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) { + struct page *end_page; + + end_page = page + (1 << page_order(page)) - 1; + move_freepages(page_zone(page), page, end_page, + MIGRATE_ISOLATE); + } pfn += 1 << page_order(page); } else if (page_count(page) == 0 && From e79bee24fd6134f90af4228cfebd010136d67631 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 8 Oct 2012 16:32:18 -0700 Subject: [PATCH 102/160] atomic: implement generic atomic_dec_if_positive() The x86 implementation of atomic_dec_if_positive is quite generic, so make it available to all architectures. This is needed for "swap: add a simple detector for inappropriate swapin readahead". [akpm@linux-foundation.org: do the "#define foo foo" trick in the conventional manner] Signed-off-by: Shaohua Li Cc: Stephen Rothwell Cc: "David S. Miller" Cc: Rik van Riel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Benjamin Herrenschmidt Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/atomic.h | 1 + arch/powerpc/include/asm/atomic.h | 1 + arch/x86/include/asm/atomic.h | 24 ------------------------ include/linux/atomic.h | 25 +++++++++++++++++++++++++ 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/arch/microblaze/include/asm/atomic.h b/arch/microblaze/include/asm/atomic.h index 472d8bf726df..42ac382a09da 100644 --- a/arch/microblaze/include/asm/atomic.h +++ b/arch/microblaze/include/asm/atomic.h @@ -22,5 +22,6 @@ static inline int atomic_dec_if_positive(atomic_t *v) return res; } +#define atomic_dec_if_positive atomic_dec_if_positive #endif /* _ASM_MICROBLAZE_ATOMIC_H */ diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index da29032ae38f..e3b1d41c89be 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -268,6 +268,7 @@ static __inline__ int atomic_dec_if_positive(atomic_t *v) return t; } +#define atomic_dec_if_positive atomic_dec_if_positive #define smp_mb__before_atomic_dec() smp_mb() #define smp_mb__after_atomic_dec() smp_mb() diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 250b8774c158..b6c3b821acf6 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -240,30 +240,6 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) return c; } - -/* - * atomic_dec_if_positive - decrement by 1 if old value positive - * @v: pointer of type atomic_t - * - * The function returns the old value of *v minus 1, even if - * the atomic variable, v, was not decremented. 
- */ -static inline int atomic_dec_if_positive(atomic_t *v) -{ - int c, old, dec; - c = atomic_read(v); - for (;;) { - dec = c - 1; - if (unlikely(dec < 0)) - break; - old = atomic_cmpxchg((v), c, dec); - if (likely(old == c)) - break; - c = old; - } - return dec; -} - /** * atomic_inc_short - increment of a short integer * @v: pointer to type int diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 70cfcb2d63c4..5b08a8540ecf 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -86,6 +86,31 @@ static inline int atomic_dec_unless_positive(atomic_t *p) } #endif +/* + * atomic_dec_if_positive - decrement by 1 if old value positive + * @v: pointer of type atomic_t + * + * The function returns the old value of *v minus 1, even if + * the atomic variable, v, was not decremented. + */ +#ifndef atomic_dec_if_positive +static inline int atomic_dec_if_positive(atomic_t *v) +{ + int c, old, dec; + c = atomic_read(v); + for (;;) { + dec = c - 1; + if (unlikely(dec < 0)) + break; + old = atomic_cmpxchg((v), c, dec); + if (likely(old == c)) + break; + c = old; + } + return dec; +} +#endif + #ifndef CONFIG_ARCH_HAS_ATOMIC_OR static inline void atomic_or(int i, atomic_t *v) { From 45cac65b0fcd287ebb877b141d40ba9bbe8e5da7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 8 Oct 2012 16:32:19 -0700 Subject: [PATCH 103/160] readahead: fault retry breaks mmap file read random detection .fault now can retry. The retry can break state machine of .fault. In filemap_fault, if page is miss, ra->mmap_miss is increased. In the second try, since the page is in page cache now, ra->mmap_miss is decreased. And these are done in one fault, so we can't detect random mmap file access. Add a new flag to indicate .fault is tried once. In the second try, skip ra->mmap_miss decreasing. The filemap_fault state machine is ok with it. I only tested x86, didn't test other archs, but looks the change for other archs is obvious, but who knows :) Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/fault.c | 1 + arch/avr32/mm/fault.c | 1 + arch/cris/mm/fault.c | 1 + arch/hexagon/mm/vm_fault.c | 1 + arch/ia64/mm/fault.c | 1 + arch/m68k/mm/fault.c | 1 + arch/microblaze/mm/fault.c | 1 + arch/mips/mm/fault.c | 1 + arch/openrisc/mm/fault.c | 1 + arch/powerpc/mm/fault.c | 1 + arch/s390/mm/fault.c | 1 + arch/sh/mm/fault.c | 1 + arch/sparc/mm/fault_32.c | 1 + arch/sparc/mm/fault_64.c | 1 + arch/tile/mm/fault.c | 1 + arch/um/kernel/trap.c | 1 + arch/x86/mm/fault.c | 1 + arch/xtensa/mm/fault.c | 1 + include/linux/mm.h | 1 + mm/filemap.c | 4 ++-- 20 files changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index c3bd83450227..5dbf13f954f6 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -336,6 +336,7 @@ retry: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. 
*/ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } } diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index b92e60958617..b2f2d2d66849 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -152,6 +152,7 @@ good_area: tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would have diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 45fd542cf173..73312ab6c696 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -186,6 +186,7 @@ retry: tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 06695cc4fe58..513b74cb397e 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -113,6 +113,7 @@ good_area: current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } } diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 8443daf4f515..6cf0341f978e 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -184,6 +184,7 @@ retry: current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index aeebbb7b30f0..a563727806bf 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -170,6 +170,7 @@ good_area: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index eb365d6795fa..714b35a9c4f7 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -233,6 +233,7 @@ good_area: current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index c14f6dfed995..9f513486af10 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -171,6 +171,7 @@ good_area: } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 40f850e9766c..e2bfafce66c5 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -183,6 +183,7 @@ good_area: tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 5495ebe983a2..0a6b28336eb0 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -451,6 +451,7 @@ good_area: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. 
*/ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index ac9122ca1152..04ad4001a289 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -367,6 +367,7 @@ retry: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; down_read(&mm->mmap_sem); goto retry; } diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 3bdc1ad9a341..cbbdcad8fcb3 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -504,6 +504,7 @@ good_area: } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 77ac917be152..e98bfda205a2 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -265,6 +265,7 @@ good_area: } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 1fe0429b6314..413d29263304 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -452,6 +452,7 @@ good_area: } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 84ce7abbf5af..fe811fa5f1b9 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -454,6 +454,7 @@ good_area: tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 0353b98ae35a..0f00e9c82080 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -89,6 +89,7 @@ good_area: current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a530b230e7d7..8e13ecb41bee 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1220,6 +1220,7 @@ good_area: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. 
*/ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 5a74c53bc69c..2c2f710ed1dc 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -126,6 +126,7 @@ good_area: current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/include/linux/mm.h b/include/linux/mm.h index b01e585ab4b5..bcaab4e6fe91 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -161,6 +161,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ +#define FAULT_FLAG_TRIED 0x40 /* second try */ /* * vm_fault is filled by the the pagefault handler and passed to the vma's diff --git a/mm/filemap.c b/mm/filemap.c index a9827b42556e..83efee76a5c0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Do we have something in the page cache already? */ page = find_get_page(mapping, offset); - if (likely(page)) { + if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { /* * We found the page, so try async readahead before * waiting for the lock. */ do_async_mmap_readahead(vma, ra, file, page, offset); - } else { + } else if (!page) { /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); From e9d24ad30fc5c4c601824fb39712350b053ca812 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Oct 2012 16:32:21 -0700 Subject: [PATCH 104/160] mm/memblock: use existing interface to set nid Use the existing interface function to set the NUMA node ID (NID) for the regions, either memory or reserved region. Signed-off-by: Wanpeng Li Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Gavin Shan Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memblock.c b/mm/memblock.c index 82aa349d2f7a..d809f17ef75b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -756,7 +756,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, return ret; for (i = start_rgn; i < end_rgn; i++) - type->regions[i].nid = nid; + memblock_set_region_node(&type->regions[i], nid); memblock_merge_regions(type); return 0; From f2d52fe51c8c0a18cf5fbe583bad51090d12c146 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Oct 2012 16:32:24 -0700 Subject: [PATCH 105/160] mm/memblock: cleanup early_node_map[] related comments Commit 0ee332c14518 ("memblock: Kill early_node_map[]") removed early_node_map[]. Clean up the comments to comply with that change. 
Signed-off-by: Wanpeng Li Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Gavin Shan Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 3 +-- mm/nobootmem.c | 2 -- mm/page_alloc.c | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 19dc455b4f3d..569d67d4243e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -70,8 +70,7 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, * @p_end: ptr to ulong for end pfn of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL * - * Walks over configured memory ranges. Available after early_node_map is - * populated. + * Walks over configured memory ranges. */ #define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \ for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \ diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 405573010f99..bd82f6b31411 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -162,8 +162,6 @@ unsigned long __init free_all_bootmem(void) * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id * because in some case like Node0 doesn't have RAM installed * low ram will be on Node1 - * Use MAX_NUMNODES will make sure all ranges in early_node_map[] - * will be used instead of only Node0 related */ return free_low_memory_core_early(MAX_NUMNODES); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 82f0b2f54f81..ca002b39b9b4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4931,7 +4931,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) zone_movable_pfn[i] << PAGE_SHIFT); } - /* Print out the early_node_map[] */ + /* Print out the early node map */ printk("Early memory node ranges\n"); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, From e64c5237cf6ff474cb2f3f832f48f2b441dd9979 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 8 Oct 2012 16:32:27 -0700 Subject: [PATCH 106/160] mm: compaction: abort compaction loop if lock is contended or run too long isolate_migratepages_range() might isolate no pages if for example when zone->lru_lock is contended and running asynchronous compaction. In this case, we should abort compaction, otherwise, compact_zone will run a useless loop and make zone->lru_lock is even contended. An additional check is added to ensure that cc.migratepages and cc.freepages get properly drained whan compaction is aborted. 
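The abort path can be sketched in plain C (compact_control here is a cut-down stand-in and the return codes are placeholders, not the kernel's): when isolation reports contention, the caller puts the already-isolated pages back and clears the count instead of looping.

#include <stdbool.h>
#include <stdio.h>

enum isolate_result { ISOLATE_ABORT, ISOLATE_NONE, ISOLATE_SUCCESS };

/* Cut-down stand-in for struct compact_control. */
struct compact_control {
	bool contended;		/* set when a lock was contended          */
	int  nr_migratepages;	/* pages currently isolated for migration */
};

/* Pretend isolation step; reports contention through cc->contended. */
static enum isolate_result isolate_migratepages(struct compact_control *cc,
						bool lock_contended)
{
	if (lock_contended) {
		cc->contended = true;
		return ISOLATE_ABORT;
	}
	cc->nr_migratepages = 32;
	return ISOLATE_SUCCESS;
}

/* Pretend putback: drop whatever was isolated so nothing leaks on abort. */
static void putback_isolated_pages(struct compact_control *cc)
{
	cc->nr_migratepages = 0;
}

static int compact_zone(struct compact_control *cc, bool lock_contended)
{
	switch (isolate_migratepages(cc, lock_contended)) {
	case ISOLATE_ABORT:
		/* Mirror the fix: putback and clear the count before bailing. */
		putback_isolated_pages(cc);
		return 1;			/* placeholder for COMPACT_PARTIAL  */
	case ISOLATE_NONE:
		return 0;			/* placeholder for COMPACT_SKIPPED  */
	case ISOLATE_SUCCESS:
	default:
		cc->nr_migratepages = 0;	/* migration would happen here      */
		return 2;			/* placeholder for COMPACT_COMPLETE */
	}
}

int main(void)
{
	struct compact_control cc = { false, 0 };
	int ret = compact_zone(&cc, true);	/* simulate a contended lock */

	printf("ret=%d contended=%d leftover=%d\n",
	       ret, cc.contended, cc.nr_migratepages);
	return 0;
}
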
[minchan@kernel.org: Putback pages isolated for migration if aborting] [akpm@linux-foundation.org: compact_zone_order requires non-NULL arg contended] [akpm@linux-foundation.org: make compact_zone_order() require non-NULL arg `contended'] [minchan@kernel.org: Putback pages isolated for migration if aborting] Signed-off-by: Andrea Arcangeli Signed-off-by: Shaohua Li Signed-off-by: Mel Gorman Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 17 ++++++++++++----- mm/internal.h | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 1f61bcbd6262..0649cc1b3479 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -70,8 +70,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, /* async aborts if taking too long or contended */ if (!cc->sync) { - if (cc->contended) - *cc->contended = true; + cc->contended = true; return false; } @@ -688,7 +687,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* Perform the isolation */ low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); - if (!low_pfn) + if (!low_pfn || cc->contended) return ISOLATE_ABORT; cc->migrate_pfn = low_pfn; @@ -848,6 +847,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: ret = COMPACT_PARTIAL; + putback_lru_pages(&cc->migratepages); + cc->nr_migratepages = 0; goto out; case ISOLATE_NONE: continue; @@ -896,6 +897,7 @@ static unsigned long compact_zone_order(struct zone *zone, bool sync, bool *contended, struct page **page) { + unsigned long ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, @@ -903,13 +905,18 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .contended = contended, .page = page, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - return compact_zone(zone, &cc); + ret = compact_zone(zone, &cc); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + + *contended = cc.contended; + return ret; } int sysctl_extfrag_threshold = 500; diff --git a/mm/internal.h b/mm/internal.h index 96cda4c6ac56..97664be2ca37 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -130,7 +130,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - bool *contended; /* True if a lock was contended */ + bool contended; /* True if a lock was contended */ struct page **page; /* Page captured of requested size */ }; From 3cc668f4e30fbd97b3c0574d8cac7a83903c9bc7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:30 -0700 Subject: [PATCH 107/160] mm: compaction: move fatal signal check out of compact_checklock_irqsave Commit c67fe3752abe ("mm: compaction: Abort async compaction if locks are contended or taking too long") addressed a lock contention problem in compaction by introducing compact_checklock_irqsave() that effecively aborting async compaction in the event of compaction. To preserve existing behaviour it also moved a fatal_signal_pending() check into compact_checklock_irqsave() but that is very misleading. It "hides" the check within a locking function but has nothing to do with locking as such. It just happens to work in a desirable fashion. 
This patch moves the fatal_signal_pending() check to isolate_migratepages_range() where it belongs. Arguably the same check should also happen when isolating pages for freeing but it's overkill. Signed-off-by: Mel Gorman Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: Shaohua Li Cc: Minchan Kim Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 0649cc1b3479..78075a268399 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -75,8 +75,6 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, } cond_resched(); - if (fatal_signal_pending(current)) - return false; } if (!locked) @@ -363,7 +361,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Check if it is ok to still hold the lock */ locked = compact_checklock_irqsave(&zone->lru_lock, &flags, locked, cc); - if (!locked) + if (!locked || fatal_signal_pending(current)) break; /* From 661c4cb9b829110cb68c18ea05a56be39f75a4d2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:31 -0700 Subject: [PATCH 108/160] mm: compaction: Update try_to_compact_pages()kerneldoc comment Parameters were added without documentation, tut tut. Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index 78075a268399..b16dd3822995 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -926,6 +926,8 @@ int sysctl_extfrag_threshold = 500; * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from * @sync: Whether migration is synchronous or not + * @contended: Return value that is true if compaction was aborted due to lock contention + * @page: Optionally capture a free page of the requested order during compaction * * This is the main entry point for direct page compaction. */ From 2a1402aa044b55c2d30ab0ed9405693ef06fb07c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:33 -0700 Subject: [PATCH 109/160] mm: compaction: acquire the zone->lru_lock as late as possible Richard Davies and Shaohua Li have both reported lock contention problems in compaction on the zone and LRU locks as well as significant amounts of time being spent in compaction. This series aims to reduce lock contention and scanning rates to reduce that CPU usage. Richard reported at https://lkml.org/lkml/2012/9/21/91 that this series made a big different to a problem he reported in August: http://marc.info/?l=kvm&m=134511507015614&w=2 Patch 1 defers acquiring the zone->lru_lock as long as possible. Patch 2 defers acquiring the zone->lock as lock as possible. Patch 3 reverts Rik's "skip-free" patches as the core concept gets reimplemented later and the remaining patches are easier to understand if this is reverted first. Patch 4 adds a pageblock-skip bit to the pageblock flags to cache what pageblocks should be skipped by the migrate and free scanners. This drastically reduces the amount of scanning compaction has to do. Patch 5 reimplements something similar to Rik's idea except it uses the pageblock-skip information to decide where the scanners should restart from and does not need to wrap around. I tested this on 3.6-rc6 + linux-next/akpm. 
Kernels tested were akpm-20120920 3.6-rc6 + linux-next/akpm as of Septeber 20th, 2012 lesslock Patches 1-6 revert Patches 1-7 cachefail Patches 1-8 skipuseless Patches 1-9 Stress high-order allocation tests looked ok. Success rates are more or less the same with the full series applied but there is an expectation that there is less opportunity to race with other allocation requests if there is less scanning. The time to complete the tests did not vary that much and are uninteresting as were the vmstat statistics so I will not present them here. Using ftrace I recorded how much scanning was done by compaction and got this 3.6.0-rc6 3.6.0-rc6 3.6.0-rc6 3.6.0-rc6 3.6.0-rc6 akpm-20120920 lockless revert-v2r2 cachefail skipuseless Total free scanned 360753976 515414028 565479007 17103281 18916589 Total free isolated 2852429 3597369 4048601 670493 727840 Total free efficiency 0.0079% 0.0070% 0.0072% 0.0392% 0.0385% Total migrate scanned 247728664 822729112 1004645830 17946827 14118903 Total migrate isolated 2555324 3245937 3437501 616359 658616 Total migrate efficiency 0.0103% 0.0039% 0.0034% 0.0343% 0.0466% The efficiency is worthless because of the nature of the test and the number of failures. The really interesting point as far as this patch series is concerned is the number of pages scanned. Note that reverting Rik's patches massively increases the number of pages scanned indicating that those patches really did make a difference to CPU usage. However, caching what pageblocks should be skipped has a much higher impact. With patches 1-8 applied, free page and migrate page scanning are both reduced by 95% in comparison to the akpm kernel. If the basic concept of Rik's patches are implemened on top then scanning then the free scanner barely changed but migrate scanning was further reduced. That said, tests on 3.6-rc5 indicated that the last patch had greater impact than what was measured here so it is a bit variable. One way or the other, this series has a large impact on the amount of scanning compaction does when there is a storm of THP allocations. This patch: Compaction's migrate scanner acquires the zone->lru_lock when scanning a range of pages looking for LRU pages to acquire. It does this even if there are no LRU pages in the range. If multiple processes are compacting then this can cause severe locking contention. To make matters worse commit b2eef8c0 ("mm: compaction: minimise the time IRQs are disabled while isolating pages for migration") releases the lru_lock every SWAP_CLUSTER_MAX pages that are scanned. This patch makes two changes to how the migrate scanner acquires the LRU lock. First, it only releases the LRU lock every SWAP_CLUSTER_MAX pages if the lock is contended. This reduces the number of times it unnecessarily disables and re-enables IRQs. The second is that it defers acquiring the LRU lock for as long as possible. If there are no LRU pages or the only LRU pages are transhuge then the LRU lock will not be acquired at all which reduces contention on zone->lru_lock. 
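A compact userspace model of the two changes (the toy lock, the should_release_lock() body and the array-based "pfn range" are stand-ins; the kernel tests need_resched() and spin_is_contended() instead): the lock is taken only once an LRU page is actually found, and the periodic SWAP_CLUSTER_MAX release only happens when the lock is contended.

#include <stdbool.h>
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32

/* Toy lock: an externally driven "someone is waiting" flag. */
struct toy_lock {
	bool held;
	bool contended;
};

static void lock_it(struct toy_lock *l)   { l->held = true;  }
static void unlock_it(struct toy_lock *l) { l->held = false; }

/* Kernel equivalent: need_resched() || spin_is_contended(lock). */
static bool should_release_lock(const struct toy_lock *l)
{
	return l->contended;
}

/*
 * Scan a pfn range: take the lock only once an LRU page is actually found,
 * and drop it at SWAP_CLUSTER_MAX boundaries only when it is contended.
 */
static int scan_range(struct toy_lock *lru_lock, const bool *is_lru, int npfn)
{
	bool locked = false;
	int isolated = 0;

	for (int pfn = 0; pfn < npfn; pfn++) {
		if (locked && !((pfn + 1) % SWAP_CLUSTER_MAX) &&
		    should_release_lock(lru_lock)) {
			unlock_it(lru_lock);
			locked = false;
		}

		if (!is_lru[pfn])
			continue;		/* lockless pre-check, as in the patch */

		if (!locked) {
			lock_it(lru_lock);	/* deferred until really needed */
			locked = true;
		}
		isolated++;			/* real code rechecks PageLRU under the lock */
	}

	if (locked)
		unlock_it(lru_lock);
	return isolated;
}

int main(void)
{
	bool is_lru[128] = { false };
	struct toy_lock lru_lock = { false, false };

	for (int i = 100; i < 128; i++)		/* only the tail of the range is on the LRU */
		is_lru[i] = true;

	printf("isolated %d pages, lock ended %s\n",
	       scan_range(&lru_lock, is_lru, 128),
	       lru_lock.held ? "held" : "released");
	return 0;
}
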
[minchan@kernel.org: augment comment] [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 65 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index b16dd3822995..832c4183dccc 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,11 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } +static inline bool should_release_lock(spinlock_t *lock) +{ + return need_resched() || spin_is_contended(lock); +} + /* * Compaction requires the taking of some coarse locks that are potentially * very heavily contended. Check if the process needs to be scheduled or @@ -62,7 +67,7 @@ static inline bool migrate_async_suitable(int migratetype) static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, bool locked, struct compact_control *cc) { - if (need_resched() || spin_is_contended(lock)) { + if (should_release_lock(lock)) { if (locked) { spin_unlock_irqrestore(lock, *flags); locked = false; @@ -327,7 +332,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, isolate_mode_t mode = 0; struct lruvec *lruvec; unsigned long flags; - bool locked; + bool locked = false; /* * Ensure that there are not too many pages isolated from the LRU @@ -347,23 +352,17 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ cond_resched(); - spin_lock_irqsave(&zone->lru_lock, flags); - locked = true; for (; low_pfn < end_pfn; low_pfn++) { struct page *page; /* give a chance to irqs before checking need_resched() */ - if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - locked = false; + if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { + if (should_release_lock(&zone->lru_lock)) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + locked = false; + } } - /* Check if it is ok to still hold the lock */ - locked = compact_checklock_irqsave(&zone->lru_lock, &flags, - locked, cc); - if (!locked || fatal_signal_pending(current)) - break; - /* * migrate_pfn does not necessarily start aligned to a * pageblock. Ensure that pfn_valid is called when moving @@ -403,20 +402,39 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, pageblock_nr = low_pfn >> pageblock_order; if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { - low_pfn += pageblock_nr_pages; - low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; - last_pageblock_nr = pageblock_nr; - continue; + goto next_pageblock; } + /* Check may be lockless but that's ok as we recheck later */ if (!PageLRU(page)) continue; /* - * PageLRU is set, and lru_lock excludes isolation, - * splitting and collapsing (collapsing has already - * happened if PageLRU is set). + * PageLRU is set. lru_lock normally excludes isolation + * splitting and collapsing (collapsing has already happened + * if PageLRU is set) but the lock is not necessarily taken + * here and it is wasteful to take it just to check transhuge. 
+ * Check TransHuge without lock and skip the whole pageblock if + * it's either a transhuge or hugetlbfs page, as calling + * compound_order() without preventing THP from splitting the + * page underneath us may return surprising results. */ + if (PageTransHuge(page)) { + if (!locked) + goto next_pageblock; + low_pfn += (1 << compound_order(page)) - 1; + continue; + } + + /* Check if it is ok to still hold the lock */ + locked = compact_checklock_irqsave(&zone->lru_lock, &flags, + locked, cc); + if (!locked || fatal_signal_pending(current)) + break; + + /* Recheck PageLRU and PageTransHuge under lock */ + if (!PageLRU(page)) + continue; if (PageTransHuge(page)) { low_pfn += (1 << compound_order(page)) - 1; continue; @@ -444,6 +462,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, ++low_pfn; break; } + + continue; + +next_pageblock: + low_pfn += pageblock_nr_pages; + low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; + last_pageblock_nr = pageblock_nr; } acct_isolated(zone, locked, cc); From f40d1e42bb988d2a26e8e111ea4c4c7bac819b7e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:36 -0700 Subject: [PATCH 110/160] mm: compaction: acquire the zone->lock as late as possible Compaction's free scanner acquires the zone->lock when checking for PageBuddy pages and isolating them. It does this even if there are no PageBuddy pages in the range. This patch defers acquiring the zone lock for as long as possible. In the event there are no free pages in the pageblock then the lock will not be acquired at all which reduces contention on zone->lock. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Acked-by: Rafael Aquini Acked-by: Minchan Kim Tested-by: Peter Ujfalusi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 142 ++++++++++++++++++++++++++---------------------- 1 file changed, 77 insertions(+), 65 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 832c4183dccc..bdf6e13045ea 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -93,6 +93,27 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, return compact_checklock_irqsave(lock, flags, false, cc); } +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + int migratetype = get_pageblock_migratetype(page); + + /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ + if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) + return false; + + /* If the page is a large free page, then allow migration */ + if (PageBuddy(page) && page_order(page) >= pageblock_order) + return true; + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ + if (migrate_async_suitable(migratetype)) + return true; + + /* Otherwise skip the block */ + return false; +} + static void compact_capture_page(struct compact_control *cc) { unsigned long flags; @@ -153,38 +174,56 @@ static void compact_capture_page(struct compact_control *cc) * pages inside of the pageblock (even though it may still end up isolating * some pages). 
*/ -static unsigned long isolate_freepages_block(unsigned long blockpfn, +static unsigned long isolate_freepages_block(struct compact_control *cc, + unsigned long blockpfn, unsigned long end_pfn, struct list_head *freelist, bool strict) { int nr_scanned = 0, total_isolated = 0; struct page *cursor; + unsigned long nr_strict_required = end_pfn - blockpfn; + unsigned long flags; + bool locked = false; cursor = pfn_to_page(blockpfn); - /* Isolate free pages. This assumes the block is valid */ + /* Isolate free pages. */ for (; blockpfn < end_pfn; blockpfn++, cursor++) { int isolated, i; struct page *page = cursor; - if (!pfn_valid_within(blockpfn)) { - if (strict) - return 0; - continue; - } nr_scanned++; - - if (!PageBuddy(page)) { - if (strict) - return 0; + if (!pfn_valid_within(blockpfn)) + continue; + if (!PageBuddy(page)) + continue; + + /* + * The zone lock must be held to isolate freepages. + * Unfortunately this is a very coarse lock and can be + * heavily contended if there are parallel allocations + * or parallel compactions. For async compaction do not + * spin on the lock and we acquire the lock as late as + * possible. + */ + locked = compact_checklock_irqsave(&cc->zone->lock, &flags, + locked, cc); + if (!locked) + break; + + /* Recheck this is a suitable migration target under lock */ + if (!strict && !suitable_migration_target(page)) + break; + + /* Recheck this is a buddy page under lock */ + if (!PageBuddy(page)) continue; - } /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); if (!isolated && strict) - return 0; + break; total_isolated += isolated; for (i = 0; i < isolated; i++) { list_add(&page->lru, freelist); @@ -199,6 +238,18 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, } trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); + + /* + * If strict isolation is requested by CMA then check that all the + * pages requested were isolated. If there were any failures, 0 is + * returned and CMA will fail. 
+ */ + if (strict && nr_strict_required != total_isolated) + total_isolated = 0; + + if (locked) + spin_unlock_irqrestore(&cc->zone->lock, flags); + return total_isolated; } @@ -218,12 +269,17 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, unsigned long isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long isolated, pfn, block_end_pfn, flags; + unsigned long isolated, pfn, block_end_pfn; struct zone *zone = NULL; LIST_HEAD(freelist); + /* cc needed for isolate_freepages_block to acquire zone->lock */ + struct compact_control cc = { + .sync = true, + }; + if (pfn_valid(start_pfn)) - zone = page_zone(pfn_to_page(start_pfn)); + cc.zone = zone = page_zone(pfn_to_page(start_pfn)); for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) @@ -236,10 +292,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); - spin_lock_irqsave(&zone->lock, flags); - isolated = isolate_freepages_block(pfn, block_end_pfn, + isolated = isolate_freepages_block(&cc, pfn, block_end_pfn, &freelist, true); - spin_unlock_irqrestore(&zone->lock, flags); /* * In strict mode, isolate_freepages_block() returns 0 if @@ -483,29 +537,6 @@ next_pageblock: #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION - -/* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct page *page) -{ - - int migratetype = get_pageblock_migratetype(page); - - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ - if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) - return false; - - /* If the page is a large free page, then allow migration */ - if (PageBuddy(page) && page_order(page) >= pageblock_order) - return true; - - /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(migratetype)) - return true; - - /* Otherwise skip the block */ - return false; -} - /* * Returns the start pfn of the last page block in a zone. This is the starting * point for full compaction of a zone. Compaction searches for free pages from @@ -529,7 +560,6 @@ static void isolate_freepages(struct zone *zone, { struct page *page; unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; - unsigned long flags; int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; @@ -577,30 +607,12 @@ static void isolate_freepages(struct zone *zone, if (!suitable_migration_target(page)) continue; - /* - * Found a block suitable for isolating free pages from. Now - * we disabled interrupts, double check things are ok and - * isolate the pages. This is to minimise the time IRQs - * are disabled - */ + /* Found a block suitable for isolating free pages from */ isolated = 0; - - /* - * The zone lock must be held to isolate freepages. This - * unfortunately this is a very coarse lock and can be - * heavily contended if there are parallel allocations - * or parallel compactions. 
For async compaction do not - * spin on the lock - */ - if (!compact_trylock_irqsave(&zone->lock, &flags, cc)) - break; - if (suitable_migration_target(page)) { - end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); - isolated = isolate_freepages_block(pfn, end_pfn, - freelist, false); - nr_freepages += isolated; - } - spin_unlock_irqrestore(&zone->lock, flags); + end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); + isolated = isolate_freepages_block(cc, pfn, end_pfn, + freelist, false); + nr_freepages += isolated; /* * Record the highest PFN we isolated pages from. When next From 753341a4b85ff337487b9959c71c529f522004f4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:40 -0700 Subject: [PATCH 111/160] revert "mm: have order > 0 compaction start off where it left" This reverts commit 7db8889ab05b ("mm: have order > 0 compaction start off where it left") and commit de74f1cc ("mm: have order > 0 compaction start near a pageblock with free pages"). These patches were a good idea and tests confirmed that they massively reduced the amount of scanning but the implementation is complex and tricky to understand. A later patch will cache what pageblocks should be skipped and reimplements the concept of compact_cached_free_pfn on top for both migration and free scanners. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Acked-by: Rafael Aquini Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 --- mm/compaction.c | 65 ++++-------------------------------------- mm/internal.h | 6 ---- mm/page_alloc.c | 5 ---- 4 files changed, 5 insertions(+), 75 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 85ac67aa5770..16a4cc2950a0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -369,10 +369,6 @@ struct zone { */ spinlock_t lock; int all_unreclaimable; /* All pages pinned */ -#if defined CONFIG_COMPACTION || defined CONFIG_CMA - /* pfn where the last incremental compaction isolated free pages */ - unsigned long compact_cached_free_pfn; -#endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; diff --git a/mm/compaction.c b/mm/compaction.c index bdf6e13045ea..db76361a3117 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -537,20 +537,6 @@ next_pageblock: #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION -/* - * Returns the start pfn of the last page block in a zone. This is the starting - * point for full compaction of a zone. Compaction searches for free pages from - * the end of each zone, while isolate_freepages_block scans forward inside each - * page block. - */ -static unsigned long start_free_pfn(struct zone *zone) -{ - unsigned long free_pfn; - free_pfn = zone->zone_start_pfn + zone->spanned_pages; - free_pfn &= ~(pageblock_nr_pages-1); - return free_pfn; -} - /* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. @@ -619,19 +605,8 @@ static void isolate_freepages(struct zone *zone, * looking for free pages, the search will restart here as * page migration may have returned some pages to the allocator */ - if (isolated) { + if (isolated) high_pfn = max(high_pfn, pfn); - - /* - * If the free scanner has wrapped, update - * compact_cached_free_pfn to point to the highest - * pageblock with free pages. 
This reduces excessive - * scanning of full pageblocks near the end of the - * zone - */ - if (cc->order > 0 && cc->wrapped) - zone->compact_cached_free_pfn = high_pfn; - } } /* split_free_page does not map the pages */ @@ -639,11 +614,6 @@ static void isolate_freepages(struct zone *zone, cc->free_pfn = high_pfn; cc->nr_freepages = nr_freepages; - - /* If compact_cached_free_pfn is reset then set it now */ - if (cc->order > 0 && !cc->wrapped && - zone->compact_cached_free_pfn == start_free_pfn(zone)) - zone->compact_cached_free_pfn = high_pfn; } /* @@ -738,26 +708,8 @@ static int compact_finished(struct zone *zone, if (fatal_signal_pending(current)) return COMPACT_PARTIAL; - /* - * A full (order == -1) compaction run starts at the beginning and - * end of a zone; it completes when the migrate and free scanner meet. - * A partial (order > 0) compaction can start with the free scanner - * at a random point in the zone, and may have to restart. - */ - if (cc->free_pfn <= cc->migrate_pfn) { - if (cc->order > 0 && !cc->wrapped) { - /* We started partway through; restart at the end. */ - unsigned long free_pfn = start_free_pfn(zone); - zone->compact_cached_free_pfn = free_pfn; - cc->free_pfn = free_pfn; - cc->wrapped = 1; - return COMPACT_CONTINUE; - } - return COMPACT_COMPLETE; - } - - /* We wrapped around and ended up where we started. */ - if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn) + /* Compaction run completes if the migrate and free scanner meet */ + if (cc->free_pfn <= cc->migrate_pfn) return COMPACT_COMPLETE; /* @@ -863,15 +815,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) /* Setup to move all movable pages to the end of the zone */ cc->migrate_pfn = zone->zone_start_pfn; - - if (cc->order > 0) { - /* Incremental compaction. Start where the last one stopped. */ - cc->free_pfn = zone->compact_cached_free_pfn; - cc->start_free_pfn = cc->free_pfn; - } else { - /* Order == -1 starts at the end of the zone. */ - cc->free_pfn = start_free_pfn(zone); - } + cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; + cc->free_pfn &= ~(pageblock_nr_pages-1); migrate_prep_local(); diff --git a/mm/internal.h b/mm/internal.h index 97664be2ca37..6f6bb9ab9386 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -118,14 +118,8 @@ struct compact_control { unsigned long nr_freepages; /* Number of isolated free pages */ unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ - unsigned long start_free_pfn; /* where we started the search */ unsigned long migrate_pfn; /* isolate_migratepages search base */ bool sync; /* Synchronous migration */ - bool wrapped; /* Order > 0 compactions are - incremental, once free_pfn - and migrate_pfn meet, we restart - from the top of the zone; - remember we wrapped around. 
*/ int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ca002b39b9b4..628968c1ccf4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4490,11 +4490,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->spanned_pages = size; zone->present_pages = realsize; -#if defined CONFIG_COMPACTION || defined CONFIG_CMA - zone->compact_cached_free_pfn = zone->zone_start_pfn + - zone->spanned_pages; - zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); -#endif #ifdef CONFIG_NUMA zone->node = nid; zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) From bb13ffeb9f6bfeb301443994dfbf29f91117dfb3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:41 -0700 Subject: [PATCH 112/160] mm: compaction: cache if a pageblock was scanned and no pages were isolated When compaction was implemented it was known that scanning could potentially be excessive. The ideal was that a counter be maintained for each pageblock but maintaining this information would incur a severe penalty due to a shared writable cache line. It has reached the point where the scanning costs are a serious problem, particularly on long-lived systems where a large process starts and allocates a large number of THPs at the same time. Instead of using a shared counter, this patch adds another bit to the pageblock flags called PG_migrate_skip. If a pageblock is scanned by either migrate or free scanner and 0 pages were isolated, the pageblock is marked to be skipped in the future. When scanning, this bit is checked before any scanning takes place and the block skipped if set. The main difficulty with a patch like this is "when to ignore the cached information?" If it's ignored too often, the scanning rates will still be excessive. If the information is too stale then allocations will fail that might have otherwise succeeded. In this patch o CMA always ignores the information o If the migrate and free scanner meet then the cached information will be discarded if it's at least 5 seconds since the last time the cache was discarded o If there are a large number of allocation failures, discard the cache. The time-based heuristic is very clumsy but there are few choices for a better event. Depending solely on multiple allocation failures still allows excessive scanning when THP allocations are failing in quick succession due to memory pressure. Waiting until memory pressure is relieved would cause compaction to continually fail instead of using reclaim/compaction to try allocate the page. The time-based mechanism is clumsy but a better option is not obvious. 
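In short, the heuristic added by this patch amounts to the following sketch (a paraphrase of the hunks below, included only as a changelog summary, not additional code):

        /* Skip a pageblock unless the caller (e.g. CMA) overrides the hint */
        static inline bool isolation_suitable(struct compact_control *cc,
                                              struct page *page)
        {
                if (cc->ignore_skip_hint)
                        return true;
                return !get_pageblock_skip(page);
        }

        /* Clear all skip hints, but at most once every five seconds */
        static void reset_isolation_suitable(struct zone *zone)
        {
                if (time_before(jiffies, zone->compact_blockskip_expire))
                        return;
                zone->compact_blockskip_expire = jiffies + (HZ * 5);
                /* ... then walk the zone and clear_pageblock_skip() on each pageblock ... */
        }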
Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Acked-by: Rafael Aquini Cc: Fengguang Wu Cc: Michal Nazarewicz Cc: Bartlomiej Zolnierkiewicz Cc: Kyungmin Park Cc: Mark Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 + include/linux/pageblock-flags.h | 19 ++++- mm/compaction.c | 125 +++++++++++++++++++++++++++----- mm/internal.h | 4 +- mm/page_alloc.c | 38 +++++----- 5 files changed, 151 insertions(+), 38 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 16a4cc2950a0..f85ecc9cfa1b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -369,6 +369,9 @@ struct zone { */ spinlock_t lock; int all_unreclaimable; /* All pages pinned */ +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + unsigned long compact_blockskip_expire; +#endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 19ef95d293ae..eed27f4f4c3e 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -30,6 +30,9 @@ enum pageblock_bits { PB_migrate, PB_migrate_end = PB_migrate + 3 - 1, /* 3 bits required for migrate types */ +#ifdef CONFIG_COMPACTION + PB_migrate_skip,/* If set the block is skipped by compaction */ +#endif /* CONFIG_COMPACTION */ NR_PAGEBLOCK_BITS }; @@ -65,10 +68,22 @@ unsigned long get_pageblock_flags_group(struct page *page, void set_pageblock_flags_group(struct page *page, unsigned long flags, int start_bitidx, int end_bitidx); +#ifdef CONFIG_COMPACTION +#define get_pageblock_skip(page) \ + get_pageblock_flags_group(page, PB_migrate_skip, \ + PB_migrate_skip + 1) +#define clear_pageblock_skip(page) \ + set_pageblock_flags_group(page, 0, PB_migrate_skip, \ + PB_migrate_skip + 1) +#define set_pageblock_skip(page) \ + set_pageblock_flags_group(page, 1, PB_migrate_skip, \ + PB_migrate_skip + 1) +#endif /* CONFIG_COMPACTION */ + #define get_pageblock_flags(page) \ - get_pageblock_flags_group(page, 0, NR_PAGEBLOCK_BITS-1) + get_pageblock_flags_group(page, 0, PB_migrate_end) #define set_pageblock_flags(page, flags) \ set_pageblock_flags_group(page, flags, \ - 0, NR_PAGEBLOCK_BITS-1) + 0, PB_migrate_end) #endif /* PAGEBLOCK_FLAGS_H */ diff --git a/mm/compaction.c b/mm/compaction.c index db76361a3117..d9dbb97e607b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,79 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } +#ifdef CONFIG_COMPACTION +/* Returns true if the pageblock should be scanned for pages to isolate. */ +static inline bool isolation_suitable(struct compact_control *cc, + struct page *page) +{ + if (cc->ignore_skip_hint) + return true; + + return !get_pageblock_skip(page); +} + +/* + * This function is called to clear all cached information on pageblocks that + * should be skipped for page isolation when the migrate and free page scanner + * meet. + */ +static void reset_isolation_suitable(struct zone *zone) +{ + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; + unsigned long pfn; + + /* + * Do not reset more than once every five seconds. If allocations are + * failing sufficiently quickly to allow this to happen then continually + * scanning for compaction is not going to help. 
The choice of five + * seconds is arbitrary but will mitigate excessive scanning. + */ + if (time_before(jiffies, zone->compact_blockskip_expire)) + return; + zone->compact_blockskip_expire = jiffies + (HZ * 5); + + /* Walk the zone and mark every pageblock as suitable for isolation */ + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + struct page *page; + + cond_resched(); + + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + if (zone != page_zone(page)) + continue; + + clear_pageblock_skip(page); + } +} + +/* + * If no pages were isolated then mark this pageblock to be skipped in the + * future. The information is later cleared by reset_isolation_suitable(). + */ +static void update_pageblock_skip(struct page *page, unsigned long nr_isolated) +{ + if (!page) + return; + + if (!nr_isolated) + set_pageblock_skip(page); +} +#else +static inline bool isolation_suitable(struct compact_control *cc, + struct page *page) +{ + return true; +} + +static void update_pageblock_skip(struct page *page, unsigned long nr_isolated) +{ +} +#endif /* CONFIG_COMPACTION */ + static inline bool should_release_lock(spinlock_t *lock) { return need_resched() || spin_is_contended(lock); @@ -181,7 +254,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, bool strict) { int nr_scanned = 0, total_isolated = 0; - struct page *cursor; + struct page *cursor, *valid_page = NULL; unsigned long nr_strict_required = end_pfn - blockpfn; unsigned long flags; bool locked = false; @@ -196,6 +269,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, nr_scanned++; if (!pfn_valid_within(blockpfn)) continue; + if (!valid_page) + valid_page = page; if (!PageBuddy(page)) continue; @@ -250,6 +325,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (locked) spin_unlock_irqrestore(&cc->zone->lock, flags); + /* Update the pageblock-skip if the whole pageblock was scanned */ + if (blockpfn == end_pfn) + update_pageblock_skip(valid_page, total_isolated); + return total_isolated; } @@ -267,22 +346,14 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * a free page). 
*/ unsigned long -isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) +isolate_freepages_range(struct compact_control *cc, + unsigned long start_pfn, unsigned long end_pfn) { unsigned long isolated, pfn, block_end_pfn; - struct zone *zone = NULL; LIST_HEAD(freelist); - /* cc needed for isolate_freepages_block to acquire zone->lock */ - struct compact_control cc = { - .sync = true, - }; - - if (pfn_valid(start_pfn)) - cc.zone = zone = page_zone(pfn_to_page(start_pfn)); - for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { - if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) + if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn))) break; /* @@ -292,7 +363,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); - isolated = isolate_freepages_block(&cc, pfn, block_end_pfn, + isolated = isolate_freepages_block(cc, pfn, block_end_pfn, &freelist, true); /* @@ -387,6 +458,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, struct lruvec *lruvec; unsigned long flags; bool locked = false; + struct page *page = NULL, *valid_page = NULL; /* * Ensure that there are not too many pages isolated from the LRU @@ -407,8 +479,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ cond_resched(); for (; low_pfn < end_pfn; low_pfn++) { - struct page *page; - /* give a chance to irqs before checking need_resched() */ if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { if (should_release_lock(&zone->lru_lock)) { @@ -444,6 +514,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (page_zone(page) != zone) continue; + if (!valid_page) + valid_page = page; + + /* If isolation recently failed, do not retry */ + pageblock_nr = low_pfn >> pageblock_order; + if (!isolation_suitable(cc, page)) + goto next_pageblock; + /* Skip if free */ if (PageBuddy(page)) continue; @@ -453,7 +531,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, * migration is optimistic to see if the minimum amount of work * satisfies the allocation */ - pageblock_nr = low_pfn >> pageblock_order; if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { goto next_pageblock; @@ -530,6 +607,10 @@ next_pageblock: if (locked) spin_unlock_irqrestore(&zone->lru_lock, flags); + /* Update the pageblock-skip if the whole pageblock was scanned */ + if (low_pfn == end_pfn) + update_pageblock_skip(valid_page, nr_isolated); + trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); return low_pfn; @@ -593,6 +674,10 @@ static void isolate_freepages(struct zone *zone, if (!suitable_migration_target(page)) continue; + /* If isolation recently failed, do not retry */ + if (!isolation_suitable(cc, page)) + continue; + /* Found a block suitable for isolating free pages from */ isolated = 0; end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); @@ -709,8 +794,10 @@ static int compact_finished(struct zone *zone, return COMPACT_PARTIAL; /* Compaction run completes if the migrate and free scanner meet */ - if (cc->free_pfn <= cc->migrate_pfn) + if (cc->free_pfn <= cc->migrate_pfn) { + reset_isolation_suitable(cc->zone); return COMPACT_COMPLETE; + } /* * order == -1 is expected when compacting via @@ -818,6 +905,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) 
cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; cc->free_pfn &= ~(pageblock_nr_pages-1); + /* Clear pageblock skip if there are numerous alloc failures */ + if (zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT) + reset_isolation_suitable(zone); + migrate_prep_local(); while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { diff --git a/mm/internal.h b/mm/internal.h index 6f6bb9ab9386..7ba56ac360b2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -120,6 +120,7 @@ struct compact_control { unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ bool sync; /* Synchronous migration */ + bool ignore_skip_hint; /* Scan blocks even if marked skip */ int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ @@ -129,7 +130,8 @@ struct compact_control { }; unsigned long -isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); +isolate_freepages_range(struct compact_control *cc, + unsigned long start_pfn, unsigned long end_pfn); unsigned long isolate_migratepages_range(struct zone *zone, struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 628968c1ccf4..44c56049edf9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5679,7 +5679,8 @@ __alloc_contig_migrate_alloc(struct page *page, unsigned long private, } /* [start, end) must belong to a single zone. */ -static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) +static int __alloc_contig_migrate_range(struct compact_control *cc, + unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. */ @@ -5687,25 +5688,17 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) unsigned int tries = 0; int ret = 0; - struct compact_control cc = { - .nr_migratepages = 0, - .order = -1, - .zone = page_zone(pfn_to_page(start)), - .sync = true, - }; - INIT_LIST_HEAD(&cc.migratepages); - migrate_prep_local(); - while (pfn < end || !list_empty(&cc.migratepages)) { + while (pfn < end || !list_empty(&cc->migratepages)) { if (fatal_signal_pending(current)) { ret = -EINTR; break; } - if (list_empty(&cc.migratepages)) { - cc.nr_migratepages = 0; - pfn = isolate_migratepages_range(cc.zone, &cc, + if (list_empty(&cc->migratepages)) { + cc->nr_migratepages = 0; + pfn = isolate_migratepages_range(cc->zone, cc, pfn, end); if (!pfn) { ret = -EINTR; @@ -5717,14 +5710,14 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) break; } - reclaim_clean_pages_from_list(cc.zone, &cc.migratepages); + reclaim_clean_pages_from_list(cc->zone, &cc->migratepages); - ret = migrate_pages(&cc.migratepages, + ret = migrate_pages(&cc->migratepages, __alloc_contig_migrate_alloc, 0, false, MIGRATE_SYNC); } - putback_lru_pages(&cc.migratepages); + putback_lru_pages(&cc->migratepages); return ret > 0 ? 0 : ret; } @@ -5803,6 +5796,15 @@ int alloc_contig_range(unsigned long start, unsigned long end, unsigned long outer_start, outer_end; int ret = 0, order; + struct compact_control cc = { + .nr_migratepages = 0, + .order = -1, + .zone = page_zone(pfn_to_page(start)), + .sync = true, + .ignore_skip_hint = true, + }; + INIT_LIST_HEAD(&cc.migratepages); + /* * What we do here is we mark all pageblocks in range as * MIGRATE_ISOLATE. 
Because pageblock and max order pages may @@ -5832,7 +5834,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, if (ret) goto done; - ret = __alloc_contig_migrate_range(start, end); + ret = __alloc_contig_migrate_range(&cc, start, end); if (ret) goto done; @@ -5881,7 +5883,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); /* Grab isolated pages from freelists. */ - outer_end = isolate_freepages_range(outer_start, end); + outer_end = isolate_freepages_range(&cc, outer_start, end); if (!outer_end) { ret = -EBUSY; goto done; From c89511ab2f8fe2b47585e60da8af7fd213ec877e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:45 -0700 Subject: [PATCH 113/160] mm: compaction: Restart compaction from near where it left off This is almost entirely based on Rik's previous patches and discussions with him about how this might be implemented. Order > 0 compaction stops when enough free pages of the correct page order have been coalesced. When doing subsequent higher order allocations, it is possible for compaction to be invoked many times. However, the compaction code always starts out looking for things to compact at the start of the zone, and for free pages to compact things to at the end of the zone. This can cause quadratic behaviour, with isolate_freepages starting at the end of the zone each time, even though previous invocations of the compaction code already filled up all free memory on that end of the zone. This can cause isolate_freepages to take enormous amounts of CPU with certain workloads on larger memory systems. This patch caches where the migration and free scanner should start from on subsequent compaction invocations using the pageblock-skip information. When compaction starts it begins from the cached restart points and will update the cached restart points until a page is isolated or a pageblock is skipped that would have been scanned by synchronous compaction. 
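In short, compact_zone() now seeds both scanners from the zone's cached pfns and falls back to the zone boundaries when the cached values are out of range (a paraphrase of the hunk below, included only as a changelog summary):

        cc->migrate_pfn = zone->compact_cached_migrate_pfn;
        cc->free_pfn = zone->compact_cached_free_pfn;
        if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
                cc->free_pfn = end_pfn & ~(pageblock_nr_pages - 1);
                zone->compact_cached_free_pfn = cc->free_pfn;
        }
        if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
                cc->migrate_pfn = start_pfn;
                zone->compact_cached_migrate_pfn = cc->migrate_pfn;
        }

The cached pfns themselves are advanced from update_pageblock_skip() until the new finished_update_migrate/finished_update_free flags are set.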
Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Acked-by: Rafael Aquini Cc: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 +++ mm/compaction.c | 58 ++++++++++++++++++++++++++++++++++-------- mm/internal.h | 4 +++ 3 files changed, 56 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f85ecc9cfa1b..c8b3abc97a1e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -371,6 +371,10 @@ struct zone { int all_unreclaimable; /* All pages pinned */ #if defined CONFIG_COMPACTION || defined CONFIG_CMA unsigned long compact_blockskip_expire; + + /* pfns where compaction scanners should start */ + unsigned long compact_cached_free_pfn; + unsigned long compact_cached_migrate_pfn; #endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ diff --git a/mm/compaction.c b/mm/compaction.c index d9dbb97e607b..f94cbc0b99a5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -80,6 +80,9 @@ static void reset_isolation_suitable(struct zone *zone) */ if (time_before(jiffies, zone->compact_blockskip_expire)) return; + + zone->compact_cached_migrate_pfn = start_pfn; + zone->compact_cached_free_pfn = end_pfn; zone->compact_blockskip_expire = jiffies + (HZ * 5); /* Walk the zone and mark every pageblock as suitable for isolation */ @@ -103,13 +106,29 @@ static void reset_isolation_suitable(struct zone *zone) * If no pages were isolated then mark this pageblock to be skipped in the * future. The information is later cleared by reset_isolation_suitable(). */ -static void update_pageblock_skip(struct page *page, unsigned long nr_isolated) +static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, + bool migrate_scanner) { + struct zone *zone = cc->zone; if (!page) return; - if (!nr_isolated) + if (!nr_isolated) { + unsigned long pfn = page_to_pfn(page); set_pageblock_skip(page); + + /* Update where compaction should restart */ + if (migrate_scanner) { + if (!cc->finished_update_migrate && + pfn > zone->compact_cached_migrate_pfn) + zone->compact_cached_migrate_pfn = pfn; + } else { + if (!cc->finished_update_free && + pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; + } + } } #else static inline bool isolation_suitable(struct compact_control *cc, @@ -118,7 +137,9 @@ static inline bool isolation_suitable(struct compact_control *cc, return true; } -static void update_pageblock_skip(struct page *page, unsigned long nr_isolated) +static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, + bool migrate_scanner) { } #endif /* CONFIG_COMPACTION */ @@ -327,7 +348,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) - update_pageblock_skip(valid_page, total_isolated); + update_pageblock_skip(cc, valid_page, total_isolated, false); return total_isolated; } @@ -533,6 +554,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { + cc->finished_update_migrate = true; goto next_pageblock; } @@ -583,6 +605,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, VM_BUG_ON(PageTransCompound(page)); /* Successfully isolated */ + 
cc->finished_update_migrate = true; del_page_from_lru_list(page, lruvec, page_lru(page)); list_add(&page->lru, migratelist); cc->nr_migratepages++; @@ -609,7 +632,7 @@ next_pageblock: /* Update the pageblock-skip if the whole pageblock was scanned */ if (low_pfn == end_pfn) - update_pageblock_skip(valid_page, nr_isolated); + update_pageblock_skip(cc, valid_page, nr_isolated, true); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -690,8 +713,10 @@ static void isolate_freepages(struct zone *zone, * looking for free pages, the search will restart here as * page migration may have returned some pages to the allocator */ - if (isolated) + if (isolated) { + cc->finished_update_free = true; high_pfn = max(high_pfn, pfn); + } } /* split_free_page does not map the pages */ @@ -888,6 +913,8 @@ unsigned long compaction_suitable(struct zone *zone, int order) static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; ret = compaction_suitable(zone, cc->order); switch (ret) { @@ -900,10 +927,21 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } - /* Setup to move all movable pages to the end of the zone */ - cc->migrate_pfn = zone->zone_start_pfn; - cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; - cc->free_pfn &= ~(pageblock_nr_pages-1); + /* + * Setup to move all movable pages to the end of the zone. Used cached + * information on where the scanners should start but check that it + * is initialised by ensuring the values are within zone boundaries. + */ + cc->migrate_pfn = zone->compact_cached_migrate_pfn; + cc->free_pfn = zone->compact_cached_free_pfn; + if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { + cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); + zone->compact_cached_free_pfn = cc->free_pfn; + } + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { + cc->migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn = cc->migrate_pfn; + } /* Clear pageblock skip if there are numerous alloc failures */ if (zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT) diff --git a/mm/internal.h b/mm/internal.h index 7ba56ac360b2..7f72f249bc29 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -121,6 +121,10 @@ struct compact_control { unsigned long migrate_pfn; /* isolate_migratepages search base */ bool sync; /* Synchronous migration */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool finished_update_free; /* True when the zone cached pfns are + * no longer being updated + */ + bool finished_update_migrate; int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ From 62997027ca5b3d4618198ed8b1aba40b61b1137b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:47 -0700 Subject: [PATCH 114/160] mm: compaction: clear PG_migrate_skip based on compaction and reclaim activity Compaction caches if a pageblock was scanned and no pages were isolated so that the pageblocks can be skipped in the future to reduce scanning. This information is not cleared by the page allocator based on activity due to the impact it would have to the page allocator fast paths. Hence there is a requirement that something clear the cache or pageblocks will be skipped forever. Currently the cache is cleared if there were a number of recent allocation failures and it has not been cleared within the last 5 seconds. 
Time-based decisions like this are terrible as they have no relationship to VM activity and is basically a big hammer. Unfortunately, accurate heuristics would add cost to some hot paths so this patch implements a rough heuristic. There are two cases where the cache is cleared. 1. If a !kswapd process completes a compaction cycle (migrate and free scanner meet), the zone is marked compact_blockskip_flush. When kswapd goes to sleep, it will clear the cache. This is expected to be the common case where the cache is cleared. It does not really matter if kswapd happens to be asleep or going to sleep when the flag is set as it will be woken on the next allocation request. 2. If there have been multiple failures recently and compaction just finished being deferred then a process will clear the cache and start a full scan. This situation happens if there are multiple high-order allocation requests under heavy memory pressure. The clearing of the PG_migrate_skip bits and other scans is inherently racy but the race is harmless. For allocations that can fail such as THP, they will simply fail. For requests that cannot fail, they will retry the allocation. Tests indicated that scanning rates were roughly similar to when the time-based heuristic was used and the allocation success rates were similar. Signed-off-by: Mel Gorman Cc: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 15 ++++++++++++ include/linux/mmzone.h | 3 ++- mm/compaction.c | 50 ++++++++++++++++++++++++++------------ mm/page_alloc.c | 1 + mm/vmscan.c | 8 ++++++ 5 files changed, 60 insertions(+), 17 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 0e38a1deeb23..6ecb6dc2f303 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -24,6 +24,7 @@ extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, bool sync, bool *contended, struct page **page); extern int compact_pgdat(pg_data_t *pgdat, int order); +extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order); /* Do not skip compaction more than 64 times */ @@ -61,6 +62,16 @@ static inline bool compaction_deferred(struct zone *zone, int order) return zone->compact_considered < defer_limit; } +/* Returns true if restarting compaction after many failures */ +static inline bool compaction_restarting(struct zone *zone, int order) +{ + if (order < zone->compact_order_failed) + return false; + + return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && + zone->compact_considered >= 1UL << zone->compact_defer_shift; +} + #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, @@ -74,6 +85,10 @@ static inline int compact_pgdat(pg_data_t *pgdat, int order) return COMPACT_CONTINUE; } +static inline void reset_isolation_suitable(pg_data_t *pgdat) +{ +} + static inline unsigned long compaction_suitable(struct zone *zone, int order) { return COMPACT_SKIPPED; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c8b3abc97a1e..d240efa8f846 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -370,7 +370,8 @@ struct zone { spinlock_t lock; int all_unreclaimable; /* All pages pinned */ #if defined CONFIG_COMPACTION || defined CONFIG_CMA - unsigned long compact_blockskip_expire; + /* Set to true 
when the PG_migrate_skip bits should be cleared */ + bool compact_blockskip_flush; /* pfns where compaction scanners should start */ unsigned long compact_cached_free_pfn; diff --git a/mm/compaction.c b/mm/compaction.c index f94cbc0b99a5..d8187f9cabbf 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -66,24 +66,15 @@ static inline bool isolation_suitable(struct compact_control *cc, * should be skipped for page isolation when the migrate and free page scanner * meet. */ -static void reset_isolation_suitable(struct zone *zone) +static void __reset_isolation_suitable(struct zone *zone) { unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; unsigned long pfn; - /* - * Do not reset more than once every five seconds. If allocations are - * failing sufficiently quickly to allow this to happen then continually - * scanning for compaction is not going to help. The choice of five - * seconds is arbitrary but will mitigate excessive scanning. - */ - if (time_before(jiffies, zone->compact_blockskip_expire)) - return; - zone->compact_cached_migrate_pfn = start_pfn; zone->compact_cached_free_pfn = end_pfn; - zone->compact_blockskip_expire = jiffies + (HZ * 5); + zone->compact_blockskip_flush = false; /* Walk the zone and mark every pageblock as suitable for isolation */ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { @@ -102,9 +93,24 @@ static void reset_isolation_suitable(struct zone *zone) } } +void reset_isolation_suitable(pg_data_t *pgdat) +{ + int zoneid; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + /* Only flush if a full compaction finished recently */ + if (zone->compact_blockskip_flush) + __reset_isolation_suitable(zone); + } +} + /* * If no pages were isolated then mark this pageblock to be skipped in the - * future. The information is later cleared by reset_isolation_suitable(). + * future. The information is later cleared by __reset_isolation_suitable(). */ static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, @@ -820,7 +826,15 @@ static int compact_finished(struct zone *zone, /* Compaction run completes if the migrate and free scanner meet */ if (cc->free_pfn <= cc->migrate_pfn) { - reset_isolation_suitable(cc->zone); + /* + * Mark that the PG_migrate_skip information should be cleared + * by kswapd when it goes to sleep. kswapd does not set the + * flag itself as the decision to be clear should be directly + * based on an allocation request. + */ + if (!current_is_kswapd()) + zone->compact_blockskip_flush = true; + return COMPACT_COMPLETE; } @@ -943,9 +957,13 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) zone->compact_cached_migrate_pfn = cc->migrate_pfn; } - /* Clear pageblock skip if there are numerous alloc failures */ - if (zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT) - reset_isolation_suitable(zone); + /* + * Clear pageblock skip if there were failures recently and compaction + * is about to be retried after being deferred. kswapd does not do + * this reset as it'll reset the cached information when going to sleep. 
+ */ + if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + __reset_isolation_suitable(zone); migrate_prep_local(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 44c56049edf9..b97cf12f07a9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2172,6 +2172,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (page) { got_page: + preferred_zone->compact_blockskip_flush = false; preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; if (order >= preferred_zone->compact_order_failed) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1ee4b69a28a5..b010efc43891 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2895,6 +2895,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); + /* + * Compaction records what page blocks it recently failed to + * isolate pages from and skips them in the future scanning. + * When kswapd is going to sleep, it is reasonable to assume + * that pages and compaction may succeed so reset the cache. + */ + reset_isolation_suitable(pgdat); + if (!kthread_should_stop()) schedule(); From 3f6d4caeb9a9d8f7e5bbf3f49f1fd71e1414ff64 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Mon, 8 Oct 2012 16:32:50 -0700 Subject: [PATCH 115/160] mm/hugetlb.c: remove duplicate inclusion of header file Signed-off-by: Sachin Kamat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c9b40e3a9936..8536741f069b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -30,7 +30,6 @@ #include #include #include -#include #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; From 723a0644a7255f532575fd43245f9ef976491328 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:52 -0700 Subject: [PATCH 116/160] mm/page_alloc: refactor out __alloc_contig_migrate_alloc() __alloc_contig_migrate_alloc() can be used by memory-hotplug so refactor it out (move + rename as a common name) into page_isolation.c. 
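After the move, any migrate_pages() caller that wants the generic target-page allocation policy can pass the helper directly as its allocation callback, for example (as the CMA path below already does):

        ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
                            0, false, MIGRATE_SYNC);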
[akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Minchan Kim Cc: Kamezawa Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Michal Nazarewicz Cc: Marek Szyprowski Cc: Wen Congyang Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 3 ++- mm/page_alloc.c | 14 +------------- mm/page_isolation.c | 11 +++++++++++ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index fca8c0a5c188..76a9539cfd3f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -41,6 +41,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn); */ int set_migratetype_isolate(struct page *page); void unset_migratetype_isolate(struct page *page, unsigned migratetype); - +struct page *alloc_migrate_target(struct page *page, unsigned long private, + int **resultp); #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b97cf12f07a9..8ac593893e6e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5667,18 +5667,6 @@ static unsigned long pfn_max_align_up(unsigned long pfn) pageblock_nr_pages)); } -static struct page * -__alloc_contig_migrate_alloc(struct page *page, unsigned long private, - int **resultp) -{ - gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; - - if (PageHighMem(page)) - gfp_mask |= __GFP_HIGHMEM; - - return alloc_page(gfp_mask); -} - /* [start, end) must belong to a single zone. */ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) @@ -5714,7 +5702,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, reclaim_clean_pages_from_list(cc->zone, &cc->migratepages); ret = migrate_pages(&cc->migratepages, - __alloc_contig_migrate_alloc, + alloc_migrate_target, 0, false, MIGRATE_SYNC); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5f34a9053ce0..f2f5b4818e94 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -255,3 +255,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) spin_unlock_irqrestore(&zone->lock, flags); return ret ? 0 : -EBUSY; } + +struct page *alloc_migrate_target(struct page *page, unsigned long private, + int **resultp) +{ + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + + return alloc_page(gfp_mask); +} From 74c08f982674cfd5dfeb2702d631db9bcdabf788 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:54 -0700 Subject: [PATCH 117/160] memory-hotplug: don't replace lowmem pages with highmem The changelog for commit 6a6dccba2fdc ("mm: cma: don't replace lowmem pages with highmem") mentioned that lowmem pages can be replaced by highmem pages during CMA migration. 6a6dccba2fdc fixed that issue. Quote from that changelog: : The filesystem layer expects pages in the block device's mapping to not : be in highmem (the mapping's gfp mask is set in bdget()), but CMA can : currently replace lowmem pages with highmem pages, leading to crashes in : filesystem code such as the one below: : : Unable to handle kernel NULL pointer dereference at virtual address 00000400 : pgd = c0c98000 : [00000400] *pgd=00c91831, *pte=00000000, *ppte=00000000 : Internal error: Oops: 817 [#1] PREEMPT SMP ARM : CPU: 0 Not tainted (3.5.0-rc5+ #80) : PC is at __memzero+0x24/0x80 : ... 
: Process fsstress (pid: 323, stack limit = 0xc0cbc2f0) : Backtrace: : [] (ext4_getblk+0x0/0x180) from [] (ext4_bread+0x1c/0x98) : [] (ext4_bread+0x0/0x98) from [] (ext4_mkdir+0x160/0x3bc) : r4:c15337f0 : [] (ext4_mkdir+0x0/0x3bc) from [] (vfs_mkdir+0x8c/0x98) : [] (vfs_mkdir+0x0/0x98) from [] (sys_mkdirat+0x74/0xac) : r6:00000000 r5:c152eb40 r4:000001ff r3:c14b43f0 : [] (sys_mkdirat+0x0/0xac) from [] (sys_mkdir+0x20/0x24) : r6:beccdcf0 r5:00074000 r4:beccdbbc : [] (sys_mkdir+0x0/0x24) from [] (ret_fast_syscall+0x0/0x30) Memory-hotplug has same problem as CMA has so the same fix can be applied to memory-hotplug as well. Fix it by reusing. Signed-off-by: Minchan Kim Cc: Kamezawa Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Michal Nazarewicz Cc: Marek Szyprowski Cc: Wen Congyang Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b35016156c19..f9ac0955e10a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -756,13 +756,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end) return 0; } -static struct page * -hotremove_migrate_alloc(struct page *page, unsigned long private, int **x) -{ - /* This should be improooooved!! */ - return alloc_page(GFP_HIGHUSER_MOVABLE); -} - #define NR_OFFLINE_AT_ONCE_PAGES (256) static int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) @@ -813,8 +806,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) putback_lru_pages(&source); goto out; } - /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, + + /* + * alloc_migrate_target should be improooooved!! + * migrate_pages returns # of failed pages. + */ + ret = migrate_pages(&source, alloc_migrate_target, 0, true, MIGRATE_SYNC); if (ret) putback_lru_pages(&source); From e3b4126c556ca3a07699adf202d44bed3f453638 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:32:57 -0700 Subject: [PATCH 118/160] thp: khugepaged_prealloc_page() forgot to reset the page alloc indicator If NUMA is enabled, the indicator is not reset if the previous page request failed, ausing us to trigger the BUG_ON() in khugepaged_alloc_page(). Signed-off-by: Xiao Guangrong Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Michel Lespinasse Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7cf8b0ec11ec..7153e0d18670 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1797,6 +1797,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) return false; *wait = false; + *hpage = NULL; khugepaged_alloc_sleep(); } else if (*hpage) { put_page(*hpage); From 2d28a2275c21d04290cfba1555c89a806d2b7706 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:32:59 -0700 Subject: [PATCH 119/160] mm: thp: fix the pmd_clear() arguments in pmdp_get_and_clear() The CONFIG_TRANSPARENT_HUGEPAGE implementation of pmdp_get_and_clear() calls pmd_clear() with 3 arguments instead of 1. This happens only for !__HAVE_ARCH_PMDP_GET_AND_CLEAR which doesn't seem to happen because x86 defines this and it uses pmd_update. 
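For context (not part of the diff): the mistake is an easy one to make because the generic pte and pmd clearing helpers take different arguments:

        pte_clear(mm, address, ptep);   /* mm, address and pte pointer */
        pmd_clear(pmdp);                /* only the pmd pointer */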
[mhocko@suse.cz: changelog addition] Signed-off-by: Catalin Marinas Signed-off-by: Steve Capper Signed-off-by: Will Deacon Cc: Arnd Bergmann Reviewed-by: Michal Hocko Reviewed-by: Kirill A. Shutemov Cc: Gerald Schaefer Reviewed-by: Andrea Arcangeli Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index ebdd74fb8d6a..b36ce40bd1c6 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -87,7 +87,7 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, pmd_t *pmdp) { pmd_t pmd = *pmdp; - pmd_clear(mm, address, pmdp); + pmd_clear(pmdp); return pmd; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ From eab1eef9911c36966b5d5934e6970581b3316013 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:33:01 -0700 Subject: [PATCH 120/160] mm: thp: fix the update_mmu_cache() last argument passing in mm/huge_memory.c The update_mmu_cache() takes a pointer (to pte_t by default) as the last argument but the huge_memory.c passes a pmd_t value. The patch changes the argument to the pmd_t * pointer. Signed-off-by: Catalin Marinas Signed-off-by: Steve Capper Signed-off-by: Will Deacon Cc: Arnd Bergmann Reviewed-by: Kirill A. Shutemov Cc: Michal Hocko Cc: Gerald Schaefer Reviewed-by: Andrea Arcangeli Cc: Chris Metcalf Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7153e0d18670..0e7740923fb9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -889,7 +889,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } @@ -941,7 +941,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pmdp_clear_flush_notify(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, pmd); page_remove_rmap(page); put_page(page); ret |= VM_FAULT_WRITE; @@ -2002,7 +2002,7 @@ static void collapse_huge_page(struct mm_struct *mm, BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache(vma, address, _pmd); + update_mmu_cache(vma, address, pmd); pgtable_trans_huge_deposit(mm, pgtable); spin_unlock(&mm->page_table_lock); From 05106e6a54aed321191b4bb5c9ee09538cbad3b1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 8 Oct 2012 16:33:03 -0700 Subject: [PATCH 121/160] mm: enable CONFIG_COMPACTION by default Now that lumpy reclaim has been removed, compaction is the only way left to free up contiguous memory areas. It is time to just enable CONFIG_COMPACTION by default. 
Signed-off-by: Rik van Riel Cc: Mel Gorman Acked-by: Rafael Aquini Acked-by: Johannes Weiner Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/Kconfig b/mm/Kconfig index 3322342a1ffb..a3f8dddaaab3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -191,6 +191,7 @@ config SPLIT_PTLOCK_CPUS # support for memory compaction config COMPACTION bool "Allow for memory compaction" + def_bool y select MIGRATION depends on MMU help From 7f1290f2f2a4d2c3f1b7ce8e87256e052ca23125 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Mon, 8 Oct 2012 16:33:06 -0700 Subject: [PATCH 122/160] mm: fix-up zone present pages I think zone->present_pages indicates pages that buddy system can management, it should be: zone->present_pages = spanned pages - absent pages - bootmem pages, but is now: zone->present_pages = spanned pages - absent pages - memmap pages. spanned pages: total size, including holes. absent pages: holes. bootmem pages: pages used in system boot, managed by bootmem allocator. memmap pages: pages used by page structs. This may cause zone->present_pages less than it should be. For example, numa node 1 has ZONE_NORMAL and ZONE_MOVABLE, it's memmap and other bootmem will be allocated from ZONE_MOVABLE, so ZONE_NORMAL's present_pages should be spanned pages - absent pages, but now it also minus memmap pages(free_area_init_core), which are actually allocated from ZONE_MOVABLE. When offlining all memory of a zone, this will cause zone->present_pages less than 0, because present_pages is unsigned long type, it is actually a very large integer, it indirectly caused zone->watermark[WMARK_MIN] becomes a large integer(setup_per_zone_wmarks()), than cause totalreserve_pages become a large integer(calculate_totalreserve_pages()), and finally cause memory allocating failure when fork process(__vm_enough_memory()). [root@localhost ~]# dmesg -bash: fork: Cannot allocate memory I think the bug described in http://marc.info/?l=linux-mm&m=134502182714186&w=2 is also caused by wrong zone present pages. This patch intends to fix-up zone->present_pages when memory are freed to buddy system on x86_64 and IA64 platforms. 
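A purely illustrative example (numbers invented): suppose a node-1 ZONE_NORMAL spans 1,048,576 pages, 65,536 of them are absent (holes), and its 16,384 memmap pages were allocated from ZONE_MOVABLE. Then:

        current code:  present_pages = 1,048,576 - 65,536 - 16,384 = 966,656
        after fix-up:  present_pages = 1,048,576 - 65,536          = 983,040

The current code undercounts by the memmap pages that never came out of this zone, and offlining the zone's memory can then drive the unsigned present_pages value below zero, i.e. wrap it to a huge number.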
Signed-off-by: Jianguo Wu Signed-off-by: Jiang Liu Reported-by: Petr Tesarik Tested-by: Petr Tesarik Cc: "Luck, Tony" Cc: Mel Gorman Cc: Yinghai Lu Cc: Minchan Kim Cc: Johannes Weiner Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 1 + include/linux/mm.h | 4 ++++ mm/bootmem.c | 10 +++++++++- mm/memory_hotplug.c | 7 +++++++ mm/nobootmem.c | 3 +++ mm/page_alloc.c | 34 ++++++++++++++++++++++++++++++++++ 6 files changed, 58 insertions(+), 1 deletion(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 082e383c1b6f..acd5b68e8871 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -637,6 +637,7 @@ mem_init (void) high_memory = __va(max_low_pfn * PAGE_SIZE); + reset_zone_present_pages(); for_each_online_pgdat(pgdat) if (pgdat->bdata->node_bootmem_map) totalram_pages += free_all_bootmem_node(pgdat); diff --git a/include/linux/mm.h b/include/linux/mm.h index bcaab4e6fe91..fa0680402738 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1684,5 +1684,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool page_is_guard(struct page *page) { return false; } #endif /* CONFIG_DEBUG_PAGEALLOC */ +extern void reset_zone_present_pages(void); +extern void fixup_zone_present_pages(int nid, unsigned long start_pfn, + unsigned long end_pfn); + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/bootmem.c b/mm/bootmem.c index f468185b3b28..434be4ae7a04 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) int order = ilog2(BITS_PER_LONG); __free_pages_bootmem(pfn_to_page(start), order); + fixup_zone_present_pages(page_to_nid(pfn_to_page(start)), + start, start + BITS_PER_LONG); count += BITS_PER_LONG; start += BITS_PER_LONG; } else { @@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) if (vec & 1) { page = pfn_to_page(start + off); __free_pages_bootmem(page, 0); + fixup_zone_present_pages( + page_to_nid(page), + start + off, start + off + 1); count++; } vec >>= 1; @@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) pages = bdata->node_low_pfn - bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages); count += pages; - while (pages--) + while (pages--) { + fixup_zone_present_pages(page_to_nid(page), + page_to_pfn(page), page_to_pfn(page) + 1); __free_pages_bootmem(page++, 0); + } bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f9ac0955e10a..ce690a911f1b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page, void __ref put_page_bootmem(struct page *page) { unsigned long type; + struct zone *zone; type = (unsigned long) page->lru.next; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || @@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page) set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); + + zone = page_zone(page); + zone_span_writelock(zone); + zone->present_pages++; + zone_span_writeunlock(zone); + totalram_pages++; } } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bd82f6b31411..714d5d650470 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start, return 0; __free_pages_memory(start_pfn, end_pfn); + 
fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT), + start_pfn, end_pfn); return end_pfn - start_pfn; } @@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid) phys_addr_t start, end, size; u64 i; + reset_zone_present_pages(); for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) count += __free_memory_core(start, end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8ac593893e6e..00750bc08a3a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6087,3 +6087,37 @@ void dump_page(struct page *page) dump_page_flags(page->flags); mem_cgroup_print_bad_page(page); } + +/* reset zone->present_pages */ +void reset_zone_present_pages(void) +{ + struct zone *z; + int i, nid; + + for_each_node_state(nid, N_HIGH_MEMORY) { + for (i = 0; i < MAX_NR_ZONES; i++) { + z = NODE_DATA(nid)->node_zones + i; + z->present_pages = 0; + } + } +} + +/* calculate zone's present pages in buddy system */ +void fixup_zone_present_pages(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + struct zone *z; + unsigned long zone_start_pfn, zone_end_pfn; + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) { + z = NODE_DATA(nid)->node_zones + i; + zone_start_pfn = z->zone_start_pfn; + zone_end_pfn = zone_start_pfn + z->spanned_pages; + + /* if the two regions intersect */ + if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn)) + z->present_pages += min(end_pfn, zone_end_pfn) - + max(start_pfn, zone_start_pfn); + } +} From 1939c557b5c3c0e800328e3589ae3d27fdfea29e Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Mon, 8 Oct 2012 16:33:09 -0700 Subject: [PATCH 123/160] memcg: trivial fixes for Documentation/cgroups/memory.txt While reading through Documentation/cgroups/memory.txt, I found a number of minor wordos and typos. The patch below is a conservative handling of some of these: it provides just a number of "obviously correct" fixes to the English that improve the readability of the document somewhat. Obviously some more significant fixes need to be made to the document, but some of those may not be in the "obvious correct" category. Signed-off-by: Michael Kerrisk Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 90 ++++++++++++++++---------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 4372e6b8a353..c07f7b4fb88d 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -18,16 +18,16 @@ from the rest of the system. The article on LWN [12] mentions some probable uses of the memory controller. The memory controller can be used to a. Isolate an application or a group of applications - Memory hungry applications can be isolated and limited to a smaller + Memory-hungry applications can be isolated and limited to a smaller amount of memory. -b. Create a cgroup with limited amount of memory, this can be used +b. Create a cgroup with a limited amount of memory; this can be used as a good alternative to booting with mem=XXXX. c. Virtualization solutions can control the amount of memory they want to assign to a virtual machine instance. d. A CD/DVD burner could control the amount of memory used by the rest of the system to ensure that burning does not fail due to lack of available memory. -e. There are several other use cases, find one or use the controller just +e. 
There are several other use cases; find one or use the controller just for fun (to learn and hack on the VM subsystem). Current Status: linux-2.6.34-mmotm(development version of 2010/April) @@ -38,12 +38,12 @@ Features: - optionally, memory+swap usage can be accounted and limited. - hierarchical accounting - soft limit - - moving(recharging) account at moving a task is selectable. + - moving (recharging) account at moving a task is selectable. - usage threshold notifier - oom-killer disable knob and oom-notifier - Root cgroup has no limit controls. - Kernel memory support is work in progress, and the current version provides + Kernel memory support is a work in progress, and the current version provides basically functionality. (See Section 2.7) Brief summary of control files. @@ -144,9 +144,9 @@ Figure 1 shows the important aspects of the controller 3. Each page has a pointer to the page_cgroup, which in turn knows the cgroup it belongs to -The accounting is done as follows: mem_cgroup_charge() is invoked to setup +The accounting is done as follows: mem_cgroup_charge() is invoked to set up the necessary data structures and check if the cgroup that is being charged -is over its limit. If it is then reclaim is invoked on the cgroup. +is over its limit. If it is, then reclaim is invoked on the cgroup. More details can be found in the reclaim section of this document. If everything goes well, a page meta-data-structure called page_cgroup is updated. page_cgroup has its own LRU on cgroup. @@ -163,13 +163,13 @@ for earlier. A file page will be accounted for as Page Cache when it's inserted into inode (radix-tree). While it's mapped into the page tables of processes, duplicate accounting is carefully avoided. -A RSS page is unaccounted when it's fully unmapped. A PageCache page is +An RSS page is unaccounted when it's fully unmapped. A PageCache page is unaccounted when it's removed from radix-tree. Even if RSS pages are fully unmapped (by kswapd), they may exist as SwapCache in the system until they -are really freed. Such SwapCaches also also accounted. +are really freed. Such SwapCaches are also accounted. A swapped-in page is not accounted until it's mapped. -Note: The kernel does swapin-readahead and read multiple swaps at once. +Note: The kernel does swapin-readahead and reads multiple swaps at once. This means swapped-in pages may contain pages for other tasks than a task causing page fault. So, we avoid accounting at swap-in I/O. @@ -209,7 +209,7 @@ memsw.limit_in_bytes. Example: Assume a system with 4G of swap. A task which allocates 6G of memory (by mistake) under 2G memory limitation will use all swap. In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. -By using memsw limit, you can avoid system OOM which can be caused by swap +By using the memsw limit, you can avoid system OOM which can be caused by swap shortage. * why 'memory+swap' rather than swap. @@ -217,7 +217,7 @@ The global LRU(kswapd) can swap out arbitrary pages. Swap-out means to move account from memory to swap...there is no change in usage of memory+swap. In other words, when we want to limit the usage of swap without affecting global LRU, memory+swap limit is better than just limiting swap from -OS point of view. +an OS point of view. * What happens when a cgroup hits memory.memsw.limit_in_bytes When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out @@ -236,7 +236,7 @@ an OOM routine is invoked to select and kill the bulkiest task in the cgroup. (See 10. OOM Control below.) 
The reclaim algorithm has not been modified for cgroups, except that -pages that are selected for reclaiming come from the per cgroup LRU +pages that are selected for reclaiming come from the per-cgroup LRU list. NOTE: Reclaim does not work for the root cgroup, since we cannot set any @@ -316,7 +316,7 @@ We can check the usage: # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes 1216512 -A successful write to this file does not guarantee a successful set of +A successful write to this file does not guarantee a successful setting of this limit to the value written into the file. This can be due to a number of factors, such as rounding up to page boundaries or the total availability of memory on the system. The user is required to re-read @@ -350,7 +350,7 @@ Trying usual test under memory controller is always helpful. 4.1 Troubleshooting Sometimes a user might find that the application under a cgroup is -terminated by OOM killer. There are several causes for this: +terminated by the OOM killer. There are several causes for this: 1. The cgroup limit is too low (just too low to do anything useful) 2. The user is using anonymous memory and swap is turned off or too low @@ -358,7 +358,7 @@ terminated by OOM killer. There are several causes for this: A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of some of the pages cached in the cgroup (page cache pages). -To know what happens, disable OOM_Kill by 10. OOM Control(see below) and +To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and seeing what happens will be helpful. 4.2 Task migration @@ -399,10 +399,10 @@ About use_hierarchy, see Section 6. Almost all pages tracked by this memory cgroup will be unmapped and freed. Some pages cannot be freed because they are locked or in-use. Such pages are - moved to parent(if use_hierarchy==1) or root (if use_hierarchy==0) and this + moved to parent (if use_hierarchy==1) or root (if use_hierarchy==0) and this cgroup will be empty. - Typical use case of this interface is that calling this before rmdir(). + The typical use case for this interface is before calling rmdir(). Because rmdir() moves all pages to parent, some out-of-use page caches can be moved to the parent. If you want to avoid that, force_empty will be useful. @@ -486,7 +486,7 @@ You can reset failcnt by writing 0 to failcnt file. For efficiency, as other kernel components, memory cgroup uses some optimization to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the -method and doesn't show 'exact' value of memory(and swap) usage, it's an fuzz +method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz value for efficient access. (Of course, when necessary, it's synchronized.) If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) value in memory.stat(see 5.2). @@ -496,8 +496,8 @@ value in memory.stat(see 5.2). This is similar to numa_maps but operates on a per-memcg basis. This is useful for providing visibility into the numa locality information within an memcg since the pages are allowed to be allocated from any physical -node. One of the usecases is evaluating application performance by -combining this information with the application's cpu allocation. +node. One of the use cases is evaluating application performance by +combining this information with the application's CPU allocation. We export "total", "file", "anon" and "unevictable" pages per-node for each memcg. 
The ouput format of memory.numa_stat is: @@ -561,10 +561,10 @@ are pushed back to their soft limits. If the soft limit of each control group is very high, they are pushed back as much as possible to make sure that one control group does not starve the others of memory. -Please note that soft limits is a best effort feature, it comes with +Please note that soft limits is a best-effort feature; it comes with no guarantees, but it does its best to make sure that when memory is heavily contended for, memory is allocated based on the soft limit -hints/setup. Currently soft limit based reclaim is setup such that +hints/setup. Currently soft limit based reclaim is set up such that it gets invoked from balance_pgdat (kswapd). 7.1 Interface @@ -592,7 +592,7 @@ page tables. 8.1 Interface -This feature is disabled by default. It can be enabled(and disabled again) by +This feature is disabled by default. It can be enabledi (and disabled again) by writing to memory.move_charge_at_immigrate of the destination cgroup. If you want to enable it: @@ -601,8 +601,8 @@ If you want to enable it: Note: Each bits of move_charge_at_immigrate has its own meaning about what type of charges should be moved. See 8.2 for details. -Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread - group. +Note: Charges are moved only when you move mm->owner, in other words, + a leader of a thread group. Note: If we cannot find enough space for the task in the destination cgroup, we try to make space by reclaiming memory. Task migration may fail if we cannot make enough space. @@ -612,25 +612,25 @@ And if you want disable it again: # echo 0 > memory.move_charge_at_immigrate -8.2 Type of charges which can be move +8.2 Type of charges which can be moved -Each bits of move_charge_at_immigrate has its own meaning about what type of -charges should be moved. But in any cases, it must be noted that an account of -a page or a swap can be moved only when it is charged to the task's current(old) -memory cgroup. +Each bit in move_charge_at_immigrate has its own meaning about what type of +charges should be moved. But in any case, it must be noted that an account of +a page or a swap can be moved only when it is charged to the task's current +(old) memory cgroup. bit | what type of charges would be moved ? -----+------------------------------------------------------------------------ - 0 | A charge of an anonymous page(or swap of it) used by the target task. - | You must enable Swap Extension(see 2.4) to enable move of swap charges. + 0 | A charge of an anonymous page (or swap of it) used by the target task. + | You must enable Swap Extension (see 2.4) to enable move of swap charges. -----+------------------------------------------------------------------------ - 1 | A charge of file pages(normal file, tmpfs file(e.g. ipc shared memory) + 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) | and swaps of tmpfs file) mmapped by the target task. Unlike the case of - | anonymous pages, file pages(and swaps) in the range mmapped by the task + | anonymous pages, file pages (and swaps) in the range mmapped by the task | will be moved even if the task hasn't done page fault, i.e. they might | not be the task's "RSS", but other task's "RSS" that maps the same file. - | And mapcount of the page is ignored(the page can be moved even if - | page_mapcount(page) > 1). 
You must enable Swap Extension(see 2.4) to + | And mapcount of the page is ignored (the page can be moved even if + | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to | enable move of swap charges. 8.3 TODO @@ -640,11 +640,11 @@ memory cgroup. 9. Memory thresholds -Memory cgroup implements memory thresholds using cgroups notification +Memory cgroup implements memory thresholds using the cgroups notification API (see cgroups.txt). It allows to register multiple memory and memsw thresholds and gets notifications when it crosses. -To register a threshold application need: +To register a threshold, an application must: - create an eventfd using eventfd(2); - open memory.usage_in_bytes or memory.memsw.usage_in_bytes; - write string like " " to @@ -659,24 +659,24 @@ It's applicable for root and non-root cgroup. memory.oom_control file is for OOM notification and other controls. -Memory cgroup implements OOM notifier using cgroup notification +Memory cgroup implements OOM notifier using the cgroup notification API (See cgroups.txt). It allows to register multiple OOM notification delivery and gets notification when OOM happens. -To register a notifier, application need: +To register a notifier, an application must: - create an eventfd using eventfd(2) - open memory.oom_control file - write string like " " to cgroup.event_control -Application will be notified through eventfd when OOM happens. -OOM notification doesn't work for root cgroup. +The application will be notified through eventfd when OOM happens. +OOM notification doesn't work for the root cgroup. -You can disable OOM-killer by writing "1" to memory.oom_control file, as: +You can disable the OOM-killer by writing "1" to memory.oom_control file, as: #echo 1 > memory.oom_control -This operation is only allowed to the top cgroup of sub-hierarchy. +This operation is only allowed to the top cgroup of a sub-hierarchy. If OOM-killer is disabled, tasks under cgroup will hang/sleep in memory cgroup's OOM-waitqueue when they request accountable memory. From 4bd2c1ee4b439d926e437a4841e8145230df98c9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Oct 2012 16:33:10 -0700 Subject: [PATCH 124/160] memcg: cleanup kmem tcp ifdefs TCP kmem accounting is currently guarded by CONFIG_MEMCG_KMEM ifdefs but the code is not used if !CONFIG_INET so we should rather test for both. The same applies to net/sock.h, net/ip.h and net/tcp_memcontrol.h but let's keep those outside of any ifdefs because it is considered safer wrt. future maintainability. 
Tested with - CONFIG_INET && CONFIG_MEMCG_KMEM - !CONFIG_INET && CONFIG_MEMCG_KMEM - CONFIG_INET && !CONFIG_MEMCG_KMEM - !CONFIG_INET && !CONFIG_MEMCG_KMEM Signed-off-by: Sachin Kamat Signed-off-by: Michal Hocko Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a72f2ffdc3d0..4f5f93657246 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -51,6 +51,7 @@ #include #include "internal.h" #include +#include #include #include @@ -326,7 +327,7 @@ struct mem_cgroup { struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; -#ifdef CONFIG_INET +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct tcp_memcontrol tcp_mem; #endif }; @@ -412,9 +413,7 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) } /* Writing them here to avoid exposing memcg's inner layout */ -#ifdef CONFIG_MEMCG_KMEM -#include -#include +#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) static bool mem_cgroup_is_root(struct mem_cgroup *memcg); void sock_update_memcg(struct sock *sk) @@ -461,7 +460,6 @@ void sock_release_memcg(struct sock *sk) } } -#ifdef CONFIG_INET struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) { if (!memcg || mem_cgroup_is_root(memcg)) @@ -470,10 +468,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) return &memcg->tcp_mem.cg_proto; } EXPORT_SYMBOL(tcp_proto_cgroup); -#endif /* CONFIG_INET */ -#endif /* CONFIG_MEMCG_KMEM */ -#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) static void disarm_sock_keys(struct mem_cgroup *memcg) { if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) From 7ffc0edc49d0df5dac077c1830e2533b27d3a4ed Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Oct 2012 16:33:13 -0700 Subject: [PATCH 125/160] memcg: move mem_cgroup_is_root upwards kmem code uses this function and it is better to not use forward declarations for static inline functions as some (older) compilers don't like it: gcc version 4.3.4 [gcc-4_3-branch revision 152973] (SUSE Linux) mm/memcontrol.c:421: warning: `mem_cgroup_is_root' declared inline after being called mm/memcontrol.c:421: warning: previous declaration of `mem_cgroup_is_root' was here Signed-off-by: Michal Hocko Cc: Glauber Costa Cc: Sachin Kamat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4f5f93657246..7acf43bf04a2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -412,10 +412,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) return container_of(s, struct mem_cgroup, css); } +static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) +{ + return (memcg == root_mem_cgroup); +} + /* Writing them here to avoid exposing memcg's inner layout */ #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) -static bool mem_cgroup_is_root(struct mem_cgroup *memcg); void sock_update_memcg(struct sock *sk) { if (mem_cgroup_sockets_enabled) { @@ -1011,11 +1015,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) -{ - return (memcg == root_mem_cgroup); -} - void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { struct mem_cgroup *memcg; From ec4d9f626d5908b6052c2973f37992f1db52e967 Mon Sep 17 
00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:14 -0700 Subject: [PATCH 126/160] mm: fix invalidate_complete_page2() lock ordering In fuzzing with trinity, lockdep protested "possible irq lock inversion dependency detected" when isolate_lru_page() reenabled interrupts while still holding the supposedly irq-safe tree_lock: invalidate_inode_pages2 invalidate_complete_page2 spin_lock_irq(&mapping->tree_lock) clear_page_mlock isolate_lru_page spin_unlock_irq(&zone->lru_lock) isolate_lru_page() is correct to enable interrupts unconditionally: invalidate_complete_page2() is incorrect to call clear_page_mlock() while holding tree_lock, which is supposed to nest inside lru_lock. Both truncate_complete_page() and invalidate_complete_page() call clear_page_mlock() before taking tree_lock to remove page from radix_tree. I guess invalidate_complete_page2() preferred to test PageDirty (again) under tree_lock before committing to the munlock; but since the page has already been unmapped, its state is already somewhat inconsistent, and no worse if clear_page_mlock() moved up. Reported-by: Sasha Levin Deciphered-by: Andrew Morton Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Cc: Michel Lespinasse Cc: Ying Han Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/truncate.c b/mm/truncate.c index 75801acdaac7..f38055cb8af6 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -394,11 +394,12 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; + clear_page_mlock(page); + spin_lock_irq(&mapping->tree_lock); if (PageDirty(page)) goto failed; - clear_page_mlock(page); BUG_ON(page_has_private(page)); __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); From 39b5f29ac1f988c1615fbc9c69f6651ab0d0c3c7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:18 -0700 Subject: [PATCH 127/160] mm: remove vma arg from page_evictable page_evictable(page, vma) is an irritant: almost all its callers pass NULL for vma. Remove the vma arg and use mlocked_vma_newpage(vma, page) explicitly in the couple of places it's needed. But in those places we don't even need page_evictable() itself! They're dealing with a freshly allocated anonymous page, which has no "mapping" and cannot be mlocked yet. Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Michel Lespinasse Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/unevictable-lru.txt | 10 +++------- include/linux/swap.h | 2 +- mm/internal.h | 5 ++--- mm/ksm.c | 2 +- mm/rmap.c | 2 +- mm/swap.c | 2 +- mm/vmscan.c | 27 +++++++++------------------ 7 files changed, 18 insertions(+), 32 deletions(-) diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index 323ff5dba1cc..a68db7692ee8 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt @@ -197,12 +197,8 @@ the pages are also "rescued" from the unevictable list in the process of freeing them. page_evictable() also checks for mlocked pages by testing an additional page -flag, PG_mlocked (as wrapped by PageMlocked()). If the page is NOT mlocked, -and a non-NULL VMA is supplied, page_evictable() will check whether the VMA is -VM_LOCKED via is_mlocked_vma(). 
is_mlocked_vma() will SetPageMlocked() and -update the appropriate statistics if the vma is VM_LOCKED. This method allows -efficient "culling" of pages in the fault path that are being faulted in to -VM_LOCKED VMAs. +flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is +faulted into a VM_LOCKED vma, or found in a vma being VM_LOCKED. VMSCAN'S HANDLING OF UNEVICTABLE PAGES @@ -651,7 +647,7 @@ PAGE RECLAIM IN shrink_*_list() ------------------------------- shrink_active_list() culls any obviously unevictable pages - i.e. -!page_evictable(page, NULL) - diverting these to the unevictable list. +!page_evictable(page) - diverting these to the unevictable list. However, shrink_active_list() only sees unevictable pages that made it onto the active/inactive lru lists. Note that these pages do not have PageUnevictable set - otherwise they would be on the unevictable list and shrink_active_list diff --git a/include/linux/swap.h b/include/linux/swap.h index 388e70601413..68df9c17fbbb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -281,7 +281,7 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) } #endif -extern int page_evictable(struct page *page, struct vm_area_struct *vma); +extern int page_evictable(struct page *page); extern void check_move_unevictable_pages(struct page **, int nr_pages); extern unsigned long scan_unevictable_pages; diff --git a/mm/internal.h b/mm/internal.h index 7f72f249bc29..78f25d6cc6a7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -168,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } /* - * Called only in fault path via page_evictable() for a new page - * to determine if it's being mapped into a LOCKED vma. - * If so, mark page as mlocked. + * Called only in fault path, to determine if a new page is being + * mapped into a LOCKED vma. If it is, mark page as mlocked. */ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, struct page *page) diff --git a/mm/ksm.c b/mm/ksm.c index 14ee5cf8a513..ecbc090cdaad 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1586,7 +1586,7 @@ struct page *ksm_does_need_to_copy(struct page *page, SetPageSwapBacked(new_page); __set_page_locked(new_page); - if (page_evictable(new_page, vma)) + if (!mlocked_vma_newpage(vma, new_page)) lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); else add_page_to_unevictable_list(new_page); diff --git a/mm/rmap.c b/mm/rmap.c index 28777412de62..0d86433e42d7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1080,7 +1080,7 @@ void page_add_new_anon_rmap(struct page *page, else __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); - if (page_evictable(page, vma)) + if (!mlocked_vma_newpage(vma, page)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); else add_page_to_unevictable_list(page); diff --git a/mm/swap.c b/mm/swap.c index f76c76c7501b..6310dc2008ff 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -751,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, SetPageLRU(page_tail); - if (page_evictable(page_tail, NULL)) { + if (page_evictable(page_tail)) { if (PageActive(page)) { SetPageActive(page_tail); active = 1; diff --git a/mm/vmscan.c b/mm/vmscan.c index b010efc43891..8b627309dd44 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -553,7 +553,7 @@ void putback_lru_page(struct page *page) redo: ClearPageUnevictable(page); - if (page_evictable(page, NULL)) { + if (page_evictable(page)) { /* * For evictable pages, we can use the cache. 
* In event of a race, worst case is we end up with an @@ -587,7 +587,7 @@ redo: * page is on unevictable list, it never be freed. To avoid that, * check after we added it to the list, again. */ - if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { + if (lru == LRU_UNEVICTABLE && page_evictable(page)) { if (!isolate_lru_page(page)) { put_page(page); goto redo; @@ -709,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, sc->nr_scanned++; - if (unlikely(!page_evictable(page, NULL))) + if (unlikely(!page_evictable(page))) goto cull_mlocked; if (!sc->may_unmap && page_mapped(page)) @@ -1217,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) VM_BUG_ON(PageLRU(page)); list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { + if (unlikely(!page_evictable(page))) { spin_unlock_irq(&zone->lru_lock); putback_lru_page(page); spin_lock_irq(&zone->lru_lock); @@ -1470,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan, page = lru_to_page(&l_hold); list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { + if (unlikely(!page_evictable(page))) { putback_lru_page(page); continue; } @@ -3414,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) /* * page_evictable - test whether a page is evictable * @page: the page to test - * @vma: the VMA in which the page is or will be mapped, may be NULL * * Test whether page is evictable--i.e., should be placed on active/inactive - * lists vs unevictable list. The vma argument is !NULL when called from the - * fault path to determine how to instantate a new page. + * lists vs unevictable list. * * Reasons page might not be evictable: * (1) page's mapping marked unevictable * (2) page is part of an mlocked VMA * */ -int page_evictable(struct page *page, struct vm_area_struct *vma) +int page_evictable(struct page *page) { - - if (mapping_unevictable(page_mapping(page))) - return 0; - - if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page))) - return 0; - - return 1; + return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); } #ifdef CONFIG_SHMEM @@ -3472,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) if (!PageLRU(page) || !PageUnevictable(page)) continue; - if (page_evictable(page, NULL)) { + if (page_evictable(page)) { enum lru_list lru = page_lru_base_type(page); VM_BUG_ON(PageActive(page)); From e6c509f85455041d3d7c4b863bf80bc294288cc1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:19 -0700 Subject: [PATCH 128/160] mm: use clear_page_mlock() in page_remove_rmap() We had thought that pages could no longer get freed while still marked as mlocked; but Johannes Weiner posted this program to demonstrate that truncating an mlocked private file mapping containing COWed pages is still mishandled: #include #include #include #include #include #include #include int main(void) { char *map; int fd; system("grep mlockfreed /proc/vmstat"); fd = open("chigurh", O_CREAT|O_EXCL|O_RDWR); unlink("chigurh"); ftruncate(fd, 4096); map = mmap(NULL, 4096, PROT_WRITE, MAP_PRIVATE, fd, 0); map[0] = 11; mlock(map, sizeof(fd)); ftruncate(fd, 0); close(fd); munlock(map, sizeof(fd)); munmap(map, 4096); system("grep mlockfreed /proc/vmstat"); return 0; } The anon COWed pages are not caught by truncation's clear_page_mlock() of the pagecache pages; but unmap_mapping_range() unmaps them, so we ought to look out for them there in page_remove_rmap(). 
Indeed, why should truncation or invalidation be doing the clear_page_mlock() when removing from pagecache? mlock is a property of mapping in userspace, not a property of pagecache: an mlocked unmapped page is nonsensical. Reported-by: Johannes Weiner Signed-off-by: Hugh Dickins Cc: Mel Gorman Cc: Rik van Riel Cc: Michel Lespinasse Cc: Ying Han Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 7 +------ mm/memory.c | 10 +++++----- mm/mlock.c | 16 +++------------- mm/rmap.c | 4 ++++ mm/truncate.c | 4 ---- 5 files changed, 13 insertions(+), 28 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 78f25d6cc6a7..4dc93e2fe69e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -201,12 +201,7 @@ extern void munlock_vma_page(struct page *page); * If called for a page that is still mapped by mlocked vmas, all we do * is revert to lazy LRU behaviour -- semantics are not broken. */ -extern void __clear_page_mlock(struct page *page); -static inline void clear_page_mlock(struct page *page) -{ - if (unlikely(TestClearPageMlocked(page))) - __clear_page_mlock(page); -} +extern void clear_page_mlock(struct page *page); /* * mlock_migrate_page - called only from migrate_page_copy() to diff --git a/mm/memory.c b/mm/memory.c index d205e4381a34..5f5d1f039bf4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1577,12 +1577,12 @@ split_fallthrough: if (page->mapping && trylock_page(page)) { lru_add_drain(); /* push cached pages to LRU */ /* - * Because we lock page here and migration is - * blocked by the pte's page reference, we need - * only check for file-cache page truncation. + * Because we lock page here, and migration is + * blocked by the pte's page reference, and we + * know the page is still mapped, we don't even + * need to check for file-cache page truncation. */ - if (page->mapping) - mlock_vma_page(page); + mlock_vma_page(page); unlock_page(page); } } diff --git a/mm/mlock.c b/mm/mlock.c index a948be4b7ba7..de7321592897 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -51,13 +51,10 @@ EXPORT_SYMBOL(can_do_mlock); /* * LRU accounting for clear_page_mlock() */ -void __clear_page_mlock(struct page *page) +void clear_page_mlock(struct page *page) { - VM_BUG_ON(!PageLocked(page)); - - if (!page->mapping) { /* truncated ? */ + if (!TestClearPageMlocked(page)) return; - } dec_zone_page_state(page, NR_MLOCK); count_vm_event(UNEVICTABLE_PGCLEARED); @@ -290,14 +287,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); if (page && !IS_ERR(page)) { lock_page(page); - /* - * Like in __mlock_vma_pages_range(), - * because we lock page here and migration is - * blocked by the elevated reference, we need - * only check for file-cache page truncation. 
- */ - if (page->mapping) - munlock_vma_page(page); + munlock_vma_page(page); unlock_page(page); put_page(page); } diff --git a/mm/rmap.c b/mm/rmap.c index 0d86433e42d7..bf03149f495c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1155,7 +1155,10 @@ void page_remove_rmap(struct page *page) } else { __dec_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); + mem_cgroup_end_update_page_stat(page, &locked, &flags); } + if (unlikely(PageMlocked(page))) + clear_page_mlock(page); /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap @@ -1165,6 +1168,7 @@ void page_remove_rmap(struct page *page) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ + return; out: if (!anon) mem_cgroup_end_update_page_stat(page, &locked, &flags); diff --git a/mm/truncate.c b/mm/truncate.c index f38055cb8af6..d51ce92d6e83 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) cancel_dirty_page(page, PAGE_CACHE_SIZE); - clear_page_mlock(page); ClearPageMappedToDisk(page); delete_from_page_cache(page); return 0; @@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, 0)) return 0; - clear_page_mlock(page); ret = remove_mapping(mapping, page); return ret; @@ -394,8 +392,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - clear_page_mlock(page); - spin_lock_irq(&mapping->tree_lock); if (PageDirty(page)) goto failed; From a0c5e813f087dffc0d9b173d2e7d3328b1482fd5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:21 -0700 Subject: [PATCH 129/160] mm: remove free_page_mlock We should not be seeing non-0 unevictable_pgs_mlockfreed any longer. So remove free_page_mlock() from the page freeing paths: __PG_MLOCKED is already in PAGE_FLAGS_CHECK_AT_FREE, so free_pages_check() will now be checking it, reporting "BUG: Bad page state" if it's ever found set. Comment UNEVICTABLE_MLOCKFREED and unevictable_pgs_mlockfreed always 0. Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Cc: Michel Lespinasse Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 2 +- mm/page_alloc.c | 17 ----------------- mm/vmstat.c | 2 +- 3 files changed, 2 insertions(+), 19 deletions(-) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 57f7b1091511..ede4ddd46695 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -52,7 +52,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, UNEVICTABLE_PGMUNLOCKED, UNEVICTABLE_PGCLEARED, /* on COW, page truncate */ UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */ - UNEVICTABLE_MLOCKFREED, + UNEVICTABLE_MLOCKFREED, /* no longer useful: always zero */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE THP_FAULT_ALLOC, THP_FAULT_FALLBACK, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 00750bc08a3a..dbb53866c3aa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -598,17 +598,6 @@ out: zone->free_area[order].nr_free++; } -/* - * free_page_mlock() -- clean up attempts to free and mlocked() page. - * Page should not be on lru, so no need to fix that up. - * free_pages_check() will verify... 
- */ -static inline void free_page_mlock(struct page *page) -{ - __dec_zone_page_state(page, NR_MLOCK); - __count_vm_event(UNEVICTABLE_MLOCKFREED); -} - static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | @@ -728,15 +717,12 @@ static bool free_pages_prepare(struct page *page, unsigned int order) static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; - int wasMlocked = __TestClearPageMlocked(page); int migratetype; if (!free_pages_prepare(page, order)) return; local_irq_save(flags); - if (unlikely(wasMlocked)) - free_page_mlock(page); __count_vm_events(PGFREE, 1 << order); migratetype = get_pageblock_migratetype(page); set_freepage_migratetype(page, migratetype); @@ -1310,7 +1296,6 @@ void free_hot_cold_page(struct page *page, int cold) struct per_cpu_pages *pcp; unsigned long flags; int migratetype; - int wasMlocked = __TestClearPageMlocked(page); if (!free_pages_prepare(page, 0)) return; @@ -1318,8 +1303,6 @@ void free_hot_cold_page(struct page *page, int cold) migratetype = get_pageblock_migratetype(page); set_freepage_migratetype(page, migratetype); local_irq_save(flags); - if (unlikely(wasMlocked)) - free_page_mlock(page); __count_vm_event(PGFREE); /* diff --git a/mm/vmstat.c b/mm/vmstat.c index acbd85c983e6..05e3a9913744 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -782,7 +782,7 @@ const char * const vmstat_text[] = { "unevictable_pgs_munlocked", "unevictable_pgs_cleared", "unevictable_pgs_stranded", - "unevictable_pgs_mlockfreed", + "unevictable_pgs_mlockfreed", /* no longer useful: always zero */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE "thp_fault_alloc", From 957f822a0ab95e88b146638bad6209bbc315bedd Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 8 Oct 2012 16:33:24 -0700 Subject: [PATCH 130/160] mm, numa: reclaim from all nodes within reclaim distance RECLAIM_DISTANCE represents the distance between nodes at which it is deemed too costly to allocate from; it's preferred to try to reclaim from a local zone before falling back to allocating on a remote node with such a distance. To do this, zone_reclaim_mode is set if the distance between any two nodes on the system is greather than this distance. This, however, ends up causing the page allocator to reclaim from every zone regardless of its affinity. What we really want is to reclaim only from zones that are closer than RECLAIM_DISTANCE. This patch adds a nodemask to each node that represents the set of nodes that are within this distance. During the zone iteration, if the bit for a zone's node is set for the local node, then reclaim is attempted; otherwise, the zone is skipped. 
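To make the effect concrete, here is a toy user-space model of the per-node reclaim_nodes mask this patch introduces (the three-node distance table is made up, this is not kernel code; RECLAIM_DISTANCE defaults to 30 in the kernel):

	#include <stdio.h>

	#define NR_NODES		3
	#define RECLAIM_DISTANCE	30

	/* Hypothetical SLIT-style table: node 2 is "far" from nodes 0 and 1. */
	static const int node_distance[NR_NODES][NR_NODES] = {
		{ 10, 20, 40 },
		{ 20, 10, 40 },
		{ 40, 40, 10 },
	};

	int main(void)
	{
		int nid, i;

		for (nid = 0; nid < NR_NODES; nid++) {
			printf("node %d may reclaim from:", nid);
			/* mirrors init_zone_allows_reclaim(): one bit per close node */
			for (i = 0; i < NR_NODES; i++)
				if (node_distance[nid][i] <= RECLAIM_DISTANCE)
					printf(" %d", i);
			printf("\n");
		}
		return 0;
	}

With this invented topology, node 0's mask contains only nodes 0 and 1, so an allocation from node 0 that falls back to node 2 skips zone reclaim there; previously the mere presence of the distance-40 pair would have enabled zone_reclaim_mode and triggered reclaim on every zone.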
[akpm@linux-foundation.org: fix CONFIG_NUMA=n build] Signed-off-by: David Rientjes Cc: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + mm/page_alloc.c | 41 ++++++++++++++++++++++++++++++----------- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d240efa8f846..a5578871d033 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -709,6 +709,7 @@ typedef struct pglist_data { unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; + nodemask_t reclaim_nodes; /* Nodes allowed to reclaim from */ wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dbb53866c3aa..9b8e6243a524 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1799,6 +1799,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); } +static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) +{ + return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); +} + +static void __paginginit init_zone_allows_reclaim(int nid) +{ + int i; + + for_each_online_node(i) + if (node_distance(nid, i) <= RECLAIM_DISTANCE) { + node_set(i, NODE_DATA(nid)->reclaim_nodes); + zone_reclaim_mode = 1; + } +} + #else /* CONFIG_NUMA */ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) @@ -1819,6 +1835,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) static void zlc_clear_zones_full(struct zonelist *zonelist) { } + +static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) +{ + return true; +} + +static inline void init_zone_allows_reclaim(int nid) +{ +} #endif /* CONFIG_NUMA */ /* @@ -1903,7 +1928,8 @@ zonelist_scan: did_zlc_setup = 1; } - if (zone_reclaim_mode == 0) + if (zone_reclaim_mode == 0 || + !zone_allows_reclaim(preferred_zone, zone)) goto this_zone_full; /* @@ -3364,21 +3390,13 @@ static void build_zonelists(pg_data_t *pgdat) j = 0; while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { - int distance = node_distance(local_node, node); - - /* - * If another node is sufficiently far away then it is better - * to reclaim pages in a zone before going off node. - */ - if (distance > RECLAIM_DISTANCE) - zone_reclaim_mode = 1; - /* * We don't want to pressure a particular node. * So adding penalty to the first node in same * distance group to make it round-robin. 
*/ - if (distance != node_distance(local_node, prev_node)) + if (node_distance(local_node, node) != + node_distance(local_node, prev_node)) node_load[node] = load; prev_node = node; @@ -4552,6 +4570,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; + init_zone_allows_reclaim(nid); calculate_node_totalpages(pgdat, zones_size, zholes_size); alloc_node_mem_map(pgdat); From 00ea8990aadf754bbbb46ae85d4f2afba19508d8 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 8 Oct 2012 16:33:25 -0700 Subject: [PATCH 131/160] memory.txt: remove stray information Andi removed some outedated documentation from Documentation/memory.txt back in 2009 by commit 3b2b9a875ddc ("Documentation/memory.txt: remove some very outdated recommendations"), but the resulting document is not in a nice shape either. It seems to me like we are not losing anything by completely removing the file now. Signed-off-by: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/memory.txt | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 Documentation/memory.txt diff --git a/Documentation/memory.txt b/Documentation/memory.txt deleted file mode 100644 index 802efe58647c..000000000000 --- a/Documentation/memory.txt +++ /dev/null @@ -1,33 +0,0 @@ -There are several classic problems related to memory on Linux -systems. - - 1) There are some motherboards that will not cache above - a certain quantity of memory. If you have one of these - motherboards, your system will be SLOWER, not faster - as you add more memory. Consider exchanging your - motherboard. - -All of these problems can be addressed with the "mem=XXXM" boot option -(where XXX is the size of RAM to use in megabytes). -It can also tell Linux to use less memory than is actually installed. -If you use "mem=" on a machine with PCI, consider using "memmap=" to avoid -physical address space collisions. - -See the documentation of your boot loader (LILO, grub, loadlin, etc.) about -how to pass options to the kernel. - -There are other memory problems which Linux cannot deal with. Random -corruption of memory is usually a sign of serious hardware trouble. -Try: - - * Reducing memory settings in the BIOS to the most conservative - timings. - - * Adding a cooling fan. - - * Not overclocking your CPU. - - * Having the memory tested in a memory tester or exchanged - with the vendor. Consider testing it with memtest86 yourself. - - * Exchanging your CPU, cache, or motherboard for one that works. From 027ef6c87853b0a9df53175063028edb4950d476 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 8 Oct 2012 16:33:27 -0700 Subject: [PATCH 132/160] mm: thp: fix pmd_present for split_huge_page and PROT_NONE with THP In many places !pmd_present has been converted to pmd_none. For pmds that's equivalent and pmd_none is quicker so using pmd_none is better. However (unless we delete pmd_present) we should provide an accurate pmd_present too. This will avoid the risk of code thinking the pmd is non present because it's under __split_huge_page_map, see the pmd_mknotpresent there and the comment above it. If the page has been mprotected as PROT_NONE, it would also lead to a pmd_present false negative in the same way as the race with split_huge_page. 
Because the PSE bit stays on at all times (both during split_huge_page and when the _PAGE_PROTNONE bit get set), we could only check for the PSE bit, but checking the PROTNONE bit too is still good to remember pmd_present must always keep PROT_NONE into account. This explains a not reproducible BUG_ON that was seldom reported on the lists. The same issue is in pmd_large, it would go wrong with both PROT_NONE and if it races with split_huge_page. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Hugh Dickins Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index fc9948465293..a1f780d45f76 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -146,8 +146,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd) static inline int pmd_large(pmd_t pte) { - return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == - (_PAGE_PSE | _PAGE_PRESENT); + return pmd_flags(pte) & _PAGE_PSE; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -415,7 +414,13 @@ static inline int pte_hidden(pte_t pte) static inline int pmd_present(pmd_t pmd) { - return pmd_flags(pmd) & _PAGE_PRESENT; + /* + * Checking for _PAGE_PSE is needed too because + * split_huge_page will temporarily clear the present bit (but + * the _PAGE_PSE flag will remain set at all times while the + * _PAGE_PRESENT bit is clear). + */ + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } static inline int pmd_none(pmd_t pmd) From 36e4f20af833d1ce196e6a4ade05dc26c44652d1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Oct 2012 16:33:31 -0700 Subject: [PATCH 133/160] hugetlb: do not use vma_hugecache_offset() for vma_prio_tree_foreach Commit 0c176d52b0b2 ("mm: hugetlb: fix pgoff computation when unmapping page from vma") fixed pgoff calculation but it has replaced it by vma_hugecache_offset() which is not approapriate for offsets used for vma_prio_tree_foreach() because that one expects index in page units rather than in huge_page_shift. Johannes said: : The resulting index may not be too big, but it can be too small: assume : hpage size of 2M and the address to unmap to be 0x200000. This is regular : page index 512 and hpage index 1. If you have a VMA that maps the file : only starting at the second huge page, that VMAs vm_pgoff will be 512 but : you ask for offset 1 and miss it even though it does map the page of : interest. hugetlb_cow() will try to unmap, miss the vma, and retry the : cow until the allocation succeeds or the skipped vma(s) go away. Signed-off-by: Michal Hocko Acked-by: Hillf Danton Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Cc: David Rientjes Acked-by: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8536741f069b..de5d1dcf34fe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2480,7 +2480,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from page cache lookup which is in HPAGE_SIZE units. 
*/ address = address & huge_page_mask(h); - pgoff = vma_hugecache_offset(h, vma, address); + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; mapping = vma->vm_file->f_dentry->d_inode->i_mapping; /* From 2ec74c3ef2d8c58d71e0e00336fb6b891192155a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 8 Oct 2012 16:33:33 -0700 Subject: [PATCH 134/160] mm: move all mmu notifier invocations to be done outside the PT lock In order to allow sleeping during mmu notifier calls, we need to avoid invoking them under the page table spinlock. This patch solves the problem by calling invalidate_page notification after releasing the lock (but before freeing the page itself), or by wrapping the page invalidation with calls to invalidate_range_begin and invalidate_range_end. To prevent accidental changes to the invalidate_range_end arguments after the call to invalidate_range_begin, the patch introduces a convention of saving the arguments in consistently named locals: unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ ... mmun_start = ... mmun_end = ... mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ... mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); The patch changes code to use this convention for all calls to mmu_notifier_invalidate_range_start/end, except those where the calls are close enough so that anyone who glances at the code can see the values aren't changing. This patchset is a preliminary step towards on-demand paging design to be added to the RDMA stack. Why do we want on-demand paging for Infiniband? Applications register memory with an RDMA adapter using system calls, and subsequently post IO operations that refer to the corresponding virtual addresses directly to HW. Until now, this was achieved by pinning the memory during the registration calls. The goal of on demand paging is to avoid pinning the pages of registered memory regions (MRs). This will allow users the same flexibility they get when swapping any other part of their processes address spaces. Instead of requiring the entire MR to fit in physical memory, we can allow the MR to be larger, and only fit the current working set in physical memory. Why should anyone care? What problems are users currently experiencing? This can make programming with RDMA much simpler. Today, developers that are working with more data than their RAM can hold need either to deregister and reregister memory regions throughout their process's life, or keep a single memory region and copy the data to it. On demand paging will allow these developers to register a single MR at the beginning of their process's life, and let the operating system manage which pages needs to be fetched at a given time. In the future, we might be able to provide a single memory access key for each process that would provide the entire process's address as one large memory region, and the developers wouldn't need to register memory regions at all. Is there any prospect that any other subsystems will utilise these infrastructural changes? If so, which and how, etc? As for other subsystems, I understand that XPMEM wanted to sleep in MMU notifiers, as Christoph Lameter wrote at http://lkml.indiana.edu/hypermail/linux/kernel/0802.1/0460.html and perhaps Andrea knows about other use cases. Scheduling in mmu notifications is required since we need to sync the hardware with the secondary page tables change. 
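For readers unfamiliar with the pattern, a schematic kernel-style sketch of the convention follows (the function example_clear_range and its argument are invented for illustration; only mmu_notifier_invalidate_range_start/end, mm->page_table_lock and PAGE_SIZE are real, and the sketch is not meant to compile on its own):

	/* assumes <linux/mm.h> and <linux/mmu_notifier.h> */
	static void example_clear_range(struct mm_struct *mm, unsigned long addr)
	{
		unsigned long mmun_start;	/* For mmu_notifiers */
		unsigned long mmun_end;		/* For mmu_notifiers */

		mmun_start = addr;
		mmun_end = addr + PAGE_SIZE;

		/* notifiers may sleep, so call them outside the PT lock */
		mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

		spin_lock(&mm->page_table_lock);
		/* ... clear or replace the page table entry here ... */
		spin_unlock(&mm->page_table_lock);

		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	}

The start/end bracketing is what allows a notifier implementation to sleep, for example while waiting for a device TLB invalidation to complete.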
A TLB flush of an IO device is inherently slower than a CPU TLB flush, so our design works by sending the invalidation request to the device, and waiting for an interrupt before exiting the mmu notifier handler. Avi said: kvm may be a buyer. kvm::mmu_lock, which serializes guest page faults, also protects long operations such as destroying large ranges. It would be good to convert it into a spinlock, but as it is used inside mmu notifiers, this cannot be done. (there are alternatives, such as keeping the spinlock and using a generation counter to do the teardown in O(1), which is what the "may" is doing up there). [akpm@linux-foundation.orgpossible speed tweak in hugetlb_cow(), cleanups] Signed-off-by: Andrea Arcangeli Signed-off-by: Sagi Grimberg Signed-off-by: Haggai Eran Cc: Peter Zijlstra Cc: Xiao Guangrong Cc: Or Gerlitz Cc: Haggai Eran Cc: Shachar Raindel Cc: Liran Liss Cc: Christoph Lameter Cc: Avi Kivity Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 47 ------------------------------------ mm/filemap_xip.c | 4 ++- mm/huge_memory.c | 42 +++++++++++++++++++++++++++----- mm/hugetlb.c | 21 ++++++++++------ mm/memory.c | 28 ++++++++++++++------- mm/mremap.c | 8 ++++-- mm/rmap.c | 18 +++++++++++--- 7 files changed, 92 insertions(+), 76 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 4b7183e98061..bc823c4c028b 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -246,50 +246,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __mmu_notifier_mm_destroy(mm); } -/* - * These two macros will sometime replace ptep_clear_flush. - * ptep_clear_flush is implemented as macro itself, so this also is - * implemented as a macro until ptep_clear_flush will converted to an - * inline function, to diminish the risk of compilation failure. The - * invalidate_page method over time can be moved outside the PT lock - * and these two macros can be later removed. 
- */ -#define ptep_clear_flush_notify(__vma, __address, __ptep) \ -({ \ - pte_t __pte; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - __pte = ptep_clear_flush(___vma, ___address, __ptep); \ - mmu_notifier_invalidate_page(___vma->vm_mm, ___address); \ - __pte; \ -}) - -#define pmdp_clear_flush_notify(__vma, __address, __pmdp) \ -({ \ - pmd_t __pmd; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ - mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ - (__address)+HPAGE_PMD_SIZE);\ - __pmd = pmdp_clear_flush(___vma, ___address, __pmdp); \ - mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ - (__address)+HPAGE_PMD_SIZE); \ - __pmd; \ -}) - -#define pmdp_splitting_flush_notify(__vma, __address, __pmdp) \ -({ \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ - mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ - (__address)+HPAGE_PMD_SIZE);\ - pmdp_splitting_flush(___vma, ___address, __pmdp); \ - mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ - (__address)+HPAGE_PMD_SIZE); \ -}) - #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ @@ -380,9 +336,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young -#define ptep_clear_flush_notify ptep_clear_flush -#define pmdp_clear_flush_notify pmdp_clear_flush -#define pmdp_splitting_flush_notify pmdp_splitting_flush #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index a52daee11d3f..a912da6ddfd4 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -192,11 +192,13 @@ retry: if (pte) { /* Nuke the page table entry. 
*/ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); page_remove_rmap(page); dec_mm_counter(mm, MM_FILEPAGES); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); + /* must invalidate_page _before_ freeing the page */ + mmu_notifier_invalidate_page(mm, address); page_cache_release(page); } } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0e7740923fb9..08a943b9cf95 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -787,6 +787,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pmd_t _pmd; int ret = 0, i; struct page **pages; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, GFP_KERNEL); @@ -823,12 +825,16 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, cond_resched(); } + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON(!PageHead(page)); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(mm); @@ -851,6 +857,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, page_remove_rmap(page); spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + ret |= VM_FAULT_WRITE; put_page(page); @@ -859,6 +867,7 @@ out: out_free_pages: spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { mem_cgroup_uncharge_page(pages[i]); @@ -875,6 +884,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, int ret = 0; struct page *page, *new_page; unsigned long haddr; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ VM_BUG_ON(!vma->anon_vma); spin_lock(&mm->page_table_lock); @@ -925,20 +936,24 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + spin_lock(&mm->page_table_lock); put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(&mm->page_table_lock); mem_cgroup_uncharge_page(new_page); put_page(new_page); - goto out; + goto out_mn; } else { pmd_t entry; VM_BUG_ON(!PageHead(page)); entry = mk_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); entry = pmd_mkhuge(entry); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache(vma, address, pmd); @@ -946,9 +961,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, put_page(page); ret |= VM_FAULT_WRITE; } + spin_unlock(&mm->page_table_lock); +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); +out: + return ret; out_unlock: spin_unlock(&mm->page_table_lock); -out: return ret; } @@ -1162,7 +1181,11 @@ static int __split_huge_page_splitting(struct page *page, struct mm_struct *mm = vma->vm_mm; pmd_t *pmd; int ret = 0; + /* For mmu_notifiers */ + 
const unsigned long mmun_start = address; + const unsigned long mmun_end = address + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); @@ -1174,10 +1197,11 @@ static int __split_huge_page_splitting(struct page *page, * and it won't wait on the anon_vma->root->mutex to * serialize against split_huge_page*. */ - pmdp_splitting_flush_notify(vma, address, pmd); + pmdp_splitting_flush(vma, address, pmd); ret = 1; } spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return ret; } @@ -1898,6 +1922,8 @@ static void collapse_huge_page(struct mm_struct *mm, spinlock_t *ptl; int isolated; unsigned long hstart, hend; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1952,6 +1978,9 @@ static void collapse_huge_page(struct mm_struct *mm, pte = pte_offset_map(pmd, address); ptl = pte_lockptr(mm, pmd); + mmun_start = address; + mmun_end = address + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes @@ -1959,8 +1988,9 @@ static void collapse_huge_page(struct mm_struct *mm, * huge and small TLB entries for the same virtual address * to avoid the risk of CPU bugs in that area. */ - _pmd = pmdp_clear_flush_notify(vma, address, pmd); + _pmd = pmdp_clear_flush(vma, address, pmd); spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); spin_lock(ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de5d1dcf34fe..993f7c1820a8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2355,13 +2355,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + const unsigned long mmun_start = start; /* For mmu_notifiers */ + const unsigned long mmun_end = end; /* For mmu_notifiers */ WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); tlb_start_vma(tlb, vma); - mmu_notifier_invalidate_range_start(mm, start, end); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); again: spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { @@ -2425,7 +2427,7 @@ again: if (address < end && !ref_page) goto again; } - mmu_notifier_invalidate_range_end(mm, start, end); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); tlb_end_vma(tlb, vma); } @@ -2525,6 +2527,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, struct page *old_page, *new_page; int avoidcopy; int outside_reserve = 0; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ old_page = pte_page(pte); @@ -2611,6 +2615,9 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); + mmun_start = address & huge_page_mask(h); + mmun_end = mmun_start + huge_page_size(h); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* * Retake the page_table_lock to check for racing updates * before the page tables are altered @@ -2619,9 +2626,6 @@ retry_avoidcopy: ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if 
(likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ - mmu_notifier_invalidate_range_start(mm, - address & huge_page_mask(h), - (address & huge_page_mask(h)) + huge_page_size(h)); huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); @@ -2629,10 +2633,11 @@ retry_avoidcopy: hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; - mmu_notifier_invalidate_range_end(mm, - address & huge_page_mask(h), - (address & huge_page_mask(h)) + huge_page_size(h)); } + spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); page_cache_release(new_page); page_cache_release(old_page); return 0; diff --git a/mm/memory.c b/mm/memory.c index 5f5d1f039bf4..b03a4a21c1d0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, add_taint(TAINT_BAD_PAGE); } -static inline int is_cow_mapping(vm_flags_t flags) +static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } @@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long next; unsigned long addr = vma->vm_start; unsigned long end = vma->vm_end; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + bool is_cow; int ret; /* @@ -1072,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. */ - if (is_cow_mapping(vma->vm_flags)) - mmu_notifier_invalidate_range_start(src_mm, addr, end); + is_cow = is_cow_mapping(vma->vm_flags); + mmun_start = addr; + mmun_end = end; + if (is_cow) + mmu_notifier_invalidate_range_start(src_mm, mmun_start, + mmun_end); ret = 0; dst_pgd = pgd_offset(dst_mm, addr); @@ -1089,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, } } while (dst_pgd++, src_pgd++, addr = next, addr != end); - if (is_cow_mapping(vma->vm_flags)) - mmu_notifier_invalidate_range_end(src_mm, - vma->vm_start, end); + if (is_cow) + mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); return ret; } @@ -2516,7 +2522,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl, pte_t orig_pte) __releases(ptl) { - struct page *old_page, *new_page; + struct page *old_page, *new_page = NULL; pte_t entry; int ret = 0; int page_mkwrite = 0; @@ -2760,10 +2766,14 @@ gotten: } else mem_cgroup_uncharge_page(new_page); - if (new_page) - page_cache_release(new_page); unlock: pte_unmap_unlock(page_table, ptl); + if (new_page) { + if (new_page == old_page) + /* cow happened, notify before releasing old_page */ + mmu_notifier_invalidate_page(mm, address); + page_cache_release(new_page); + } if (old_page) { /* * Don't let another task, with possibly unlocked vma, diff --git a/mm/mremap.c b/mm/mremap.c index 3b639a4b26bd..1b61c2d3307a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -149,11 +149,15 @@ unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long extent, next, old_end; pmd_t *old_pmd, *new_pmd; bool need_flush = false; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); - 
mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); + mmun_start = old_addr; + mmun_end = old_end; + mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); for (; old_addr < old_end; old_addr += extent, new_addr += extent) { cond_resched(); @@ -197,7 +201,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (likely(need_flush)) flush_tlb_range(vma, old_end-len, old_addr); - mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); return len + old_addr - old_end; /* how much done */ } diff --git a/mm/rmap.c b/mm/rmap.c index bf03149f495c..7df7984d476c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -884,7 +884,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, pte_t entry; flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush_notify(vma, address, pte); + entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(mm, address, pte, entry); @@ -892,6 +892,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, } pte_unmap_unlock(pte, ptl); + + if (ret) + mmu_notifier_invalidate_page(mm, address); out: return ret; } @@ -1212,7 +1215,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) @@ -1274,6 +1277,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); + if (ret != SWAP_FAIL) + mmu_notifier_invalidate_page(mm, address); out: return ret; @@ -1338,6 +1343,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, spinlock_t *ptl; struct page *page; unsigned long address; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ unsigned long end; int ret = SWAP_AGAIN; int locked_vma = 0; @@ -1361,6 +1368,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, if (!pmd_present(*pmd)) return ret; + mmun_start = address; + mmun_end = end; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + /* * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, * keep the sem while scanning the cluster for mlocking pages. @@ -1394,7 +1405,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* If nonlinear, store the file page offset in the pte. 
*/ if (page->index != linear_page_index(vma, address)) @@ -1410,6 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (locked_vma) up_read(&vma->vm_mm->mmap_sem); return ret; From 6bdb913f0a70a4dfb7f066fb15e2d6f960701d00 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Mon, 8 Oct 2012 16:33:35 -0700 Subject: [PATCH 135/160] mm: wrap calls to set_pte_at_notify with invalidate_range_start and invalidate_range_end In order to allow sleeping during invalidate_page mmu notifier calls, we need to avoid calling when holding the PT lock. In addition to its direct calls, invalidate_page can also be called as a substitute for a change_pte call, in case the notifier client hasn't implemented change_pte. This patch drops the invalidate_page call from change_pte, and instead wraps all calls to change_pte with invalidate_range_start and invalidate_range_end calls. Note that change_pte still cannot sleep after this patch, and that clients implementing change_pte should not take action on it in case the number of outstanding invalidate_range_start calls is larger than one, otherwise they might miss a later invalidation. Signed-off-by: Haggai Eran Cc: Andrea Arcangeli Cc: Sagi Grimberg Cc: Peter Zijlstra Cc: Xiao Guangrong Cc: Or Gerlitz Cc: Haggai Eran Cc: Shachar Raindel Cc: Liran Liss Cc: Christoph Lameter Cc: Avi Kivity Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 5 +++++ mm/ksm.c | 21 +++++++++++++++++++-- mm/memory.c | 18 ++++++++++++------ mm/mmu_notifier.c | 6 ------ 4 files changed, 36 insertions(+), 14 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1d9c0a985960..98256bc71ee1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -141,10 +141,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; pte_t *ptep; int err; + /* For mmu_notifiers */ + const unsigned long mmun_start = addr; + const unsigned long mmun_end = addr + PAGE_SIZE; /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(page); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) @@ -173,6 +177,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; } diff --git a/mm/ksm.c b/mm/ksm.c index ecbc090cdaad..ae539f0b8aa1 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, spinlock_t *ptl; int swapped; int err = -EFAULT; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ addr = page_address_in_vma(page, vma); if (addr == -EFAULT) goto out; BUG_ON(PageTransCompound(page)); + + mmun_start = addr; + mmun_end = addr + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) - goto out; + goto out_mn; if (pte_write(*ptep) || pte_dirty(*ptep)) { pte_t entry; @@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, out_unlock: pte_unmap_unlock(ptep, ptl); +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return err; } @@ -776,6 
+785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, spinlock_t *ptl; unsigned long addr; int err = -EFAULT; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ addr = page_address_in_vma(page, vma); if (addr == -EFAULT) @@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd_present(*pmd)) goto out; + mmun_start = addr; + mmun_end = addr + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte_same(*ptep, orig_pte)) { pte_unmap_unlock(ptep, ptl); - goto out; + goto out_mn; } get_page(kpage); @@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pte_unmap_unlock(ptep, ptl); err = 0; +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return err; } diff --git a/mm/memory.c b/mm/memory.c index b03a4a21c1d0..01ec048ece8b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2527,6 +2527,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, int ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + bool mmun_called = false; /* For mmu_notifiers */ old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { @@ -2704,6 +2707,11 @@ gotten: if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; + mmun_start = address & PAGE_MASK; + mmun_end = (address & PAGE_MASK) + PAGE_SIZE; + mmun_called = true; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + /* * Re-check the pte - we dropped the lock */ @@ -2766,14 +2774,12 @@ gotten: } else mem_cgroup_uncharge_page(new_page); + if (new_page) + page_cache_release(new_page); unlock: pte_unmap_unlock(page_table, ptl); - if (new_page) { - if (new_page == old_page) - /* cow happened, notify before releasing old_page */ - mmu_notifier_invalidate_page(mm, address); - page_cache_release(new_page); - } + if (mmun_called) + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* * Don't let another task, with possibly unlocked vma, diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index c297142f0fe6..479a1e751a73 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -137,12 +137,6 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->change_pte) mn->ops->change_pte(mn, mm, address, pte); - /* - * Some drivers don't have change_pte, - * so we must call invalidate_page in that case. - */ - else if (mn->ops->invalidate_page) - mn->ops->invalidate_page(mn, mm, address); } srcu_read_unlock(&srcu, id); } From 082708072a4250f5c4dbc62065e7af93f5e45646 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:33:38 -0700 Subject: [PATCH 136/160] mm: revert 0def08e3 ("mm/mempolicy.c: check return code of check_range") Revert commit 0def08e3acc2 because check_range can't fail in migrate_to_node with considering current usecases. Quote from Johannes : I think it makes sense to revert. Not because of the semantics, but I : just don't see how check_range() could even fail for this callsite: : : 1. we pass mm->mmap->vm_start in there, so we should not fail due to : find_vma() : : 2. we pass MPOL_MF_DISCONTIG_OK, so the discontig checks do not apply : and so can not fail : : 3. 
we pass MPOL_MF_MOVE | MPOL_MF_MOVE_ALL, the page table loops will : continue until addr == end, so we never fail with -EIO. And I added a new VM_BUG_ON for checking migrate_to_node's future use cases which might pass MPOL_MF_STRICT. Suggested-by: Johannes Weiner Signed-off-by: Minchan Kim Acked-by: KOSAKI Motohiro Cc: Mel Gorman Cc: Christoph Lameter Cc: David Rientjes Cc: Vasiliy Kulikov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3d64b369180d..0b78fb9ea65b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -946,15 +946,18 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodemask_t nmask; LIST_HEAD(pagelist); int err = 0; - struct vm_area_struct *vma; nodes_clear(nmask); node_set(source, nmask); - vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + /* + * This does not "check" the range but isolates all pages that + * need migration. Between passing in the full user address + * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. + */ + VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); - if (IS_ERR(vma)) - return PTR_ERR(vma); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, From 5a883813845a2bb5ed2bd8c9240736c0740b156f Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:33:39 -0700 Subject: [PATCH 137/160] memory-hotplug: fix zone stat mismatch During memory-hotplug, I found NR_ISOLATED_[ANON|FILE] are increasing, causing the kernel to hang. When the system doesn't have enough free pages, it enters reclaim but never reclaims any pages due to too_many_isolated()==true and loops forever. The cause is that when we do memory-hotadd after memory-remove, __zone_pcp_update() clears a zone's ZONE_STAT_ITEMS in setup_pageset() although the vm_stat_diff counters of all CPUs still hold values. In addition, when we offline all pages of the zone, we reset them in zone_pcp_reset without draining, so we lose some zone stat items. 
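For illustration, here is a toy userspace model of that bookkeeping problem (names loosely borrowed from the kernel; this is not the kernel code itself): the per-CPU deltas must be folded back into the global counters before the per-CPU state is thrown away, which is exactly what the new drain_zonestat() below does.

#include <stdio.h>

#define NR_CPUS   4
#define NR_ITEMS  2                         /* stand-ins for NR_ISOLATED_* etc. */

static long zone_stat[NR_ITEMS];             /* analogue of zone->vm_stat       */
static long vm_stat_diff[NR_CPUS][NR_ITEMS]; /* analogue of pset->vm_stat_diff  */

/* Analogue of drain_zonestat(): fold one CPU's pending deltas into the totals. */
static void drain_zonestat(int cpu)
{
        for (int i = 0; i < NR_ITEMS; i++) {
                long v = vm_stat_diff[cpu][i];
                vm_stat_diff[cpu][i] = 0;
                zone_stat[i] += v;
        }
}

int main(void)
{
        zone_stat[0] = 10;          /* already folded in earlier          */
        vm_stat_diff[0][0] = 3;     /* pending per-CPU deltas             */
        vm_stat_diff[1][0] = 2;

        /* Zeroing the per-CPU arrays without draining would leave
         * zone_stat[0] stuck at 10 forever, the "lost" stats described
         * above.  Draining first keeps the total exact. */
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                drain_zonestat(cpu);

        printf("item0 = %ld\n", zone_stat[0]);  /* prints 15 */
        return 0;
}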
Reviewed-by: Wen Congyang Signed-off-by: Minchan Kim Cc: Kamezawa Hiroyuki Cc: Yasuaki Ishimatsu Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 4 ++++ mm/page_alloc.c | 7 +++++++ mm/vmstat.c | 12 ++++++++++++ 3 files changed, 23 insertions(+) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index a5bb15018b5c..92a86b2cce33 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -198,6 +198,8 @@ extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); void refresh_zone_stat_thresholds(void); +void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); + int calculate_pressure_threshold(struct zone *zone); int calculate_normal_threshold(struct zone *zone); void set_pgdat_percpu_threshold(pg_data_t *pgdat, @@ -251,6 +253,8 @@ static inline void __dec_zone_page_state(struct page *page, static inline void refresh_cpu_vm_stats(int cpu) { } static inline void refresh_zone_stat_thresholds(void) { } +static inline void drain_zonestat(struct zone *zone, + struct per_cpu_pageset *pset) { } #endif /* CONFIG_SMP */ static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9b8e6243a524..5485f0ef4ec3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5916,6 +5916,7 @@ static int __meminit __zone_pcp_update(void *data) local_irq_save(flags); if (pcp->count > 0) free_pcppages_bulk(zone, pcp->count, pcp); + drain_zonestat(zone, pset); setup_pageset(pset, batch); local_irq_restore(flags); } @@ -5932,10 +5933,16 @@ void __meminit zone_pcp_update(struct zone *zone) void zone_pcp_reset(struct zone *zone) { unsigned long flags; + int cpu; + struct per_cpu_pageset *pset; /* avoid races with drain_pages() */ local_irq_save(flags); if (zone->pageset != &boot_pageset) { + for_each_online_cpu(cpu) { + pset = per_cpu_ptr(zone->pageset, cpu); + drain_zonestat(zone, pset); + } free_percpu(zone->pageset); zone->pageset = &boot_pageset; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 05e3a9913744..2f11309955cc 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu) atomic_long_add(global_diff[i], &vm_stat[i]); } +void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) +{ + int i; + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (pset->vm_stat_diff[i]) { + int v = pset->vm_stat_diff[i]; + pset->vm_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_stat[i]); + atomic_long_add(v, &vm_stat[i]); + } +} #endif #ifdef CONFIG_NUMA From 8befedfe678ff84d5bc356be4f3cb1fd84959d02 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:42 -0700 Subject: [PATCH 138/160] mm: remove unevictable_pgs_mlockfreed Simply remove UNEVICTABLE_MLOCKFREED and unevictable_pgs_mlockfreed line from /proc/vmstat: Johannes and Mel point out that it was very unlikely to have been used by any tool, and of course we can restore it easily enough if that turns out to be wrong. 
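For illustration, a monitoring tool that parses /proc/vmstat defensively looks counters up by name rather than by line position, so a removed line simply reads as absent. A minimal (hypothetical) userspace sketch:

#include <stdio.h>
#include <string.h>

/* Return the value of the named /proc/vmstat counter, or -1 if absent. */
static long vmstat_value(const char *key)
{
        char name[64];
        long val;
        FILE *f = fopen("/proc/vmstat", "r");

        if (!f)
                return -1;
        while (fscanf(f, "%63s %ld", name, &val) == 2) {
                if (strcmp(name, key) == 0) {
                        fclose(f);
                        return val;
                }
        }
        fclose(f);
        return -1;      /* key absent, e.g. after a counter is removed */
}

int main(void)
{
        printf("pgpgin = %ld\n", vmstat_value("pgpgin"));
        printf("unevictable_pgs_mlockfreed = %ld\n",
               vmstat_value("unevictable_pgs_mlockfreed")); /* -1 on new kernels */
        return 0;
}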
Signed-off-by: Hugh Dickins Cc: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Cc: Michel Lespinasse Cc: Ying Han Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 1 - mm/vmstat.c | 1 - 2 files changed, 2 deletions(-) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index ede4ddd46695..3d3114594370 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -52,7 +52,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, UNEVICTABLE_PGMUNLOCKED, UNEVICTABLE_PGCLEARED, /* on COW, page truncate */ UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */ - UNEVICTABLE_MLOCKFREED, /* no longer useful: always zero */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE THP_FAULT_ALLOC, THP_FAULT_FALLBACK, diff --git a/mm/vmstat.c b/mm/vmstat.c index 2f11309955cc..c7370579111b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -794,7 +794,6 @@ const char * const vmstat_text[] = { "unevictable_pgs_munlocked", "unevictable_pgs_cleared", "unevictable_pgs_stranded", - "unevictable_pgs_mlockfreed", /* no longer useful: always zero */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE "thp_fault_alloc", From c462f179e4d273f0da0e7cf302e29b0edf43844e Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Mon, 8 Oct 2012 16:33:43 -0700 Subject: [PATCH 139/160] mm/memory.c: fix typo in comment Signed-off-by: Robert P. J. Day Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 01ec048ece8b..45bb6d296b6f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2148,7 +2148,7 @@ out: * @addr: target user address of this page * @pfn: source kernel pfn * - * Similar to vm_inert_page, this allows drivers to insert individual pages + * Similar to vm_insert_page, this allows drivers to insert individual pages * they've allocated into a user vma. Same comments apply. * * This function should only be called from a vm_ops->fault handler, and From cd8ed2a45a401cb692d769e92de7d73aa42fabce Mon Sep 17 00:00:00 2001 From: Yan Hong Date: Mon, 8 Oct 2012 16:33:45 -0700 Subject: [PATCH 140/160] fs/fs-writeback.c: remove unneccesary parameter of __writeback_single_inode() The parameter 'wb' is never used in this function. Signed-off-by: Yan Hong Acked-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8e1d7b9e4a33..401b6c6248ae 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -439,8 +439,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * setting I_SYNC flag and calling inode_sync_complete() to clear it. */ static int -__writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, - struct writeback_control *wbc) +__writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; long nr_to_write = wbc->nr_to_write; @@ -527,7 +526,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, inode->i_state |= I_SYNC; spin_unlock(&inode->i_lock); - ret = __writeback_single_inode(inode, wb, wbc); + ret = __writeback_single_inode(inode, wbc); spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); @@ -670,7 +669,7 @@ static long writeback_sb_inodes(struct super_block *sb, * We use I_SYNC to pin the inode in memory. 
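The cleanup is purely mechanical; a miniature userspace analogue of the same pattern (hypothetical names, not the fs-writeback code) shows the shape: an argument that was threaded through but never read is dropped, and every caller is updated to match.

#include <stdio.h>

struct writeback_ctl { int nr_to_write; };

/* Before the cleanup this also took a pointer the body never touched;
 * dropping it keeps the signature honest about real dependencies. */
static int writeback_single(const char *name, struct writeback_ctl *wbc)
{
        printf("writing %s, budget %d\n", name, wbc->nr_to_write);
        return 0;
}

int main(void)
{
        struct writeback_ctl wbc = { .nr_to_write = 1024 };
        return writeback_single("inode-42", &wbc);   /* caller updated too */
}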
While it is set * evict_inode() will wait so the inode cannot be freed. */ - __writeback_single_inode(inode, wb, &wbc); + __writeback_single_inode(inode, &wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; From 7a71932d5676b7410ab64d149bad8bde6b0d8632 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 8 Oct 2012 16:33:47 -0700 Subject: [PATCH 141/160] kpageflags: fix wrong KPF_THP on non-huge compound pages KPF_THP can be set on non-huge compound pages (like slab pages or pages allocated by drivers with __GFP_COMP) because PageTransCompound only checks PG_head and PG_tail. Obviously this is a bug and breaks user space applications which look for thp via /proc/kpageflags. This patch rules out setting KPF_THP wrongly by additionally checking PageLRU on the head pages. Signed-off-by: Naoya Horiguchi Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Reviewed-by: Fengguang Wu Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/proc/page.c b/fs/proc/page.c index 7fcd0d60a968..b8730d9ebaee 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -115,7 +115,13 @@ u64 stable_page_flags(struct page *page) u |= 1 << KPF_COMPOUND_TAIL; if (PageHuge(page)) u |= 1 << KPF_HUGE; - else if (PageTransCompound(page)) + /* + * PageTransCompound can be true for non-huge compound pages (slab + * pages or pages allocated by drivers with __GFP_COMP) because it + * just checks PG_head/PG_tail, so we need to check PageLRU to make + * sure a given page is a thp, not a non-huge compound page. + */ + else if (PageTransCompound(page) && PageLRU(compound_trans_head(page))) u |= 1 << KPF_THP; /* From e46a28790e594c0876d1a84270926abf75460f61 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:33:48 -0700 Subject: [PATCH 142/160] CMA: migrate mlocked pages Presently CMA cannot migrate mlocked pages so it ends up failing to allocate contiguous memory space. This patch makes mlocked pages be migrated out. Of course, it can affect realtime processes but in CMA usecase, contiguous memory allocation failing is far worse than access latency to an mlocked page being variable while CMA is running. If someone wants to make the system realtime, he shouldn't enable CMA because stalls can still happen at random times. [akpm@linux-foundation.org: tweak comment text, per Mel] Signed-off-by: Minchan Kim Acked-by: Mel Gorman Cc: Michal Nazarewicz Cc: Bartlomiej Zolnierkiewicz Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ mm/compaction.c | 8 ++++++-- mm/internal.h | 2 +- mm/page_alloc.c | 2 +- mm/vmscan.c | 4 ++-- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a5578871d033..50aaca81f63d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -218,6 +218,8 @@ struct lruvec { #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) +/* Isolate unevictable pages */ +#define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8) /* LRU Isolation modes. */ typedef unsigned __bitwise__ isolate_mode_t; diff --git a/mm/compaction.c b/mm/compaction.c index d8187f9cabbf..2c4ce17651d8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -461,6 +461,7 @@ static bool too_many_isolated(struct zone *zone) * @cc: Compaction control structure. 
* @low_pfn: The first PFN of the range. * @end_pfn: The one-past-the-last PFN of the range. + * @unevictable: true if it allows to isolate unevictable pages * * Isolate all pages that can be migrated from the range specified by * [low_pfn, end_pfn). Returns zero if there is a fatal signal @@ -476,7 +477,7 @@ static bool too_many_isolated(struct zone *zone) */ unsigned long isolate_migratepages_range(struct zone *zone, struct compact_control *cc, - unsigned long low_pfn, unsigned long end_pfn) + unsigned long low_pfn, unsigned long end_pfn, bool unevictable) { unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; @@ -602,6 +603,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (!cc->sync) mode |= ISOLATE_ASYNC_MIGRATE; + if (unevictable) + mode |= ISOLATE_UNEVICTABLE; + lruvec = mem_cgroup_page_lruvec(page, zone); /* Try isolate the page */ @@ -807,7 +811,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } /* Perform the isolation */ - low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); + low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); if (!low_pfn || cc->contended) return ISOLATE_ABORT; diff --git a/mm/internal.h b/mm/internal.h index 4dc93e2fe69e..f5f295fe11e1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -138,7 +138,7 @@ isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); unsigned long isolate_migratepages_range(struct zone *zone, struct compact_control *cc, - unsigned long low_pfn, unsigned long end_pfn); + unsigned long low_pfn, unsigned long end_pfn, bool unevictable); #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5485f0ef4ec3..fd86c47de86f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5690,7 +5690,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, if (list_empty(&cc->migratepages)) { cc->nr_migratepages = 0; pfn = isolate_migratepages_range(cc->zone, cc, - pfn, end); + pfn, end, true); if (!pfn) { ret = -EINTR; break; diff --git a/mm/vmscan.c b/mm/vmscan.c index 8b627309dd44..2624edcfb420 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1009,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (!PageLRU(page)) return ret; - /* Do not give back unevictable pages for compaction */ - if (PageUnevictable(page)) + /* Compaction should not handle unevictable pages but CMA can do so */ + if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) return ret; ret = -EBUSY; From beb51eaa88238daba698ad837222ad277d440b6d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:33:51 -0700 Subject: [PATCH 143/160] cma: decrease cc.nr_migratepages after reclaiming pagelist reclaim_clean_pages_from_list() reclaims clean pages before migration so cc.nr_migratepages should be updated. Currently, there is no problem but it can be wrong if we try to use the value in future. 
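For illustration, a toy userspace model of the bookkeeping rule (hypothetical names; the real counter lives in struct compact_control): whatever the reclaim step removes from the list must also be subtracted from the counter that tracks the list's length, or the two drift apart.

#include <stdio.h>

struct compact_ctl {
        long nr_migratepages;       /* pages believed to be on the migrate list */
};

/* Stand-in for reclaim_clean_pages_from_list(): returns how many it freed. */
static long reclaim_clean_pages(long *list_len)
{
        long reclaimed = *list_len / 2;   /* pretend half the pages were clean */
        *list_len -= reclaimed;
        return reclaimed;
}

int main(void)
{
        long list_len = 32;
        struct compact_ctl cc = { .nr_migratepages = list_len };

        long nr_reclaimed = reclaim_clean_pages(&list_len);
        cc.nr_migratepages -= nr_reclaimed;     /* the added line, in spirit */

        printf("list=%ld counter=%ld\n", list_len, cc.nr_migratepages);
        return 0;       /* both print 16: the counter matches reality again */
}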
Signed-off-by: Minchan Kim Acked-by: Mel Gorman Cc: Michal Nazarewicz Cc: Bartlomiej Zolnierkiewicz Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd86c47de86f..bb90971182bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5674,7 +5674,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. */ - + unsigned long nr_reclaimed; unsigned long pfn = start; unsigned int tries = 0; int ret = 0; @@ -5701,7 +5701,9 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, break; } - reclaim_clean_pages_from_list(cc->zone, &cc->migratepages); + nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, + &cc->migratepages); + cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, From 3e648ebe076390018c317881d7d926f24d7bac6b Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 8 Oct 2012 16:33:52 -0700 Subject: [PATCH 144/160] make GFP_NOTRACK definition unconditional There was a general sentiment in a recent discussion (See https://lkml.org/lkml/2012/9/18/258) that the __GFP flags should be defined unconditionally. Currently, the only offender is GFP_NOTRACK, which is conditional to KMEMCHECK. Signed-off-by: Glauber Costa Acked-by: Christoph Lameter Cc: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f9bc873ce7d6..02c1c9710be0 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -30,11 +30,7 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u -#ifdef CONFIG_KMEMCHECK #define ___GFP_NOTRACK 0x200000u -#else -#define ___GFP_NOTRACK 0 -#endif #define ___GFP_OTHER_NODE 0x800000u #define ___GFP_WRITE 0x1000000u From c22331166b49b94b51424bb7fa1f83e09ad82734 Mon Sep 17 00:00:00 2001 From: Raghavendra D Prabhu Date: Mon, 8 Oct 2012 16:33:55 -0700 Subject: [PATCH 145/160] mm: avoid section mismatch warning for memblock_type_name Following section mismatch warning is thrown during build; WARNING: vmlinux.o(.text+0x32408f): Section mismatch in reference from the function memblock_type_name() to the variable .meminit.data:memblock The function memblock_type_name() references the variable __meminitdata memblock. This is often because memblock_type_name lacks a __meminitdata annotation or the annotation of memblock is wrong. This is because memblock_type_name makes reference to memblock variable with attribute __meminitdata. Hence, the warning (even if the function is inline). 
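A condensed, kernel-context sketch of what modpost objects to (simplified and hypothetical, not the actual memblock code): a plain .text function referencing data that lives in a discardable init section.

/* Kernel-context sketch only.  __meminitdata puts the variable into
 * .meminit.data, which may be discarded after boot on some configs. */
static struct memblock_type example_type __meminitdata;

/* Unannotated: .text can outlive .meminit.data, so modpost warns. */
static const char *type_name_bad(struct memblock_type *t)
{
        return t == &example_type ? "memory" : "reserved";
}

/* Annotated, as the patch does with __init_memblock: the reference now
 * stays between sections with the same lifetime and the warning goes away. */
static __init_memblock const char *type_name_good(struct memblock_type *t)
{
        return t == &example_type ? "memory" : "reserved";
}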
[akpm@linux-foundation.org: remove inline] Signed-off-by: Raghavendra D Prabhu Cc: Tejun Heo Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memblock.c b/mm/memblock.c index d809f17ef75b..931eef145af5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; /* inline so we don't get a warning when pr_debug is compiled out */ -static inline const char *memblock_type_name(struct memblock_type *type) +static __init_memblock const char * +memblock_type_name(struct memblock_type *type) { if (type == &memblock.memory) return "memory"; From a16cee10c7ab994546ed98d9abfd4de74050124a Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Mon, 8 Oct 2012 16:33:58 -0700 Subject: [PATCH 146/160] memory-hotplug: preparation to notify memory block's state at memory hot remove remove_memory() is called in two cases: 1. echo offline >/sys/devices/system/memory/memoryXX/state 2. hot remove a memory device In the 1st case, the memory block's state is changed and the notification that the memory block's state changed is sent to userland after calling remove_memory(). So the user can notice that the memory block has changed. But in the 2nd case, the memory block's state is not changed and the notification is also not sent to userspace even when calling remove_memory(). So the user cannot notice that the memory block has changed. For adding the notification at memory hot remove, the patch just prepares as follows: the 1st case uses offline_pages() for offlining memory. The 2nd case uses remove_memory() for offlining memory, changing the memory block's state and notifying userspace. The patch does not implement the notification in remove_memory(). 
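In outline, the resulting split looks like this (a sketch of the mm/memory_hotplug.c hunks that follow; the 120 * HZ timeout is the value the code already uses):

/* Case 1: the sysfs "echo offline > .../state" path ends up here, in pfns. */
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
        return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}

/* Case 2: physical hot remove still offlines the range, but now has a
 * dedicated entry point where state updates and notifications can be added. */
int remove_memory(u64 start, u64 size)
{
        unsigned long start_pfn = PFN_DOWN(start);
        unsigned long end_pfn   = start_pfn + PFN_DOWN(size);

        return __offline_pages(start_pfn, end_pfn, 120 * HZ);
}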
Signed-off-by: Wen Congyang Signed-off-by: Yasuaki Ishimatsu Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Christoph Lameter Cc: Minchan Kim Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 9 +++------ include/linux/memory_hotplug.h | 1 + mm/memory_hotplug.c | 13 +++++++++++-- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 7dda4f790f00..44e7de6ce694 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -248,26 +248,23 @@ static bool pages_correctly_reserved(unsigned long start_pfn, static int memory_block_action(unsigned long phys_index, unsigned long action) { - unsigned long start_pfn, start_paddr; + unsigned long start_pfn; unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; struct page *first_page; int ret; first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT); + start_pfn = page_to_pfn(first_page); switch (action) { case MEM_ONLINE: - start_pfn = page_to_pfn(first_page); - if (!pages_correctly_reserved(start_pfn, nr_pages)) return -EBUSY; ret = online_pages(start_pfn, nr_pages); break; case MEM_OFFLINE: - start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; - ret = remove_memory(start_paddr, - nr_pages << PAGE_SHIFT); + ret = offline_pages(start_pfn, nr_pages); break; default: WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 910550f3b70e..e64fe80eba96 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -233,6 +233,7 @@ static inline int is_mem_section_removable(unsigned long pfn, extern int mem_online_node(int nid); extern int add_memory(int nid, u64 start, u64 size); extern int arch_add_memory(int nid, u64 start, u64 size); +extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern int remove_memory(u64 start, u64 size); extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, int nr_pages); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ce690a911f1b..dfc0a6134c7c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -874,7 +874,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } -static int __ref offline_pages(unsigned long start_pfn, +static int __ref __offline_pages(unsigned long start_pfn, unsigned long end_pfn, unsigned long timeout) { unsigned long pfn, nr_pages, expire; @@ -1007,15 +1007,24 @@ out: return ret; } +int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +{ + return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); +} + int remove_memory(u64 start, u64 size) { unsigned long start_pfn, end_pfn; start_pfn = PFN_DOWN(start); end_pfn = start_pfn + PFN_DOWN(size); - return offline_pages(start_pfn, end_pfn, 120 * HZ); + return __offline_pages(start_pfn, end_pfn, 120 * HZ); } #else +int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +{ + return -EINVAL; +} int remove_memory(u64 start, u64 size) { return -EINVAL; From e90bdb7f52f94204c78fb40b0804645defdebd71 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Mon, 8 Oct 2012 16:34:01 -0700 Subject: [PATCH 147/160] memory-hotplug: update memory block's state and notify userspace remove_memory() will be called when hot removing a memory device. But even if offlining memory, we cannot notice it. So the patch updates the memory block's state and sends notification to userspace. 
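The per-block helper this relies on is small; in outline (a sketch of the drivers/base/memory.c hunk further below):

/* Offline one memory block; if it is already offline, do nothing.  The
 * state_mutex serializes against concurrent writes to the sysfs "state" file. */
int offline_memory_block(struct memory_block *mem)
{
        int ret = 0;

        mutex_lock(&mem->state_mutex);
        if (mem->state != MEM_OFFLINE)
                ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
        mutex_unlock(&mem->state_mutex);

        return ret;
}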
Additionally, the memory device may contain more than one memory block. If the memory block has been offlined, __offline_pages() will fail. So we should try to offline one memory block at a time. Thus remove_memory() also checks each memory block's state. So there is no need to check the memory block's state before calling remove_memory(). Signed-off-by: Wen Congyang Signed-off-by: Yasuaki Ishimatsu Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Christoph Lameter Cc: Minchan Kim Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 31 +++++++++++++++++++++++++++---- include/linux/memory_hotplug.h | 2 ++ mm/memory_hotplug.c | 33 ++++++++++++++++++++++++++++++++- 3 files changed, 61 insertions(+), 5 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 44e7de6ce694..86c88216a503 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -275,13 +275,11 @@ memory_block_action(unsigned long phys_index, unsigned long action) return ret; } -static int memory_block_change_state(struct memory_block *mem, +static int __memory_block_change_state(struct memory_block *mem, unsigned long to_state, unsigned long from_state_req) { int ret = 0; - mutex_lock(&mem->state_mutex); - if (mem->state != from_state_req) { ret = -EINVAL; goto out; @@ -309,10 +307,20 @@ static int memory_block_change_state(struct memory_block *mem, break; } out: - mutex_unlock(&mem->state_mutex); return ret; } +static int memory_block_change_state(struct memory_block *mem, + unsigned long to_state, unsigned long from_state_req) +{ + int ret; + + mutex_lock(&mem->state_mutex); + ret = __memory_block_change_state(mem, to_state, from_state_req); + mutex_unlock(&mem->state_mutex); + + return ret; +} static ssize_t store_mem_state(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -652,6 +660,21 @@ int unregister_memory_section(struct mem_section *section) return remove_memory_block(0, section, 0); } +/* + * offline one memory block. If the memory block has been offlined, do nothing. + */ +int offline_memory_block(struct memory_block *mem) +{ + int ret = 0; + + mutex_lock(&mem->state_mutex); + if (mem->state != MEM_OFFLINE) + ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); + mutex_unlock(&mem->state_mutex); + + return ret; +} + /* * Initialize the sysfs support for memory devices... 
*/ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e64fe80eba96..95573ec4ee6c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -10,6 +10,7 @@ struct page; struct zone; struct pglist_data; struct mem_section; +struct memory_block; #ifdef CONFIG_MEMORY_HOTPLUG @@ -234,6 +235,7 @@ extern int mem_online_node(int nid); extern int add_memory(int nid, u64 start, u64 size); extern int arch_add_memory(int nid, u64 start, u64 size); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); +extern int offline_memory_block(struct memory_block *mem); extern int remove_memory(u64 start, u64 size); extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, int nr_pages); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index dfc0a6134c7c..7d0797475a47 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1014,11 +1014,42 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) int remove_memory(u64 start, u64 size) { + struct memory_block *mem = NULL; + struct mem_section *section; unsigned long start_pfn, end_pfn; + unsigned long pfn, section_nr; + int ret; start_pfn = PFN_DOWN(start); end_pfn = start_pfn + PFN_DOWN(size); - return __offline_pages(start_pfn, end_pfn, 120 * HZ); + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + section_nr = pfn_to_section_nr(pfn); + if (!present_section_nr(section_nr)) + continue; + + section = __nr_to_section(section_nr); + /* same memblock? */ + if (mem) + if ((section_nr >= mem->start_section_nr) && + (section_nr <= mem->end_section_nr)) + continue; + + mem = find_memory_block_hinted(section, mem); + if (!mem) + continue; + + ret = offline_memory_block(mem); + if (ret) { + kobject_put(&mem->dev.kobj); + return ret; + } + } + + if (mem) + kobject_put(&mem->dev.kobj); + + return 0; } #else int offline_pages(unsigned long start_pfn, unsigned long nr_pages) From b676b293fb48672904ee1b9828cb50b4eed01717 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 8 Oct 2012 16:34:03 -0700 Subject: [PATCH 148/160] mm, thp: fix mapped pages avoiding unevictable list on mlock When a transparent hugepage is mapped and it is included in an mlock() range, follow_page() incorrectly avoids setting the page's mlock bit and moving it to the unevictable lru. This is evident if you try to mlock(), munlock(), and then mlock() a range again. Currently: #define MAP_SIZE (4 << 30) /* 4GB */ void *ptr = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); mlock(ptr, MAP_SIZE); $ grep -E "Unevictable|Inactive\(anon" /proc/meminfo Inactive(anon): 6304 kB Unevictable: 4213924 kB munlock(ptr, MAP_SIZE); Inactive(anon): 4186252 kB Unevictable: 19652 kB mlock(ptr, MAP_SIZE); Inactive(anon): 4198556 kB Unevictable: 21684 kB Notice that less than 2MB was added to the unevictable list; this is because these pages in the range are not transparent hugepages since the 4GB range was allocated with mmap() and has no specific alignment. If posix_memalign() were used instead, unevictable would not have grown at all on the second mlock(). The fix is to call mlock_vma_page() so that the mlock bit is set and the page is added to the unevictable list. 
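For reference, a self-contained build of the test fragments quoted above might look as follows (hypothetical harness: the size constant must be unsigned long to avoid integer overflow, MAP_ANONYMOUS takes fd -1, and locking 4GB needs a generous RLIMIT_MEMLOCK, so run it as root):

#define _DEFAULT_SOURCE         /* for MAP_ANONYMOUS */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

#define MAP_SIZE (4UL << 30)    /* 4GB */

static void meminfo(const char *tag)
{
        printf("--- %s ---\n", tag);
        fflush(stdout);
        /* Same fields inspected in the fragments above. */
        if (system("grep -E 'Unevictable|Inactive\\(anon' /proc/meminfo") == -1)
                perror("system");
}

int main(void)
{
        void *ptr = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (ptr == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        mlock(ptr, MAP_SIZE);   meminfo("after first mlock");
        munlock(ptr, MAP_SIZE); meminfo("after munlock");
        mlock(ptr, MAP_SIZE);   meminfo("after second mlock");

        munmap(ptr, MAP_SIZE);
        return 0;
}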
With this patch: mlock(ptr, MAP_SIZE); Inactive(anon): 4056 kB Unevictable: 4213940 kB munlock(ptr, MAP_SIZE); Inactive(anon): 4198268 kB Unevictable: 19636 kB mlock(ptr, MAP_SIZE); Inactive(anon): 4008 kB Unevictable: 4213940 kB Signed-off-by: David Rientjes Acked-by: Hugh Dickins Reviewed-by: Andrea Arcangeli Cc: Naoya Horiguchi Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 +- mm/huge_memory.c | 11 ++++++++++- mm/memory.c | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 6ab47af5a849..b31cb7da0346 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -11,7 +11,7 @@ extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd); -extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, +extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 08a943b9cf95..3a8d6b7d95db 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -971,11 +971,12 @@ out_unlock: return ret; } -struct page *follow_trans_huge_pmd(struct mm_struct *mm, +struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags) { + struct mm_struct *mm = vma->vm_mm; struct page *page = NULL; assert_spin_locked(&mm->page_table_lock); @@ -998,6 +999,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); } + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + if (page->mapping && trylock_page(page)) { + lru_add_drain(); + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } + } page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON(!PageCompound(page)); if (flags & FOLL_GET) diff --git a/mm/memory.c b/mm/memory.c index 45bb6d296b6f..fb135ba4aba9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1528,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, spin_unlock(&mm->page_table_lock); wait_split_huge_page(vma->anon_vma, pmd); } else { - page = follow_trans_huge_pmd(mm, address, + page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(&mm->page_table_lock); goto out; From 8449d21fb49e9824e2736c5febd6b5d287cd2ba1 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 8 Oct 2012 16:34:06 -0700 Subject: [PATCH 149/160] mm, thp: fix mlock statistics NR_MLOCK is only accounted in single page units: there's no logic to handle transparent hugepages. This patch checks the appropriate number of pages to adjust the statistics by so that the correct amount of memory is reflected. 
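The arithmetic: with 4KB base pages, a 2MB transparent hugepage spans 512 base pages, so hpage_nr_pages() returns 512 for a THP and 1 for a normal page, and each mlocked THP must move NR_MLOCK by 512, not 1. A kernel-context sketch of the resulting helper (mirroring the mm/mlock.c hunk below):

void mlock_vma_page(struct page *page)
{
        BUG_ON(!PageLocked(page));

        if (!TestSetPageMlocked(page)) {
                /* Account the whole huge page, not a single base page. */
                mod_zone_page_state(page_zone(page), NR_MLOCK,
                                    hpage_nr_pages(page)); /* 512 for a 2MB THP */
                count_vm_event(UNEVICTABLE_PGMLOCKED);
                if (!isolate_lru_page(page))
                        putback_lru_page(page);
        }
}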
Currently: $ grep Mlocked /proc/meminfo Mlocked: 19636 kB #define MAP_SIZE (4 << 30) /* 4GB */ void *ptr = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); mlock(ptr, MAP_SIZE); $ grep Mlocked /proc/meminfo Mlocked: 29844 kB munlock(ptr, MAP_SIZE); $ grep Mlocked /proc/meminfo Mlocked: 19636 kB And with this patch: $ grep Mlock /proc/meminfo Mlocked: 19636 kB mlock(ptr, MAP_SIZE); $ grep Mlock /proc/meminfo Mlocked: 4213664 kB munlock(ptr, MAP_SIZE); $ grep Mlock /proc/meminfo Mlocked: 19636 kB Signed-off-by: David Rientjes Reported-by: Hugh Dickens Acked-by: Hugh Dickins Reviewed-by: Andrea Arcangeli Cc: Naoya Horiguchi Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Reviewed-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 ++- mm/mlock.c | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index f5f295fe11e1..a4fa284f6bc2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -180,7 +180,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, return 0; if (!TestSetPageMlocked(page)) { - inc_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGMLOCKED); } return 1; diff --git a/mm/mlock.c b/mm/mlock.c index de7321592897..f0b9ce572fc7 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -56,7 +56,8 @@ void clear_page_mlock(struct page *page) if (!TestClearPageMlocked(page)) return; - dec_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + -hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGCLEARED); if (!isolate_lru_page(page)) { putback_lru_page(page); @@ -78,7 +79,8 @@ void mlock_vma_page(struct page *page) BUG_ON(!PageLocked(page)); if (!TestSetPageMlocked(page)) { - inc_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGMLOCKED); if (!isolate_lru_page(page)) putback_lru_page(page); @@ -105,7 +107,8 @@ void munlock_vma_page(struct page *page) BUG_ON(!PageLocked(page)); if (TestClearPageMlocked(page)) { - dec_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + -hpage_nr_pages(page)); if (!isolate_lru_page(page)) { int ret = SWAP_AGAIN; From 45ec16908e84e00d847e9063231fb3a75e6366bc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 8 Oct 2012 16:34:09 -0700 Subject: [PATCH 150/160] mm: use %pK for /proc/vmallocinfo In the paranoid case of sysctl kernel.kptr_restrict=2, mask the kernel virtual addresses in /proc/vmallocinfo too. 
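For illustration, a small (hypothetical) userspace check: with kernel.kptr_restrict set to 2, the %pK-formatted addresses in /proc/vmallocinfo read back as zeroes even for privileged readers (the file itself is typically root-only).

#include <stdio.h>

static void show_first_line(const char *path)
{
        char line[256];
        FILE *f = fopen(path, "r");

        if (f && fgets(line, sizeof(line), f))
                printf("%s: %s", path, line);
        if (f)
                fclose(f);
}

int main(void)
{
        show_first_line("/proc/sys/kernel/kptr_restrict");
        show_first_line("/proc/vmallocinfo");   /* zeroed addresses when 2 */
        return 0;
}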
Signed-off-by: Kees Cook Reported-by: Brad Spengler Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8de704679bfc..78e08300db21 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2571,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p) { struct vm_struct *v = p; - seq_printf(m, "0x%p-0x%p %7ld", + seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); if (v->caller) From 7795912c257bc068445f1db429c94d6b4b6ee604 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 8 Oct 2012 16:34:11 -0700 Subject: [PATCH 151/160] mm: document PageHuge somewhat Acked-by: David Rientjes Cc: Mel Gorman Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 993f7c1820a8..59a0059b39e2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) } } +/* + * PageHuge() only returns true for hugetlbfs pages, but not for normal or + * transparent huge pages. See the PageTransHuge() documentation for more + * details. + */ int PageHuge(struct page *page) { compound_page_dtor *dtor; From 587af308cc30ec6b94bde9aeb322e85fe4363e32 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 8 Oct 2012 16:34:12 -0700 Subject: [PATCH 152/160] mm: memcg: clean up mm_match_cgroup() signature It really should return a boolean for match/no match. And since it takes a memcg, not a cgroup, fix that parameter name as well. [akpm@linux-foundation.org: mm_match_cgroup() is not a macro] Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8d9489fdab2e..fd0e6d53836e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -84,14 +84,14 @@ extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont); static inline -int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) +bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; - int match; + struct mem_cgroup *task_memcg; + bool match; rcu_read_lock(); - memcg = mem_cgroup_from_task(rcu_dereference((mm)->owner)); - match = __mem_cgroup_same_or_subtree(cgroup, memcg); + task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + match = __mem_cgroup_same_or_subtree(memcg, task_memcg); rcu_read_unlock(); return match; } @@ -258,10 +258,10 @@ static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm return NULL; } -static inline int mm_match_cgroup(struct mm_struct *mm, +static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { - return 1; + return true; } static inline int task_in_mem_cgroup(struct task_struct *task, From d760afd4d2570653891f94e13b848e97150dc5a6 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Mon, 8 Oct 2012 16:34:14 -0700 Subject: [PATCH 153/160] memory-hotplug: suppress "Trying to free nonexistent resource " warning When our x86 box calls __remove_pages(), release_mem_region() shows many 
warnings. And the x86 box cannot unregister iomem_resource. "Trying to free nonexistent resource " release_mem_region() has been changed to be called in each PAGES_PER_SECTION by commit de7f0cba9678 ("memory hotplug: release memory regions in PAGES_PER_SECTION chunks"). This is because powerpc registers iomem_resource in each PAGES_PER_SECTION chunk. But when I hot add memory on an x86 box, iomem_resource is registered for each _CRS, not for each PAGES_PER_SECTION chunk. So the x86 box fails to unregister iomem_resource. The patch fixes the problem. Signed-off-by: Yasuaki Ishimatsu Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christoph Lameter Cc: Minchan Kim Cc: Andrew Morton Cc: KOSAKI Motohiro Cc: Wen Congyang Cc: Dave Hansen Cc: Nathan Fontenot Cc: Badari Pulavarty Cc: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/pseries/hotplug-memory.c | 13 +++++++++---- mm/memory_hotplug.c | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 11d8e0544ac0..dc0a035e63bb 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -77,7 +77,8 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_siz { unsigned long start, start_pfn; struct zone *zone; - int ret; + int i, ret; + int sections_to_remove; start_pfn = base >> PAGE_SHIFT; @@ -97,9 +98,13 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_siz * to sysfs "state" file and we can't remove sysfs entries * while writing to it. So we have to defer it to here. */ - ret = __remove_pages(zone, start_pfn, memblock_size >> PAGE_SHIFT); - if (ret) - return ret; + sections_to_remove = (memblock_size >> PAGE_SHIFT) / PAGES_PER_SECTION; + for (i = 0; i < sections_to_remove; i++) { + unsigned long pfn = start_pfn + i * PAGES_PER_SECTION; + ret = __remove_pages(zone, start_pfn, PAGES_PER_SECTION); + if (ret) + return ret; + } /* * Update memory regions for memory remove diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7d0797475a47..56b758ae57d2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -369,11 +369,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); + release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); + sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; - release_mem_region(pfn << PAGE_SHIFT, - PAGES_PER_SECTION << PAGE_SHIFT); ret = __remove_section(zone, __pfn_to_section(pfn)); if (ret) break; From 15b9350a177b9fb23b69e00f78f52890ee338880 Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 8 Oct 2012 16:34:19 -0700 Subject: [PATCH 154/160] sparc64: Only support 4MB huge pages and 8KB base pages. Narrowing the scope of the page size configurations will make the transparent hugepage changes much simpler. In the end what we really want to do is have the kernel support multiple huge page sizes and use whatever is appropriate as the context dictates. Signed-off-by: David S. 
Miller Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Gerald Schaefer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/Kconfig | 36 --------------------------- arch/sparc/include/asm/mmu_64.h | 16 +----------- arch/sparc/include/asm/page_64.h | 12 --------- arch/sparc/include/asm/pgtable_64.h | 38 ++++------------------------- arch/sparc/mm/init_64.c | 9 ++----- arch/sparc/mm/tsb.c | 17 ------------- 6 files changed, 8 insertions(+), 120 deletions(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index e184075877d7..91c780c973ba 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -230,25 +230,6 @@ config EARLYFB help Say Y here to enable a faster early framebuffer boot console. -choice - prompt "Kernel page size" if SPARC64 - default SPARC64_PAGE_SIZE_8KB - -config SPARC64_PAGE_SIZE_8KB - bool "8KB" - help - This lets you select the page size of the kernel. - - 8KB and 64KB work quite well, since SPARC ELF sections - provide for up to 64KB alignment. - - If you don't know what to do, choose 8KB. - -config SPARC64_PAGE_SIZE_64KB - bool "64KB" - -endchoice - config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on SPARC64 && PROC_FS @@ -320,23 +301,6 @@ config GENERIC_LOCKBREAK default y depends on SPARC64 && SMP && PREEMPT -choice - prompt "SPARC64 Huge TLB Page Size" - depends on SPARC64 && HUGETLB_PAGE - default HUGETLB_PAGE_SIZE_4MB - -config HUGETLB_PAGE_SIZE_4MB - bool "4MB" - -config HUGETLB_PAGE_SIZE_512K - bool "512K" - -config HUGETLB_PAGE_SIZE_64K - depends on !SPARC64_PAGE_SIZE_64KB - bool "64K" - -endchoice - config NUMA bool "NUMA support" depends on SPARC64 && SMP diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h index 9067dc500535..5fb97e19d81e 100644 --- a/arch/sparc/include/asm/mmu_64.h +++ b/arch/sparc/include/asm/mmu_64.h @@ -30,22 +30,8 @@ #define CTX_PGSZ_MASK ((CTX_PGSZ_BITS << CTX_PGSZ0_SHIFT) | \ (CTX_PGSZ_BITS << CTX_PGSZ1_SHIFT)) -#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB) #define CTX_PGSZ_BASE CTX_PGSZ_8KB -#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB) -#define CTX_PGSZ_BASE CTX_PGSZ_64KB -#else -#error No page size specified in kernel configuration -#endif - -#if defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) -#define CTX_PGSZ_HUGE CTX_PGSZ_4MB -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) -#define CTX_PGSZ_HUGE CTX_PGSZ_512KB -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K) -#define CTX_PGSZ_HUGE CTX_PGSZ_64KB -#endif - +#define CTX_PGSZ_HUGE CTX_PGSZ_4MB #define CTX_PGSZ_KERN CTX_PGSZ_4MB /* Thus, when running on UltraSPARC-III+ and later, we use the following diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h index f0d09b401036..b2df9b8b2d46 100644 --- a/arch/sparc/include/asm/page_64.h +++ b/arch/sparc/include/asm/page_64.h @@ -3,13 +3,7 @@ #include -#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB) #define PAGE_SHIFT 13 -#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB) -#define PAGE_SHIFT 16 -#else -#error No page size specified in kernel configuration -#endif #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) @@ -21,13 +15,7 @@ #define DCACHE_ALIASING_POSSIBLE #endif -#if defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) #define HPAGE_SHIFT 22 -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) -#define HPAGE_SHIFT 19 -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K) -#define HPAGE_SHIFT 16 -#endif #ifdef CONFIG_HUGETLB_PAGE #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h 
index 61210db139fb..387d484c62f9 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -160,26 +160,11 @@ #define _PAGE_SZ8K_4V _AC(0x0000000000000000,UL) /* 8K Page */ #define _PAGE_SZALL_4V _AC(0x0000000000000007,UL) /* All pgsz bits */ -#if PAGE_SHIFT == 13 #define _PAGE_SZBITS_4U _PAGE_SZ8K_4U #define _PAGE_SZBITS_4V _PAGE_SZ8K_4V -#elif PAGE_SHIFT == 16 -#define _PAGE_SZBITS_4U _PAGE_SZ64K_4U -#define _PAGE_SZBITS_4V _PAGE_SZ64K_4V -#else -#error Wrong PAGE_SHIFT specified -#endif -#if defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) #define _PAGE_SZHUGE_4U _PAGE_SZ4MB_4U #define _PAGE_SZHUGE_4V _PAGE_SZ4MB_4V -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) -#define _PAGE_SZHUGE_4U _PAGE_SZ512K_4U -#define _PAGE_SZHUGE_4V _PAGE_SZ512K_4V -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K) -#define _PAGE_SZHUGE_4U _PAGE_SZ64K_4U -#define _PAGE_SZHUGE_4V _PAGE_SZ64K_4V -#endif /* These are actually filled in at boot time by sun4{u,v}_pgprot_init() */ #define __P000 __pgprot(0) @@ -218,7 +203,6 @@ extern unsigned long _PAGE_CACHE; extern unsigned long pg_iobits; extern unsigned long _PAGE_ALL_SZ_BITS; -extern unsigned long _PAGE_SZBITS; extern struct page *mem_map_zero; #define ZERO_PAGE(vaddr) (mem_map_zero) @@ -231,22 +215,9 @@ extern struct page *mem_map_zero; static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) { unsigned long paddr = pfn << PAGE_SHIFT; - unsigned long sz_bits; - sz_bits = 0UL; - if (_PAGE_SZBITS_4U != 0UL || _PAGE_SZBITS_4V != 0UL) { - __asm__ __volatile__( - "\n661: sethi %%uhi(%1), %0\n" - " sllx %0, 32, %0\n" - " .section .sun4v_2insn_patch, \"ax\"\n" - " .word 661b\n" - " mov %2, %0\n" - " nop\n" - " .previous\n" - : "=r" (sz_bits) - : "i" (_PAGE_SZBITS_4U), "i" (_PAGE_SZBITS_4V)); - } - return __pte(paddr | sz_bits | pgprot_val(prot)); + BUILD_BUG_ON(_PAGE_SZBITS_4U != 0UL || _PAGE_SZBITS_4V != 0UL); + return __pte(paddr | pgprot_val(prot)); } #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) @@ -286,6 +257,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t prot) * Note: We encode this into 3 sun4v 2-insn patch sequences. 
*/ + BUILD_BUG_ON(_PAGE_SZBITS_4U != 0UL || _PAGE_SZBITS_4V != 0UL); __asm__ __volatile__( "\n661: sethi %%uhi(%2), %1\n" " sethi %%hi(%2), %0\n" @@ -307,10 +279,10 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t prot) : "=r" (mask), "=r" (tmp) : "i" (_PAGE_PADDR_4U | _PAGE_MODIFIED_4U | _PAGE_ACCESSED_4U | _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_E_4U | _PAGE_PRESENT_4U | - _PAGE_SZBITS_4U | _PAGE_SPECIAL), + _PAGE_SPECIAL), "i" (_PAGE_PADDR_4V | _PAGE_MODIFIED_4V | _PAGE_ACCESSED_4V | _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_E_4V | _PAGE_PRESENT_4V | - _PAGE_SZBITS_4V | _PAGE_SPECIAL)); + _PAGE_SPECIAL)); return __pte((pte_val(pte) & mask) | (pgprot_val(prot) & ~mask)); } diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 7a9b788c6ced..809eecf6c797 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -276,7 +276,6 @@ static inline void tsb_insert(struct tsb *ent, unsigned long tag, unsigned long } unsigned long _PAGE_ALL_SZ_BITS __read_mostly; -unsigned long _PAGE_SZBITS __read_mostly; static void flush_dcache(unsigned long pfn) { @@ -2275,8 +2274,7 @@ static void __init sun4u_pgprot_init(void) __ACCESS_BITS_4U | _PAGE_E_4U); #ifdef CONFIG_DEBUG_PAGEALLOC - kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZBITS_4U) ^ - 0xfffff80000000000UL; + kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL; #else kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^ 0xfffff80000000000UL; @@ -2287,7 +2285,6 @@ static void __init sun4u_pgprot_init(void) for (i = 1; i < 4; i++) kern_linear_pte_xor[i] = kern_linear_pte_xor[0]; - _PAGE_SZBITS = _PAGE_SZBITS_4U; _PAGE_ALL_SZ_BITS = (_PAGE_SZ4MB_4U | _PAGE_SZ512K_4U | _PAGE_SZ64K_4U | _PAGE_SZ8K_4U | _PAGE_SZ32MB_4U | _PAGE_SZ256MB_4U); @@ -2324,8 +2321,7 @@ static void __init sun4v_pgprot_init(void) _PAGE_CACHE = _PAGE_CACHE_4V; #ifdef CONFIG_DEBUG_PAGEALLOC - kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZBITS_4V) ^ - 0xfffff80000000000UL; + kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL; #else kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^ 0xfffff80000000000UL; @@ -2339,7 +2335,6 @@ static void __init sun4v_pgprot_init(void) pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4V | __DIRTY_BITS_4V | __ACCESS_BITS_4V | _PAGE_E_4V); - _PAGE_SZBITS = _PAGE_SZBITS_4V; _PAGE_ALL_SZ_BITS = (_PAGE_SZ16GB_4V | _PAGE_SZ2GB_4V | _PAGE_SZ256MB_4V | _PAGE_SZ32MB_4V | _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V | diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index c52add79b83d..70e50ea2a5b6 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -90,29 +90,12 @@ void flush_tsb_user(struct tlb_batch *tb) spin_unlock_irqrestore(&mm->context.lock, flags); } -#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB) #define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_8K #define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_8K -#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB) -#define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_64K -#define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_64K -#else -#error Broken base page size setting... -#endif #ifdef CONFIG_HUGETLB_PAGE -#if defined(CONFIG_HUGETLB_PAGE_SIZE_64K) -#define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_64K -#define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_64K -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) -#define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_512K -#define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_512K -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) #define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_4MB #define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_4MB -#else -#error Broken huge page size setting... 
-#endif #endif static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsigned long tsb_bytes) From 56a70b8c6acc73f5d9ec383d840909dd9e63c865 Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 8 Oct 2012 16:34:20 -0700 Subject: [PATCH 155/160] sparc64: Halve the size of PTE tables The reason we want to do this is to facilitate transparent huge page support. Right now PMD's cover 8MB of address space, and our huge page size is 4MB. The current transparent hugepage support is not able to handle HPAGE_SIZE != PMD_SIZE. So make PTE tables be sized to half of a page instead of a full page. We can still map properly the whole supported virtual address range which on sparc64 requires 44 bits. Add a compile time CPP test which ensures that this requirement is always met. There is a minor inefficiency added by this change. We only use half of the page for PTE tables. It's not trivial to use only half of the page yet still get all of the pgtable_page_{ctor,dtor}() stuff working properly. It is doable, and that will come in a subsequent change. Signed-off-by: David S. Miller Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Gerald Schaefer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 24 +++++++----------------- arch/sparc/include/asm/tsb.h | 4 ++-- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 387d484c62f9..a7b5091f3b13 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -45,40 +45,30 @@ #define vmemmap ((struct page *)VMEMMAP_BASE) -/* XXX All of this needs to be rethought so we can take advantage - * XXX cheetah's full 64-bit virtual address space, ie. no more hole - * XXX in the middle like on spitfire. -DaveM - */ -/* - * Given a virtual address, the lowest PAGE_SHIFT bits determine offset - * into the page; the next higher PAGE_SHIFT-3 bits determine the pte# - * in the proper pagetable (the -3 is from the 8 byte ptes, and each page - * table is a single page long). The next higher PMD_BITS determine pmd# - * in the proper pmdtable (where we must have PMD_BITS <= (PAGE_SHIFT-2) - * since the pmd entries are 4 bytes, and each pmd page is a single page - * long). Finally, the higher few bits determine pgde#. - */ - /* PMD_SHIFT determines the size of the area a second-level page * table can map */ -#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3)) +#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-4)) #define PMD_SIZE (_AC(1,UL) << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) #define PMD_BITS (PAGE_SHIFT - 2) /* PGDIR_SHIFT determines what a third-level page table entry can map */ -#define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3) + PMD_BITS) +#define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-4) + PMD_BITS) #define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define PGDIR_BITS (PAGE_SHIFT - 2) +#if (PGDIR_SHIFT + PGDIR_BITS) != 44 +#error Page table parameters do not cover virtual address space properly. +#endif + #ifndef __ASSEMBLY__ #include /* Entries per page directory level. 
*/ -#define PTRS_PER_PTE (1UL << (PAGE_SHIFT-3)) +#define PTRS_PER_PTE (1UL << (PAGE_SHIFT-4)) #define PTRS_PER_PMD (1UL << PMD_BITS) #define PTRS_PER_PGD (1UL << PGDIR_BITS) diff --git a/arch/sparc/include/asm/tsb.h b/arch/sparc/include/asm/tsb.h index 1a8afd1ad04f..6435924b4514 100644 --- a/arch/sparc/include/asm/tsb.h +++ b/arch/sparc/include/asm/tsb.h @@ -152,7 +152,7 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end; lduwa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \ brz,pn REG1, FAIL_LABEL; \ sllx VADDR, 64 - PMD_SHIFT, REG2; \ - srlx REG2, 64 - PAGE_SHIFT, REG2; \ + srlx REG2, 64 - (PAGE_SHIFT - 1), REG2; \ sllx REG1, 11, REG1; \ andn REG2, 0x7, REG2; \ add REG1, REG2, REG1; @@ -177,7 +177,7 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end; lduwa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \ brz,pn REG1, FAIL_LABEL; \ sllx VADDR, 64 - PMD_SHIFT, REG2; \ - srlx REG2, 64 - PAGE_SHIFT, REG2; \ + srlx REG2, 64 - (PAGE_SHIFT - 1), REG2; \ sllx REG1, 11, REG1; \ andn REG2, 0x7, REG2; \ add REG1, REG2, REG1; From c460bec78d9257a54bcc5f9d5fadf89f8c70dbd1 Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 8 Oct 2012 16:34:22 -0700 Subject: [PATCH 156/160] sparc64: Eliminate PTE table memory wastage. We've split up the PTE tables so that they take up half a page instead of a full page. This is in order to facilitate transparent huge page support, which works much better if our PMDs cover 4MB instead of 8MB. What we do is have a one-behind cache for PTE table allocations in the mm struct. This logic triggers only on allocations. For example, we don't try to keep track of free'd up page table blocks in the style that the s390 port does. There were only two slightly annoying aspects to this change: 1) Changing pgtable_t to be a "pte_t *". There's all of this special logic in the TLB free paths that needed adjustments, as did the PMD populate interfaces. 2) init_new_context() needs to zap the pointer, since the mm struct just gets copied from the parent on fork. Signed-off-by: David S. Miller Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Gerald Schaefer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/mmu_64.h | 1 + arch/sparc/include/asm/page_64.h | 2 +- arch/sparc/include/asm/pgalloc_64.h | 54 +++------------ arch/sparc/mm/init_64.c | 101 ++++++++++++++++++++++++++++ arch/sparc/mm/tsb.c | 9 +++ 5 files changed, 123 insertions(+), 44 deletions(-) diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h index 5fb97e19d81e..31977c8dd942 100644 --- a/arch/sparc/include/asm/mmu_64.h +++ b/arch/sparc/include/asm/mmu_64.h @@ -93,6 +93,7 @@ typedef struct { spinlock_t lock; unsigned long sparc64_ctx_val; unsigned long huge_pte_count; + struct page *pgtable_page; struct tsb_config tsb_block[MM_NUM_TSBS]; struct hv_tsb_descr tsb_descr[MM_NUM_TSBS]; } mm_context_t; diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h index b2df9b8b2d46..087a5c505c69 100644 --- a/arch/sparc/include/asm/page_64.h +++ b/arch/sparc/include/asm/page_64.h @@ -86,7 +86,7 @@ typedef unsigned long pgprot_t; #endif /* (STRICT_MM_TYPECHECKS) */ -typedef struct page *pgtable_t; +typedef pte_t *pgtable_t; #define TASK_UNMAPPED_BASE (test_thread_flag(TIF_32BIT) ? 
\ (_AC(0x0000000070000000,UL)) : \ diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h index 40b2d7a7023d..0ebca93ef0f5 100644 --- a/arch/sparc/include/asm/pgalloc_64.h +++ b/arch/sparc/include/asm/pgalloc_64.h @@ -38,51 +38,20 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) kmem_cache_free(pgtable_cache, pmd); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) -{ - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) -{ - struct page *page; - pte_t *pte; - - pte = pte_alloc_one_kernel(mm, address); - if (!pte) - return NULL; - page = virt_to_page(pte); - pgtable_page_ctor(page); - return page; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) -{ - pgtable_page_dtor(ptepage); - __free_page(ptepage); -} +extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long address); +extern pgtable_t pte_alloc_one(struct mm_struct *mm, + unsigned long address); +extern void pte_free_kernel(struct mm_struct *mm, pte_t *pte); +extern void pte_free(struct mm_struct *mm, pgtable_t ptepage); #define pmd_populate_kernel(MM, PMD, PTE) pmd_set(PMD, PTE) -#define pmd_populate(MM,PMD,PTE_PAGE) \ - pmd_populate_kernel(MM,PMD,page_address(PTE_PAGE)) -#define pmd_pgtable(pmd) pmd_page(pmd) +#define pmd_populate(MM, PMD, PTE) pmd_set(PMD, PTE) +#define pmd_pgtable(PMD) ((pte_t *)__pmd_page(PMD)) #define check_pgt_cache() do { } while (0) -static inline void pgtable_free(void *table, bool is_page) -{ - if (is_page) - free_page((unsigned long)table); - else - kmem_cache_free(pgtable_cache, table); -} +extern void pgtable_free(void *table, bool is_page); #ifdef CONFIG_SMP @@ -113,11 +82,10 @@ static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, bool is } #endif /* !CONFIG_SMP */ -static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage, +static inline void __pte_free_tlb(struct mmu_gather *tlb, pte_t *pte, unsigned long address) { - pgtable_page_dtor(ptepage); - pgtable_free_tlb(tlb, page_address(ptepage), true); + pgtable_free_tlb(tlb, pte, true); } #define __pmd_free_tlb(tlb, pmd, addr) \ diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 809eecf6c797..12ef4ea60c88 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2467,3 +2467,104 @@ void __flush_tlb_all(void) __asm__ __volatile__("wrpr %0, 0, %%pstate" : : "r" (pstate)); } + +static pte_t *get_from_cache(struct mm_struct *mm) +{ + struct page *page; + pte_t *ret; + + spin_lock(&mm->page_table_lock); + page = mm->context.pgtable_page; + ret = NULL; + if (page) { + void *p = page_address(page); + + mm->context.pgtable_page = NULL; + + ret = (pte_t *) (p + (PAGE_SIZE / 2)); + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +static struct page *__alloc_for_cache(struct mm_struct *mm) +{ + struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | + __GFP_REPEAT | __GFP_ZERO); + + if (page) { + spin_lock(&mm->page_table_lock); + if (!mm->context.pgtable_page) { + atomic_set(&page->_count, 2); + mm->context.pgtable_page = page; + } + spin_unlock(&mm->page_table_lock); + } + return page; +} + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long address) +{ + struct page *page; + pte_t *pte; + + pte = 
get_from_cache(mm); + if (pte) + return pte; + + page = __alloc_for_cache(mm); + if (page) + pte = (pte_t *) page_address(page); + + return pte; +} + +pgtable_t pte_alloc_one(struct mm_struct *mm, + unsigned long address) +{ + struct page *page; + pte_t *pte; + + pte = get_from_cache(mm); + if (pte) + return pte; + + page = __alloc_for_cache(mm); + if (page) { + pgtable_page_ctor(page); + pte = (pte_t *) page_address(page); + } + + return pte; +} + +void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + struct page *page = virt_to_page(pte); + if (put_page_testzero(page)) + free_hot_cold_page(page, 0); +} + +static void __pte_free(pgtable_t pte) +{ + struct page *page = virt_to_page(pte); + if (put_page_testzero(page)) { + pgtable_page_dtor(page); + free_hot_cold_page(page, 0); + } +} + +void pte_free(struct mm_struct *mm, pgtable_t pte) +{ + __pte_free(pte); +} + +void pgtable_free(void *table, bool is_page) +{ + if (is_page) + __pte_free(table); + else + kmem_cache_free(pgtable_cache, table); +} diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index 70e50ea2a5b6..a35ee832baf3 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -445,6 +445,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) mm->context.huge_pte_count = 0; #endif + mm->context.pgtable_page = NULL; + /* copy_mm() copies over the parent's mm_struct before calling * us, so we need to zero out the TSB pointer or else tsb_grow() * will be confused and think there is an older TSB to free up. @@ -483,10 +485,17 @@ static void tsb_destroy_one(struct tsb_config *tp) void destroy_context(struct mm_struct *mm) { unsigned long flags, i; + struct page *page; for (i = 0; i < MM_NUM_TSBS; i++) tsb_destroy_one(&mm->context.tsb_block[i]); + page = mm->context.pgtable_page; + if (page && put_page_testzero(page)) { + pgtable_page_dtor(page); + free_hot_cold_page(page, 0); + } + spin_lock_irqsave(&ctx_alloc_lock, flags); if (CTX_VALID(mm->context)) { From dbc9fdf063dc4f12af71d7858bd216170129822e Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 8 Oct 2012 16:34:23 -0700 Subject: [PATCH 157/160] sparc64: Document PGD and PMD layout. We're going to be messing around with the PMD interpretation and layout for the sake of transparent huge pages, so we better clearly document what we're starting with. Signed-off-by: David S. Miller Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Gerald Schaefer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 16 ++++++++++++---- arch/sparc/include/asm/tsb.h | 8 ++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index a7b5091f3b13..af3cd7a9e9ac 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -63,6 +63,14 @@ #error Page table parameters do not cover virtual address space properly. #endif +/* PMDs point to PTE tables which are 4K aligned. */ +#define PMD_PADDR _AC(0xfffffffe,UL) +#define PMD_PADDR_SHIFT _AC(11,UL) + +/* PGDs point to PMD tables which are 8K aligned. 
*/ +#define PGD_PADDR _AC(0xfffffffc,UL) +#define PGD_PADDR_SHIFT _AC(11,UL) + #ifndef __ASSEMBLY__ #include @@ -581,14 +589,14 @@ static inline unsigned long pte_special(pte_t pte) } #define pmd_set(pmdp, ptep) \ - (pmd_val(*(pmdp)) = (__pa((unsigned long) (ptep)) >> 11UL)) + (pmd_val(*(pmdp)) = (__pa((unsigned long) (ptep)) >> PMD_PADDR_SHIFT)) #define pud_set(pudp, pmdp) \ - (pud_val(*(pudp)) = (__pa((unsigned long) (pmdp)) >> 11UL)) + (pud_val(*(pudp)) = (__pa((unsigned long) (pmdp)) >> PGD_PADDR_SHIFT)) #define __pmd_page(pmd) \ - ((unsigned long) __va((((unsigned long)pmd_val(pmd))<<11UL))) + ((unsigned long) __va((((unsigned long)pmd_val(pmd))< Date: Mon, 8 Oct 2012 16:34:25 -0700 Subject: [PATCH 158/160] mm: Add and use update_mmu_cache_pmd() in transparent huge page code. The transparent huge page code passes a PMD pointer in as the third argument of update_mmu_cache(), which expects a PTE pointer. This never got noticed because X86 implements update_mmu_cache() as a macro and thus we don't get any type checking, and X86 is the only architecture which supports transparent huge pages currently. Before other architectures can support transparent huge pages properly we need to add a new interface which will take a PMD pointer as the third argument rather than a PTE pointer. [akpm@linux-foundation.org: implement update_mm_cache_pmd() for s390] Signed-off-by: David S. Miller Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Gerald Schaefer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 1 + arch/x86/include/asm/pgtable_32.h | 1 + arch/x86/include/asm/pgtable_64.h | 1 + mm/huge_memory.c | 6 +++--- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index ed14fc2db6e0..979fe3dc0788 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -42,6 +42,7 @@ extern void fault_init(void); * tables contain all the necessary information. */ #define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_pmd(vma, address, ptep) do { } while (0) /* * ZERO_PAGE is a global shared page that is always zero; used diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 0c92113c4cb6..8faa215a503e 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -71,6 +71,7 @@ do { \ * tables contain all the necessary information. 
*/ #define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 8251be02301e..47356f9df82e 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -143,6 +143,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define pte_unmap(pte) ((void)(pte))/* NOP */ #define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3a8d6b7d95db..68a3c93036f6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -900,7 +900,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) - update_mmu_cache(vma, address, pmd); + update_mmu_cache_pmd(vma, address, pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } @@ -956,7 +956,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache(vma, address, pmd); + update_mmu_cache_pmd(vma, address, pmd); page_remove_rmap(page); put_page(page); ret |= VM_FAULT_WRITE; @@ -2041,7 +2041,7 @@ static void collapse_huge_page(struct mm_struct *mm, BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache(vma, address, pmd); + update_mmu_cache_pmd(vma, address, pmd); pgtable_trans_huge_deposit(mm, pgtable); spin_unlock(&mm->page_table_lock); From f5c8ad47284ca01dafc37da5a72bb9644174d387 Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 8 Oct 2012 16:34:26 -0700 Subject: [PATCH 159/160] mm: thp: Use more portable PMD clearing sequence in zap_huge_pmd(). Invalidation sequences are handled in various ways on various architectures. One way, which sparc64 uses, is to let the set_*_at() functions accumulate pending flushes into a per-cpu array. Then the flush_tlb_range() et al. calls process the pending TLB flushes. In this regime, the __tlb_remove_*tlb_entry() implementations are essentially NOPs. The canonical PTE zap in mm/memory.c is: ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); With a subsequent tlb_flush_mmu() if needed. Mirror this in the THP PMD zapping using: orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); page = pmd_page(orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); And we properly accommodate TLB flush mechanisms like the one described above. Signed-off-by: David S.
Miller Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Gerald Schaefer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 68a3c93036f6..a863af26c79c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1024,9 +1024,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (__pmd_trans_huge_lock(pmd, vma) == 1) { struct page *page; pgtable_t pgtable; + pmd_t orig_pmd; pgtable = pgtable_trans_huge_withdraw(tlb->mm); - page = pmd_page(*pmd); - pmd_clear(pmd); + orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); + page = pmd_page(orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); page_remove_rmap(page); VM_BUG_ON(page_mapcount(page) < 0); From 9e695d2ecc8451cc2c1603d60b5c8e7f5581923a Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 8 Oct 2012 16:34:29 -0700 Subject: [PATCH 160/160] sparc64: Support transparent huge pages. This is relatively easy since PMD's now cover exactly 4MB of memory. Our PMD entries are 32-bits each, so we use a special encoding. The lowest bit, PMD_ISHUGE, determines the interpretation. This is possible because sparc64's page tables are purely software entities so we can use whatever encoding scheme we want. We just have to make the TLB miss assembler page table walkers aware of the layout. set_pmd_at() works much like set_pte_at() but it has to operate in two regimes. In the first we are going from a non-huge-page to a huge-page mapping, tearing down the mappings one page at a time from a table of non-huge PTEs, so we have to queue up TLB flushes based upon what mappings are valid in the PTE table. In the second regime we are going from huge-page to non-huge-page, and in that case we need only queue up a single TLB flush to push out the huge page mapping. We still have 5 bits remaining in the huge PMD encoding so we can very likely support any new pieces of THP state tracking that might get added in the future. With lots of help from Johannes Weiner. Signed-off-by: David S.
Miller Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Gerald Schaefer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/hugetlb.h | 5 +- arch/sparc/include/asm/mmu_64.h | 2 +- arch/sparc/include/asm/mmu_context_64.h | 2 +- arch/sparc/include/asm/page_64.h | 7 +- arch/sparc/include/asm/pgalloc_64.h | 4 +- arch/sparc/include/asm/pgtable_64.h | 177 +++++++++++++++++++- arch/sparc/include/asm/tsb.h | 94 ++++++++++- arch/sparc/kernel/sun4v_tlb_miss.S | 2 +- arch/sparc/kernel/tsb.S | 9 +- arch/sparc/mm/fault_64.c | 4 +- arch/sparc/mm/hugetlbpage.c | 50 ------ arch/sparc/mm/init_64.c | 204 ++++++++++++++++++++++-- arch/sparc/mm/tlb.c | 136 +++++++++++++--- arch/sparc/mm/tsb.c | 14 +- 14 files changed, 591 insertions(+), 119 deletions(-) diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index e7927c9758a1..8c5eed6d267f 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -10,7 +10,10 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void hugetlb_prefault_arch_hook(struct mm_struct *mm); +static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) +{ + hugetlb_setup(mm); +} static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h index 31977c8dd942..76092c4dd277 100644 --- a/arch/sparc/include/asm/mmu_64.h +++ b/arch/sparc/include/asm/mmu_64.h @@ -82,7 +82,7 @@ struct tsb_config { #define MM_TSB_BASE 0 -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) #define MM_TSB_HUGE 1 #define MM_NUM_TSBS 2 #else diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h index a97fd085cebe..9191ca62ed9c 100644 --- a/arch/sparc/include/asm/mmu_context_64.h +++ b/arch/sparc/include/asm/mmu_context_64.h @@ -36,7 +36,7 @@ static inline void tsb_context_switch(struct mm_struct *mm) { __tsb_context_switch(__pa(mm->pgd), &mm->context.tsb_block[0], -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) (mm->context.tsb_block[1].tsb ? 
&mm->context.tsb_block[1] : NULL) diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h index 087a5c505c69..4b39f74d6ca0 100644 --- a/arch/sparc/include/asm/page_64.h +++ b/arch/sparc/include/asm/page_64.h @@ -17,7 +17,7 @@ #define HPAGE_SHIFT 22 -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1UL)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) @@ -26,6 +26,11 @@ #ifndef __ASSEMBLY__ +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) +struct mm_struct; +extern void hugetlb_setup(struct mm_struct *mm); +#endif + #define WANT_PAGE_VIRTUAL extern void _clear_page(void *page); diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h index 0ebca93ef0f5..bcfe063bce23 100644 --- a/arch/sparc/include/asm/pgalloc_64.h +++ b/arch/sparc/include/asm/pgalloc_64.h @@ -45,8 +45,8 @@ extern pgtable_t pte_alloc_one(struct mm_struct *mm, extern void pte_free_kernel(struct mm_struct *mm, pte_t *pte); extern void pte_free(struct mm_struct *mm, pgtable_t ptepage); -#define pmd_populate_kernel(MM, PMD, PTE) pmd_set(PMD, PTE) -#define pmd_populate(MM, PMD, PTE) pmd_set(PMD, PTE) +#define pmd_populate_kernel(MM, PMD, PTE) pmd_set(MM, PMD, PTE) +#define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE) #define pmd_pgtable(PMD) ((pte_t *)__pmd_page(PMD)) #define check_pgt_cache() do { } while (0) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index af3cd7a9e9ac..95515f1e7cef 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -63,10 +63,31 @@ #error Page table parameters do not cover virtual address space properly. #endif +#if (PMD_SHIFT != HPAGE_SHIFT) +#error PMD_SHIFT must equal HPAGE_SHIFT for transparent huge pages. +#endif + /* PMDs point to PTE tables which are 4K aligned. */ #define PMD_PADDR _AC(0xfffffffe,UL) #define PMD_PADDR_SHIFT _AC(11,UL) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define PMD_ISHUGE _AC(0x00000001,UL) + +/* This is the PMD layout when PMD_ISHUGE is set. With 4MB huge + * pages, this frees up a bunch of bits in the layout that we can + * use for the protection settings and software metadata. + */ +#define PMD_HUGE_PADDR _AC(0xfffff800,UL) +#define PMD_HUGE_PROTBITS _AC(0x000007ff,UL) +#define PMD_HUGE_PRESENT _AC(0x00000400,UL) +#define PMD_HUGE_WRITE _AC(0x00000200,UL) +#define PMD_HUGE_DIRTY _AC(0x00000100,UL) +#define PMD_HUGE_ACCESSED _AC(0x00000080,UL) +#define PMD_HUGE_EXEC _AC(0x00000040,UL) +#define PMD_HUGE_SPLITTING _AC(0x00000020,UL) +#endif + /* PGDs point to PMD tables which are 8K aligned. */ #define PGD_PADDR _AC(0xfffffffc,UL) #define PGD_PADDR_SHIFT _AC(11,UL) @@ -219,6 +240,19 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) } #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot); +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) + +extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + /* Do nothing, mk_pmd() does this part. */ + return pmd; +} +#endif + /* This one can be done with two shifts. 
*/ static inline unsigned long pte_pfn(pte_t pte) { @@ -588,19 +622,130 @@ static inline unsigned long pte_special(pte_t pte) return pte_val(pte) & _PAGE_SPECIAL; } -#define pmd_set(pmdp, ptep) \ - (pmd_val(*(pmdp)) = (__pa((unsigned long) (ptep)) >> PMD_PADDR_SHIFT)) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_young(pmd_t pmd) +{ + return pmd_val(pmd) & PMD_HUGE_ACCESSED; +} + +static inline int pmd_write(pmd_t pmd) +{ + return pmd_val(pmd) & PMD_HUGE_WRITE; +} + +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + unsigned long val = pmd_val(pmd) & PMD_HUGE_PADDR; + + return val >> (PAGE_SHIFT - PMD_PADDR_SHIFT); +} + +static inline int pmd_large(pmd_t pmd) +{ + return (pmd_val(pmd) & (PMD_ISHUGE | PMD_HUGE_PRESENT)) == + (PMD_ISHUGE | PMD_HUGE_PRESENT); +} + +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return (pmd_val(pmd) & (PMD_ISHUGE|PMD_HUGE_SPLITTING)) == + (PMD_ISHUGE|PMD_HUGE_SPLITTING); +} + +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) & PMD_ISHUGE; +} + +#define has_transparent_hugepage() 1 + +static inline pmd_t pmd_mkold(pmd_t pmd) +{ + pmd_val(pmd) &= ~PMD_HUGE_ACCESSED; + return pmd; +} + +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + pmd_val(pmd) &= ~PMD_HUGE_WRITE; + return pmd; +} + +static inline pmd_t pmd_mkdirty(pmd_t pmd) +{ + pmd_val(pmd) |= PMD_HUGE_DIRTY; + return pmd; +} + +static inline pmd_t pmd_mkyoung(pmd_t pmd) +{ + pmd_val(pmd) |= PMD_HUGE_ACCESSED; + return pmd; +} + +static inline pmd_t pmd_mkwrite(pmd_t pmd) +{ + pmd_val(pmd) |= PMD_HUGE_WRITE; + return pmd; +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + pmd_val(pmd) &= ~PMD_HUGE_PRESENT; + return pmd; +} + +static inline pmd_t pmd_mksplitting(pmd_t pmd) +{ + pmd_val(pmd) |= PMD_HUGE_SPLITTING; + return pmd; +} + +extern pgprot_t pmd_pgprot(pmd_t entry); +#endif + +static inline int pmd_present(pmd_t pmd) +{ + return pmd_val(pmd) != 0U; +} + +#define pmd_none(pmd) (!pmd_val(pmd)) + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd); +#else +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + *pmdp = pmd; +} +#endif + +static inline void pmd_set(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) +{ + unsigned long val = __pa((unsigned long) (ptep)) >> PMD_PADDR_SHIFT; + + pmd_val(*pmdp) = val; +} + #define pud_set(pudp, pmdp) \ (pud_val(*(pudp)) = (__pa((unsigned long) (pmdp)) >> PGD_PADDR_SHIFT)) -#define __pmd_page(pmd) \ - ((unsigned long) __va((((unsigned long)pmd_val(pmd))<> PAGE_SHIFT) & 0xffUL) diff --git a/arch/sparc/include/asm/tsb.h b/arch/sparc/include/asm/tsb.h index ef8cd1a174f1..b4c258de4443 100644 --- a/arch/sparc/include/asm/tsb.h +++ b/arch/sparc/include/asm/tsb.h @@ -157,10 +157,86 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end; andn REG2, 0x7, REG2; \ add REG1, REG2, REG1; - /* Do a user page table walk in MMU globals. Leaves physical PTE - * pointer in REG1. Jumps to FAIL_LABEL on early page table walk - * termination. Physical base of page tables is in PHYS_PGD which - * will not be modified. + /* This macro exists only to make the PMD translator below easier + * to read. It hides the ELF section switch for the sun4v code + * patching. 
+ */ +#define OR_PTE_BIT(REG, NAME) \ +661: or REG, _PAGE_##NAME##_4U, REG; \ + .section .sun4v_1insn_patch, "ax"; \ + .word 661b; \ + or REG, _PAGE_##NAME##_4V, REG; \ + .previous; + + /* Load into REG the PTE value for VALID, CACHE, and SZHUGE. */ +#define BUILD_PTE_VALID_SZHUGE_CACHE(REG) \ +661: sethi %uhi(_PAGE_VALID|_PAGE_SZHUGE_4U), REG; \ + .section .sun4v_1insn_patch, "ax"; \ + .word 661b; \ + sethi %uhi(_PAGE_VALID), REG; \ + .previous; \ + sllx REG, 32, REG; \ +661: or REG, _PAGE_CP_4U|_PAGE_CV_4U, REG; \ + .section .sun4v_1insn_patch, "ax"; \ + .word 661b; \ + or REG, _PAGE_CP_4V|_PAGE_CV_4V|_PAGE_SZHUGE_4V, REG; \ + .previous; + + /* PMD has been loaded into REG1, interpret the value, seeing + * if it is a HUGE PMD or a normal one. If it is not valid + * then jump to FAIL_LABEL. If it is a HUGE PMD, and it + * translates to a valid PTE, branch to PTE_LABEL. + * + * We translate the PMD by hand, one bit at a time, + * constructing the huge PTE. + * + * So we construct the PTE in REG2 as follows: + * + * 1) Extract the PMD PFN from REG1 and place it into REG2. + * + * 2) Translate PMD protection bits in REG1 into REG2, one bit + * at a time using andcc tests on REG1 and OR's into REG2. + * + * Only two bits to be concerned with here, EXEC and WRITE. + * Now REG1 is freed up and we can use it as a temporary. + * + * 3) Construct the VALID, CACHE, and page size PTE bits in + * REG1, OR with REG2 to form final PTE. + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \ + brz,pn REG1, FAIL_LABEL; \ + andcc REG1, PMD_ISHUGE, %g0; \ + be,pt %xcc, 700f; \ + and REG1, PMD_HUGE_PRESENT|PMD_HUGE_ACCESSED, REG2; \ + cmp REG2, PMD_HUGE_PRESENT|PMD_HUGE_ACCESSED; \ + bne,pn %xcc, FAIL_LABEL; \ + andn REG1, PMD_HUGE_PROTBITS, REG2; \ + sllx REG2, PMD_PADDR_SHIFT, REG2; \ + /* REG2 now holds PFN << PAGE_SHIFT */ \ + andcc REG1, PMD_HUGE_EXEC, %g0; \ + bne,a,pt %xcc, 1f; \ + OR_PTE_BIT(REG2, EXEC); \ +1: andcc REG1, PMD_HUGE_WRITE, %g0; \ + bne,a,pt %xcc, 1f; \ + OR_PTE_BIT(REG2, W); \ + /* REG1 can now be clobbered, build final PTE */ \ +1: BUILD_PTE_VALID_SZHUGE_CACHE(REG1); \ + ba,pt %xcc, PTE_LABEL; \ + or REG1, REG2, REG1; \ +700: +#else +#define USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \ + brz,pn REG1, FAIL_LABEL; \ + nop; +#endif + + /* Do a user page table walk in MMU globals. Leaves final, + * valid, PTE value in REG1. Jumps to FAIL_LABEL on early + * page table walk termination or if the PTE is not valid. + * + * Physical base of page tables is in PHYS_PGD which will not + * be modified. * * VADDR will not be clobbered, but REG1 and REG2 will. */ @@ -175,12 +251,16 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end; sllx REG1, PGD_PADDR_SHIFT, REG1; \ andn REG2, 0x3, REG2; \ lduwa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \ - brz,pn REG1, FAIL_LABEL; \ - sllx VADDR, 64 - PMD_SHIFT, REG2; \ + USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, 800f) \ + sllx VADDR, 64 - PMD_SHIFT, REG2; \ srlx REG2, 64 - (PAGE_SHIFT - 1), REG2; \ sllx REG1, PMD_PADDR_SHIFT, REG1; \ andn REG2, 0x7, REG2; \ - add REG1, REG2, REG1; + add REG1, REG2, REG1; \ + ldxa [REG1] ASI_PHYS_USE_EC, REG1; \ + brgez,pn REG1, FAIL_LABEL; \ + nop; \ +800: /* Lookup a OBP mapping on VADDR in the prom_trans[] table at TL>0. * If no entry is found, FAIL_LABEL will be branched to. 
On success diff --git a/arch/sparc/kernel/sun4v_tlb_miss.S b/arch/sparc/kernel/sun4v_tlb_miss.S index e1fbf8c75787..bde867fd71e8 100644 --- a/arch/sparc/kernel/sun4v_tlb_miss.S +++ b/arch/sparc/kernel/sun4v_tlb_miss.S @@ -176,7 +176,7 @@ sun4v_tsb_miss_common: sub %g2, TRAP_PER_CPU_FAULT_INFO, %g2 -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) mov SCRATCHPAD_UTSBREG2, %g5 ldxa [%g5] ASI_SCRATCHPAD, %g5 cmp %g5, -1 diff --git a/arch/sparc/kernel/tsb.S b/arch/sparc/kernel/tsb.S index db15d123f054..d4bdc7a62375 100644 --- a/arch/sparc/kernel/tsb.S +++ b/arch/sparc/kernel/tsb.S @@ -49,7 +49,7 @@ tsb_miss_page_table_walk: /* Before committing to a full page table walk, * check the huge page TSB. */ -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) 661: ldx [%g7 + TRAP_PER_CPU_TSB_HUGE], %g5 nop @@ -110,12 +110,9 @@ tsb_miss_page_table_walk: tsb_miss_page_table_walk_sun4v_fastpath: USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault) - /* Load and check PTE. */ - ldxa [%g5] ASI_PHYS_USE_EC, %g5 - brgez,pn %g5, tsb_do_fault - nop + /* Valid PTE is now in %g5. */ -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) 661: sethi %uhi(_PAGE_SZALL_4U), %g7 sllx %g7, 32, %g7 .section .sun4v_2insn_patch, "ax" diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 413d29263304..2976dba1ebaf 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -465,13 +465,13 @@ good_area: up_read(&mm->mmap_sem); mm_rss = get_mm_rss(mm); -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) mm_rss -= (mm->context.huge_pte_count * (HPAGE_SIZE / PAGE_SIZE)); #endif if (unlikely(mm_rss > mm->context.tsb_block[MM_TSB_BASE].tsb_rss_limit)) tsb_grow(mm, MM_TSB_BASE, mm_rss); -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) mm_rss = mm->context.huge_pte_count; if (unlikely(mm_rss > mm->context.tsb_block[MM_TSB_HUGE].tsb_rss_limit)) diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 07e14535375c..f76f83d5ac63 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -303,53 +303,3 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, { return NULL; } - -static void context_reload(void *__data) -{ - struct mm_struct *mm = __data; - - if (mm == current->mm) - load_secondary_context(mm); -} - -void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ - struct tsb_config *tp = &mm->context.tsb_block[MM_TSB_HUGE]; - - if (likely(tp->tsb != NULL)) - return; - - tsb_grow(mm, MM_TSB_HUGE, 0); - tsb_context_switch(mm); - smp_tsb_sync(mm); - - /* On UltraSPARC-III+ and later, configure the second half of - * the Data-TLB for huge pages. - */ - if (tlb_type == cheetah_plus) { - unsigned long ctx; - - spin_lock(&ctx_alloc_lock); - ctx = mm->context.sparc64_ctx_val; - ctx &= ~CTX_PGSZ_MASK; - ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT; - ctx |= CTX_PGSZ_HUGE << CTX_PGSZ1_SHIFT; - - if (ctx != mm->context.sparc64_ctx_val) { - /* When changing the page size fields, we - * must perform a context flush so that no - * stale entries match. This flush must - * occur with the original context register - * settings. - */ - do_flush_tlb_mm(mm); - - /* Reload the context register of all processors - * also executing in this address space. 
- */ - mm->context.sparc64_ctx_val = ctx; - on_each_cpu(context_reload, mm, 0); - } - spin_unlock(&ctx_alloc_lock); - } -} diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 12ef4ea60c88..9e28a118e6a4 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -306,12 +306,24 @@ static void flush_dcache(unsigned long pfn) } } +/* mm->context.lock must be held */ +static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_index, + unsigned long tsb_hash_shift, unsigned long address, + unsigned long tte) +{ + struct tsb *tsb = mm->context.tsb_block[tsb_index].tsb; + unsigned long tag; + + tsb += ((address >> tsb_hash_shift) & + (mm->context.tsb_block[tsb_index].tsb_nentries - 1UL)); + tag = (address >> 22UL); + tsb_insert(tsb, tag, tte); +} + void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { + unsigned long tsb_index, tsb_hash_shift, flags; struct mm_struct *mm; - struct tsb *tsb; - unsigned long tag, flags; - unsigned long tsb_index, tsb_hash_shift; pte_t pte = *ptep; if (tlb_type != hypervisor) { @@ -328,7 +340,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * spin_lock_irqsave(&mm->context.lock, flags); -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) { if ((tlb_type == hypervisor && (pte_val(pte) & _PAGE_SZALL_4V) == _PAGE_SZHUGE_4V) || @@ -340,11 +352,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * } #endif - tsb = mm->context.tsb_block[tsb_index].tsb; - tsb += ((address >> tsb_hash_shift) & - (mm->context.tsb_block[tsb_index].tsb_nentries - 1UL)); - tag = (address >> 22UL); - tsb_insert(tsb, tag, pte_val(pte)); + __update_mmu_tsb_insert(mm, tsb_index, tsb_hash_shift, + address, pte_val(pte)); spin_unlock_irqrestore(&mm->context.lock, flags); } @@ -2568,3 +2577,180 @@ void pgtable_free(void *table, bool is_page) else kmem_cache_free(pgtable_cache, table); } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot, bool for_modify) +{ + if (pgprot_val(pgprot) & _PAGE_VALID) + pmd_val(pmd) |= PMD_HUGE_PRESENT; + if (tlb_type == hypervisor) { + if (pgprot_val(pgprot) & _PAGE_WRITE_4V) + pmd_val(pmd) |= PMD_HUGE_WRITE; + if (pgprot_val(pgprot) & _PAGE_EXEC_4V) + pmd_val(pmd) |= PMD_HUGE_EXEC; + + if (!for_modify) { + if (pgprot_val(pgprot) & _PAGE_ACCESSED_4V) + pmd_val(pmd) |= PMD_HUGE_ACCESSED; + if (pgprot_val(pgprot) & _PAGE_MODIFIED_4V) + pmd_val(pmd) |= PMD_HUGE_DIRTY; + } + } else { + if (pgprot_val(pgprot) & _PAGE_WRITE_4U) + pmd_val(pmd) |= PMD_HUGE_WRITE; + if (pgprot_val(pgprot) & _PAGE_EXEC_4U) + pmd_val(pmd) |= PMD_HUGE_EXEC; + + if (!for_modify) { + if (pgprot_val(pgprot) & _PAGE_ACCESSED_4U) + pmd_val(pmd) |= PMD_HUGE_ACCESSED; + if (pgprot_val(pgprot) & _PAGE_MODIFIED_4U) + pmd_val(pmd) |= PMD_HUGE_DIRTY; + } + } + + return pmd; +} + +pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) +{ + pmd_t pmd; + + pmd_val(pmd) = (page_nr << ((PAGE_SHIFT - PMD_PADDR_SHIFT))); + pmd_val(pmd) |= PMD_ISHUGE; + pmd = pmd_set_protbits(pmd, pgprot, false); + return pmd; +} + +pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + pmd_val(pmd) &= ~(PMD_HUGE_PRESENT | + PMD_HUGE_WRITE | + PMD_HUGE_EXEC); + pmd = pmd_set_protbits(pmd, newprot, true); + return pmd; +} + +pgprot_t pmd_pgprot(pmd_t entry) +{ + unsigned long pte = 0; + + if (pmd_val(entry) & PMD_HUGE_PRESENT) + pte |= _PAGE_VALID; + + if 
(tlb_type == hypervisor) { + if (pmd_val(entry) & PMD_HUGE_PRESENT) + pte |= _PAGE_PRESENT_4V; + if (pmd_val(entry) & PMD_HUGE_EXEC) + pte |= _PAGE_EXEC_4V; + if (pmd_val(entry) & PMD_HUGE_WRITE) + pte |= _PAGE_W_4V; + if (pmd_val(entry) & PMD_HUGE_ACCESSED) + pte |= _PAGE_ACCESSED_4V; + if (pmd_val(entry) & PMD_HUGE_DIRTY) + pte |= _PAGE_MODIFIED_4V; + pte |= _PAGE_CP_4V|_PAGE_CV_4V; + } else { + if (pmd_val(entry) & PMD_HUGE_PRESENT) + pte |= _PAGE_PRESENT_4U; + if (pmd_val(entry) & PMD_HUGE_EXEC) + pte |= _PAGE_EXEC_4U; + if (pmd_val(entry) & PMD_HUGE_WRITE) + pte |= _PAGE_W_4U; + if (pmd_val(entry) & PMD_HUGE_ACCESSED) + pte |= _PAGE_ACCESSED_4U; + if (pmd_val(entry) & PMD_HUGE_DIRTY) + pte |= _PAGE_MODIFIED_4U; + pte |= _PAGE_CP_4U|_PAGE_CV_4U; + } + + return __pgprot(pte); +} + +void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + unsigned long pte, flags; + struct mm_struct *mm; + pmd_t entry = *pmd; + pgprot_t prot; + + if (!pmd_large(entry) || !pmd_young(entry)) + return; + + pte = (pmd_val(entry) & ~PMD_HUGE_PROTBITS); + pte <<= PMD_PADDR_SHIFT; + pte |= _PAGE_VALID; + + prot = pmd_pgprot(entry); + + if (tlb_type == hypervisor) + pgprot_val(prot) |= _PAGE_SZHUGE_4V; + else + pgprot_val(prot) |= _PAGE_SZHUGE_4U; + + pte |= pgprot_val(prot); + + mm = vma->vm_mm; + + spin_lock_irqsave(&mm->context.lock, flags); + + if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) + __update_mmu_tsb_insert(mm, MM_TSB_HUGE, HPAGE_SHIFT, + addr, pte); + + spin_unlock_irqrestore(&mm->context.lock, flags); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) +static void context_reload(void *__data) +{ + struct mm_struct *mm = __data; + + if (mm == current->mm) + load_secondary_context(mm); +} + +void hugetlb_setup(struct mm_struct *mm) +{ + struct tsb_config *tp = &mm->context.tsb_block[MM_TSB_HUGE]; + + if (likely(tp->tsb != NULL)) + return; + + tsb_grow(mm, MM_TSB_HUGE, 0); + tsb_context_switch(mm); + smp_tsb_sync(mm); + + /* On UltraSPARC-III+ and later, configure the second half of + * the Data-TLB for huge pages. + */ + if (tlb_type == cheetah_plus) { + unsigned long ctx; + + spin_lock(&ctx_alloc_lock); + ctx = mm->context.sparc64_ctx_val; + ctx &= ~CTX_PGSZ_MASK; + ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT; + ctx |= CTX_PGSZ_HUGE << CTX_PGSZ1_SHIFT; + + if (ctx != mm->context.sparc64_ctx_val) { + /* When changing the page size fields, we + * must perform a context flush so that no + * stale entries match. This flush must + * occur with the original context register + * settings. + */ + do_flush_tlb_mm(mm); + + /* Reload the context register of all processors + * also executing in this address space. 
+ */ + mm->context.sparc64_ctx_val = ctx; + on_each_cpu(context_reload, mm, 0); + } + spin_unlock(&ctx_alloc_lock); + } +} +#endif diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index b1f279cd00bf..3e8fec391fe0 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -43,16 +43,37 @@ void flush_tlb_pending(void) put_cpu_var(tlb_batch); } -void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, - pte_t *ptep, pte_t orig, int fullmm) +static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr, + bool exec) { struct tlb_batch *tb = &get_cpu_var(tlb_batch); unsigned long nr; vaddr &= PAGE_MASK; - if (pte_exec(orig)) + if (exec) vaddr |= 0x1UL; + nr = tb->tlb_nr; + + if (unlikely(nr != 0 && mm != tb->mm)) { + flush_tlb_pending(); + nr = 0; + } + + if (nr == 0) + tb->mm = mm; + + tb->vaddrs[nr] = vaddr; + tb->tlb_nr = ++nr; + if (nr >= TLB_BATCH_NR) + flush_tlb_pending(); + + put_cpu_var(tlb_batch); +} + +void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, + pte_t *ptep, pte_t orig, int fullmm) +{ if (tlb_type != hypervisor && pte_dirty(orig)) { unsigned long paddr, pfn = pte_pfn(orig); @@ -77,26 +98,91 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, } no_cache_flush: - - if (fullmm) { - put_cpu_var(tlb_batch); - return; - } - - nr = tb->tlb_nr; - - if (unlikely(nr != 0 && mm != tb->mm)) { - flush_tlb_pending(); - nr = 0; - } - - if (nr == 0) - tb->mm = mm; - - tb->vaddrs[nr] = vaddr; - tb->tlb_nr = ++nr; - if (nr >= TLB_BATCH_NR) - flush_tlb_pending(); - - put_cpu_var(tlb_batch); + if (!fullmm) + tlb_batch_add_one(mm, vaddr, pte_exec(orig)); } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void tlb_batch_pmd_scan(struct mm_struct *mm, unsigned long vaddr, + pmd_t pmd, bool exec) +{ + unsigned long end; + pte_t *pte; + + pte = pte_offset_map(&pmd, vaddr); + end = vaddr + HPAGE_SIZE; + while (vaddr < end) { + if (pte_val(*pte) & _PAGE_VALID) + tlb_batch_add_one(mm, vaddr, exec); + pte++; + vaddr += PAGE_SIZE; + } + pte_unmap(pte); +} + +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + pmd_t orig = *pmdp; + + *pmdp = pmd; + + if (mm == &init_mm) + return; + + if ((pmd_val(pmd) ^ pmd_val(orig)) & PMD_ISHUGE) { + if (pmd_val(pmd) & PMD_ISHUGE) + mm->context.huge_pte_count++; + else + mm->context.huge_pte_count--; + if (mm->context.huge_pte_count == 1) + hugetlb_setup(mm); + } + + if (!pmd_none(orig)) { + bool exec = ((pmd_val(orig) & PMD_HUGE_EXEC) != 0); + + addr &= HPAGE_MASK; + if (pmd_val(orig) & PMD_ISHUGE) + tlb_batch_add_one(mm, addr, exec); + else + tlb_batch_pmd_scan(mm, addr, orig, exec); + } +} + +void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +{ + struct list_head *lh = (struct list_head *) pgtable; + + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + if (!mm->pmd_huge_pte) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) mm->pmd_huge_pte); + mm->pmd_huge_pte = pgtable; +} + +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +{ + struct list_head *lh; + pgtable_t pgtable; + + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = (pgtable_t) lh->next; + list_del(lh); + } + pte_val(pgtable[0]) = 0; + pte_val(pgtable[1]) = 0; + + return pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index a35ee832baf3..7f6474347491 100644 --- 
a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -78,7 +78,7 @@ void flush_tsb_user(struct tlb_batch *tb) base = __pa(base); __flush_tsb_one(tb, PAGE_SHIFT, base, nentries); -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) if (mm->context.tsb_block[MM_TSB_HUGE].tsb) { base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb; nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries; @@ -93,7 +93,7 @@ void flush_tsb_user(struct tlb_batch *tb) #define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_8K #define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_8K -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) #define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_4MB #define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_4MB #endif @@ -190,7 +190,7 @@ static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsign case MM_TSB_BASE: hp->pgsz_idx = HV_PGSZ_IDX_BASE; break; -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) case MM_TSB_HUGE: hp->pgsz_idx = HV_PGSZ_IDX_HUGE; break; @@ -205,7 +205,7 @@ static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsign case MM_TSB_BASE: hp->pgsz_mask = HV_PGSZ_MASK_BASE; break; -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) case MM_TSB_HUGE: hp->pgsz_mask = HV_PGSZ_MASK_HUGE; break; @@ -427,7 +427,7 @@ retry_tsb_alloc: int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) unsigned long huge_pte_count; #endif unsigned int i; @@ -436,7 +436,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) mm->context.sparc64_ctx_val = 0UL; -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) /* We reset it to zero because the fork() page copying * will re-increment the counters as the parent PTEs are * copied into the child address space. @@ -459,7 +459,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) */ tsb_grow(mm, MM_TSB_BASE, get_mm_rss(mm)); -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) if (unlikely(huge_pte_count)) tsb_grow(mm, MM_TSB_HUGE, huge_pte_count); #endif
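
The sizing arithmetic behind [PATCH 155/160] (halving the PTE tables) can be checked with a small stand-alone C program. This is only a sketch: it hard-codes sparc64's 8KB base page (PAGE_SHIFT = 13), 8-byte PTEs, and 4-byte pmd/pgd entries as stated in the patches above, and it is not kernel code.

#include <assert.h>
#include <stdio.h>

/* sparc64 base pages are 8KB; each PTE is 8 bytes, each pmd/pgd entry 4 bytes. */
#define PAGE_SHIFT	13UL

int main(void)
{
	/* Old layout: a PTE table filled a whole page, i.e. PAGE_SHIFT - 3 index bits. */
	unsigned long old_pmd_shift = PAGE_SHIFT + (PAGE_SHIFT - 3);
	/* New layout: PAGE_SHIFT - 4 index bits, so a PTE table is half a page. */
	unsigned long pmd_shift = PAGE_SHIFT + (PAGE_SHIFT - 4);
	unsigned long pmd_bits = PAGE_SHIFT - 2;	/* 4-byte pmd entries */
	unsigned long pgdir_shift = pmd_shift + pmd_bits;
	unsigned long pgdir_bits = PAGE_SHIFT - 2;

	printf("PMD_SIZE: old %luMB, new %luMB (HPAGE_SIZE is 4MB)\n",
	       (1UL << old_pmd_shift) >> 20, (1UL << pmd_shift) >> 20);
	printf("PTRS_PER_PTE = %lu entries = %lu bytes (half of an 8KB page)\n",
	       1UL << (PAGE_SHIFT - 4), (1UL << (PAGE_SHIFT - 4)) * 8);

	/* The compile-time check added by the patch, done at run time here:
	 * the three levels must still cover sparc64's 44-bit virtual range.
	 */
	assert(pgdir_shift + pgdir_bits == 44);
	return 0;
}

With PAGE_SHIFT - 4 index bits a PTE table holds 512 entries (4KB, half a page), so PMD_SIZE drops from 8MB to 4MB and matches HPAGE_SIZE, while PGDIR_SHIFT + PGDIR_BITS still comes out to the required 44 bits.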
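
The one-behind PTE-table cache described in [PATCH 156/160] can be illustrated with a user-space analogue. This sketch uses calloc() in place of alloc_page(), omits the page_table_lock and the refcount-of-2 trick that lets whichever half is freed last return the page, and the function name pte_table_alloc() is invented for illustration.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE	8192UL	/* sparc64 base page */

/* Per-mm one-behind cache: when a fresh page is split into two half-page
 * PTE tables, the unused half is parked here until the next allocation.
 */
struct mm_context { void *cached_half; };

/* Analogue of pte_alloc_one(): prefer the cached half, otherwise allocate
 * a new page, hand out its first half and cache the second half.
 */
static void *pte_table_alloc(struct mm_context *ctx)
{
	if (ctx->cached_half) {
		void *half = ctx->cached_half;
		ctx->cached_half = NULL;
		return half;
	}
	char *page = calloc(1, PAGE_SIZE);	/* stand-in for alloc_page() */
	if (!page)
		return NULL;
	ctx->cached_half = page + PAGE_SIZE / 2;
	return page;
}

int main(void)
{
	struct mm_context ctx = { NULL };
	void *a = pte_table_alloc(&ctx);	/* allocates a page, caches its 2nd half */
	void *b = pte_table_alloc(&ctx);	/* served from the cache, no allocation */

	printf("a=%p b=%p, b sits %ld bytes into a's page\n",
	       a, b, (long)((char *)b - (char *)a));
	free(a);	/* this simplified model frees the whole page at once */
	return 0;
}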
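
To make the 32-bit huge-PMD encoding of [PATCH 160/160] concrete, the following stand-alone check packs a physical frame number the way pfn_pmd() does and recovers it the way pmd_pfn() does. The constants are copied from the patch; the program itself is only an illustration.

#include <assert.h>
#include <stdio.h>

/* Constants from the patch: flag bits live in the low 11 bits of the
 * 32-bit PMD, the physical frame information in the bits above them.
 */
#define PAGE_SHIFT	13UL
#define PMD_ISHUGE	0x00000001UL
#define PMD_HUGE_PADDR	0xfffff800UL
#define PMD_PADDR_SHIFT	11UL

int main(void)
{
	unsigned long paddr = 0x40000000UL;	/* a 4MB-aligned physical address */
	unsigned long pfn = paddr >> PAGE_SHIFT;

	/* pfn_pmd()-style packing: the PFN of a 4MB-aligned page is a
	 * multiple of 512, so shifting it by PAGE_SHIFT - PMD_PADDR_SHIFT
	 * (= 2) leaves the low 11 bits free for PMD_ISHUGE and friends.
	 */
	unsigned long pmd = (pfn << (PAGE_SHIFT - PMD_PADDR_SHIFT)) | PMD_ISHUGE;

	/* pmd_pfn()-style unpacking: mask the flags off and shift back. */
	unsigned long pfn2 = (pmd & PMD_HUGE_PADDR) >> (PAGE_SHIFT - PMD_PADDR_SHIFT);

	printf("pmd=%#lx recovers pfn=%#lx\n", pmd, pfn2);
	assert(pfn2 == pfn);
	assert(pmd <= 0xffffffffUL);	/* the whole thing fits a 32-bit PMD entry */
	return 0;
}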
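
The two TLB-flush regimes that set_pmd_at() has to handle can be modelled in user space as below. The helper loosely mirrors the tlb_batch_add_one()/tlb_batch_pmd_scan() split from the patch, but the "flush" is just a printf and the PTE table is a plain array, so this is a sketch of the decision, not of the kernel mechanism.

#include <stdio.h>

#define PTRS_PER_PTE	512
#define PAGE_SIZE	8192UL
#define PMD_ISHUGE	0x1UL

/* Stand-in for the kernel's TLB batching: just report the flush. */
static void queue_flush(unsigned long vaddr)
{
	printf("queue TLB flush for %#lx\n", vaddr);
}

/* Decision made when replacing an existing PMD: a huge original mapping
 * needs one flush; an original that pointed to a table of non-huge PTEs
 * needs a flush per valid sub-page mapping.
 */
static void flush_old_pmd(unsigned long old_pmd, const unsigned long *pte_table,
			  unsigned long addr)
{
	if (old_pmd & PMD_ISHUGE) {
		queue_flush(addr);	/* the single 4MB mapping */
	} else {
		for (int i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE)
			if (pte_table[i])	/* _PAGE_VALID stand-in */
				queue_flush(addr);
	}
}

int main(void)
{
	unsigned long ptes[PTRS_PER_PTE] = { 0 };

	ptes[0] = ptes[7] = 1;
	flush_old_pmd(0, ptes, 0x400000);		/* first regime: per-PTE flushes */
	flush_old_pmd(PMD_ISHUGE, NULL, 0x800000);	/* second regime: one flush */
	return 0;
}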