From 07bacb3826d613bb651297a201dd2df8dd4fdee5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 30 Jan 2014 15:45:44 -0800
Subject: [PATCH 01/25] memblock, bootmem: restore goal for alloc_low

Now we have memblock_virt_alloc_low to replace original bootmem api in
swiotlb.

But we should not use BOOTMEM_LOW_LIMIT for arch that does not support
CONFIG_NOBOOTMEM, as old api take 0.

| #define alloc_bootmem_low(x) \
|        __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
|#define alloc_bootmem_low_pages_nopanic(x) \
|        __alloc_bootmem_low_nopanic(x, PAGE_SIZE, 0)

and we have
 #define BOOTMEM_LOW_LIMIT __pa(MAX_DMA_ADDRESS)
for CONFIG_NOBOOTMEM.

Restore goal to 0 to fix ia64 crash, that Tony found.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Reported-by: Tony Luck <tony.luck@gmail.com>
Tested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bootmem.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index b388223bd4a9..db51fe4fe317 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -264,7 +264,7 @@ static inline void * __init memblock_virt_alloc_low(
 {
 	if (!align)
 		align = SMP_CACHE_BYTES;
-	return __alloc_bootmem_low(size, align, BOOTMEM_LOW_LIMIT);
+	return __alloc_bootmem_low(size, align, 0);
 }
 
 static inline void * __init memblock_virt_alloc_low_nopanic(
@@ -272,7 +272,7 @@ static inline void * __init memblock_virt_alloc_low_nopanic(
 {
 	if (!align)
 		align = SMP_CACHE_BYTES;
-	return __alloc_bootmem_low_nopanic(size, align, BOOTMEM_LOW_LIMIT);
+	return __alloc_bootmem_low_nopanic(size, align, 0);
 }
 
 static inline void * __init memblock_virt_alloc_from_nopanic(

From 54f5968db9e09de8779b5a5174719af51f1da199 Mon Sep 17 00:00:00 2001
From: Levente Kurusa <levex@linux.com>
Date: Thu, 30 Jan 2014 15:45:45 -0800
Subject: [PATCH 02/25] drivers/video/backlight/lcd.c: call put_device if
 device_register fails

Currently we kfree the container of the device which failed to register.
This is wrong as the last reference is not given up with a put_device
call.  Also, now that we have put_device() callen, we no longer need the
kfree as the new_ld->dev.release function will take care of kfreeing the
associated memory.

Signed-off-by: Levente Kurusa <levex@linux.com>
Acked-by: Jingoo Han <jg1.han@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/backlight/lcd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c
index 93cf15efc717..7de847df224f 100644
--- a/drivers/video/backlight/lcd.c
+++ b/drivers/video/backlight/lcd.c
@@ -228,7 +228,7 @@ struct lcd_device *lcd_device_register(const char *name, struct device *parent,
 
 	rc = device_register(&new_ld->dev);
 	if (rc) {
-		kfree(new_ld);
+		put_device(&new_ld->dev);
 		return ERR_PTR(rc);
 	}
 

From 0c692d07842a67d9aa6b8266a80e4ac460a5c1a2 Mon Sep 17 00:00:00 2001
From: Levente Kurusa <levex@linux.com>
Date: Thu, 30 Jan 2014 15:45:46 -0800
Subject: [PATCH 03/25] drivers/net/phy/mdio_bus.c: call put_device on
 device_register() failure

It is required to call put_device() if device_register() fails, so that
we give up the last reference to the device.  Calling put_device allows
for mdiobus_release to be executed, kfreeing the bus.

Signed-off-by: Levente Kurusa <levex@linux.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Grant Likely <grant.likely@secretlab.ca>
Cc: David Daney <david.daney@cavium.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/net/phy/mdio_bus.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 930694d3a13f..71e49000fbf3 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -150,6 +150,7 @@ int mdiobus_register(struct mii_bus *bus)
 	err = device_register(&bus->dev);
 	if (err) {
 		pr_err("mii_bus %s failed to register\n", bus->id);
+		put_device(&bus->dev);
 		return -EINVAL;
 	}
 

From 6897fc22ea01b562b55c6168592bcbd3ee62b006 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 30 Jan 2014 15:45:47 -0800
Subject: [PATCH 04/25] kernel: use lockless list for smp_call_function_single

Make smp_call_function_single and friends more efficient by using a
lockless list.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/blkdev.h |  5 +----
 include/linux/smp.h    |  6 ++++-
 kernel/smp.c           | 51 +++++++++++-------------------------------
 3 files changed, 19 insertions(+), 43 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0375654adb28..8678c4322b44 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -95,10 +95,7 @@ enum rq_cmd_type_bits {
  * as well!
  */
 struct request {
-	union {
-		struct list_head queuelist;
-		struct llist_node ll_list;
-	};
+	struct list_head queuelist;
 	union {
 		struct call_single_data csd;
 		struct work_struct mq_flush_data;
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 5da22ee42e16..3834f43f9993 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -11,12 +11,16 @@
 #include <linux/list.h>
 #include <linux/cpumask.h>
 #include <linux/init.h>
+#include <linux/llist.h>
 
 extern void cpu_idle(void);
 
 typedef void (*smp_call_func_t)(void *info);
 struct call_single_data {
-	struct list_head list;
+	union {
+		struct list_head list;
+		struct llist_node llist;
+	};
 	smp_call_func_t func;
 	void *info;
 	u16 flags;
diff --git a/kernel/smp.c b/kernel/smp.c
index bd9f94028838..4ad913e7c253 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -28,12 +28,7 @@ struct call_function_data {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
 
-struct call_single_queue {
-	struct list_head	list;
-	raw_spinlock_t		lock;
-};
-
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
 
 static int
 hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -85,12 +80,8 @@ void __init call_function_init(void)
 	void *cpu = (void *)(long)smp_processor_id();
 	int i;
 
-	for_each_possible_cpu(i) {
-		struct call_single_queue *q = &per_cpu(call_single_queue, i);
-
-		raw_spin_lock_init(&q->lock);
-		INIT_LIST_HEAD(&q->list);
-	}
+	for_each_possible_cpu(i)
+		init_llist_head(&per_cpu(call_single_queue, i));
 
 	hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
 	register_cpu_notifier(&hotplug_cfd_notifier);
@@ -141,18 +132,9 @@ static void csd_unlock(struct call_single_data *csd)
  */
 static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
 {
-	struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
-	unsigned long flags;
-	int ipi;
-
 	if (wait)
 		csd->flags |= CSD_FLAG_WAIT;
 
-	raw_spin_lock_irqsave(&dst->lock, flags);
-	ipi = list_empty(&dst->list);
-	list_add_tail(&csd->list, &dst->list);
-	raw_spin_unlock_irqrestore(&dst->lock, flags);
-
 	/*
 	 * The list addition should be visible before sending the IPI
 	 * handler locks the list to pull the entry off it because of
@@ -164,7 +146,7 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
 	 * locking and barrier primitives. Generic code isn't really
 	 * equipped to do the right thing...
 	 */
-	if (ipi)
+	if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
 		arch_send_call_function_single_ipi(cpu);
 
 	if (wait)
@@ -177,27 +159,26 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
  */
 void generic_smp_call_function_single_interrupt(void)
 {
-	struct call_single_queue *q = &__get_cpu_var(call_single_queue);
-	LIST_HEAD(list);
+	struct llist_node *entry, *next;
 
 	/*
 	 * Shouldn't receive this interrupt on a cpu that is not yet online.
 	 */
 	WARN_ON_ONCE(!cpu_online(smp_processor_id()));
 
-	raw_spin_lock(&q->lock);
-	list_replace_init(&q->list, &list);
-	raw_spin_unlock(&q->lock);
+	entry = llist_del_all(&__get_cpu_var(call_single_queue));
+	entry = llist_reverse_order(entry);
 
-	while (!list_empty(&list)) {
+	while (entry) {
 		struct call_single_data *csd;
 
-		csd = list_entry(list.next, struct call_single_data, list);
-		list_del(&csd->list);
+		next = entry->next;
 
+		csd = llist_entry(entry, struct call_single_data, llist);
 		csd->func(csd->info);
-
 		csd_unlock(csd);
+
+		entry = next;
 	}
 }
 
@@ -411,17 +392,11 @@ void smp_call_function_many(const struct cpumask *mask,
 
 	for_each_cpu(cpu, cfd->cpumask) {
 		struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
-		struct call_single_queue *dst =
-					&per_cpu(call_single_queue, cpu);
-		unsigned long flags;
 
 		csd_lock(csd);
 		csd->func = func;
 		csd->info = info;
-
-		raw_spin_lock_irqsave(&dst->lock, flags);
-		list_add_tail(&csd->list, &dst->list);
-		raw_spin_unlock_irqrestore(&dst->lock, flags);
+		llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
 	}
 
 	/* Send a message to all CPUs in the map */

From 73f945505b9bf798d8c3ee830cb330dd6d7fb4c7 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <klamm@yandex-team.ru>
Date: Thu, 30 Jan 2014 15:45:48 -0800
Subject: [PATCH 05/25] kernel/smp.c: remove cpumask_ipi

After commit 9a46ad6d6df3 ("smp: make smp_call_function_many() use logic
similar to smp_call_function_single()"), cfd->cpumask is accessed only
in smp_call_function_many().  So there is no more need to copy it into
cfd->cpumask_ipi before putting csd into the list.  The cpumask_ipi
field is obsolete and can be removed.

Signed-off-by: Roman Gushchin <klamm@yandex-team.ru>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Wang YanQing <udknight@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Cc: Shaohua Li <shli@fusionio.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/smp.c | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index 4ad913e7c253..ffee35bef179 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -23,7 +23,6 @@ enum {
 struct call_function_data {
 	struct call_single_data	__percpu *csd;
 	cpumask_var_t		cpumask;
-	cpumask_var_t		cpumask_ipi;
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -42,14 +41,8 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
 				cpu_to_node(cpu)))
 			return notifier_from_errno(-ENOMEM);
-		if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
-				cpu_to_node(cpu))) {
-			free_cpumask_var(cfd->cpumask);
-			return notifier_from_errno(-ENOMEM);
-		}
 		cfd->csd = alloc_percpu(struct call_single_data);
 		if (!cfd->csd) {
-			free_cpumask_var(cfd->cpumask_ipi);
 			free_cpumask_var(cfd->cpumask);
 			return notifier_from_errno(-ENOMEM);
 		}
@@ -62,7 +55,6 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		free_cpumask_var(cfd->cpumask);
-		free_cpumask_var(cfd->cpumask_ipi);
 		free_percpu(cfd->csd);
 		break;
 #endif
@@ -383,13 +375,6 @@ void smp_call_function_many(const struct cpumask *mask,
 	if (unlikely(!cpumask_weight(cfd->cpumask)))
 		return;
 
-	/*
-	 * After we put an entry into the list, cfd->cpumask may be cleared
-	 * again when another CPU sends another IPI for a SMP function call, so
-	 * cfd->cpumask will be zero.
-	 */
-	cpumask_copy(cfd->cpumask_ipi, cfd->cpumask);
-
 	for_each_cpu(cpu, cfd->cpumask) {
 		struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
 
@@ -400,7 +385,7 @@ void smp_call_function_many(const struct cpumask *mask,
 	}
 
 	/* Send a message to all CPUs in the map */
-	arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
+	arch_send_call_function_ipi_mask(cfd->cpumask);
 
 	if (wait) {
 		for_each_cpu(cpu, cfd->cpumask) {

From bcf1647d0899666f0fb90d176abf63bae22abb7c Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 30 Jan 2014 15:45:50 -0800
Subject: [PATCH 06/25] zsmalloc: move it under mm

This patch moves zsmalloc under mm directory.

Before that, description will explain why we have needed custom
allocator.

Zsmalloc is a new slab-based memory allocator for storing compressed
pages.  It is designed for low fragmentation and high allocation success
rate on large object, but <= PAGE_SIZE allocations.

zsmalloc differs from the kernel slab allocator in two primary ways to
achieve these design goals.

zsmalloc never requires high order page allocations to back slabs, or
"size classes" in zsmalloc terms.  Instead it allows multiple
single-order pages to be stitched together into a "zspage" which backs
the slab.  This allows for higher allocation success rate under memory
pressure.

Also, zsmalloc allows objects to span page boundaries within the zspage.
This allows for lower fragmentation than could be had with the kernel
slab allocator for objects between PAGE_SIZE/2 and PAGE_SIZE.  With the
kernel slab allocator, if a page compresses to 60% of it original size,
the memory savings gained through compression is lost in fragmentation
because another object of the same size can't be stored in the leftover
space.

This ability to span pages results in zsmalloc allocations not being
directly addressable by the user.  The user is given an
non-dereferencable handle in response to an allocation request.  That
handle must be mapped, using zs_map_object(), which returns a pointer to
the mapped region that can be used.  The mapping is necessary since the
object data may reside in two different noncontigious pages.

The zsmalloc fulfills the allocation needs for zram perfectly

[sjenning@linux.vnet.ibm.com: borrow Seth's quote]
Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: Nitin Gupta <ngupta@vflare.org>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Bob Liu <bob.liu@oracle.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Luigi Semenzato <semenzato@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Seth Jennings <sjenning@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/staging/Kconfig                       |  2 --
 drivers/staging/Makefile                      |  1 -
 drivers/staging/zram/zram_drv.h               |  3 +--
 drivers/staging/zsmalloc/Kconfig              | 24 ------------------
 drivers/staging/zsmalloc/Makefile             |  3 ---
 .../zsmalloc => include/linux}/zsmalloc.h     |  0
 mm/Kconfig                                    | 25 +++++++++++++++++++
 mm/Makefile                                   |  1 +
 .../zsmalloc/zsmalloc-main.c => mm/zsmalloc.c |  3 +--
 9 files changed, 28 insertions(+), 34 deletions(-)
 delete mode 100644 drivers/staging/zsmalloc/Kconfig
 delete mode 100644 drivers/staging/zsmalloc/Makefile
 rename {drivers/staging/zsmalloc => include/linux}/zsmalloc.h (100%)
 rename drivers/staging/zsmalloc/zsmalloc-main.c => mm/zsmalloc.c (99%)

diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 4bb6b11166b3..120d2fa9e531 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -76,8 +76,6 @@ source "drivers/staging/sep/Kconfig"
 
 source "drivers/staging/iio/Kconfig"
 
-source "drivers/staging/zsmalloc/Kconfig"
-
 source "drivers/staging/zram/Kconfig"
 
 source "drivers/staging/wlags49_h2/Kconfig"
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index 9f07e5e16094..cb19d0afa0da 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -33,7 +33,6 @@ obj-$(CONFIG_VME_BUS)		+= vme/
 obj-$(CONFIG_DX_SEP)            += sep/
 obj-$(CONFIG_IIO)		+= iio/
 obj-$(CONFIG_ZRAM)		+= zram/
-obj-$(CONFIG_ZSMALLOC)		+= zsmalloc/
 obj-$(CONFIG_WLAGS49_H2)	+= wlags49_h2/
 obj-$(CONFIG_WLAGS49_H25)	+= wlags49_h25/
 obj-$(CONFIG_FB_SM7XX)		+= sm7xxfb/
diff --git a/drivers/staging/zram/zram_drv.h b/drivers/staging/zram/zram_drv.h
index 97a3acf6ab76..d8f6596513c3 100644
--- a/drivers/staging/zram/zram_drv.h
+++ b/drivers/staging/zram/zram_drv.h
@@ -17,8 +17,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
-
-#include "../zsmalloc/zsmalloc.h"
+#include <linux/zsmalloc.h>
 
 /*
  * Some arbitrary value. This is just to catch
diff --git a/drivers/staging/zsmalloc/Kconfig b/drivers/staging/zsmalloc/Kconfig
deleted file mode 100644
index 9d1f2a24ad62..000000000000
--- a/drivers/staging/zsmalloc/Kconfig
+++ /dev/null
@@ -1,24 +0,0 @@
-config ZSMALLOC
-	bool "Memory allocator for compressed pages"
-	depends on MMU
-	default n
-	help
-	  zsmalloc is a slab-based memory allocator designed to store
-	  compressed RAM pages.  zsmalloc uses virtual memory mapping
-	  in order to reduce fragmentation.  However, this results in a
-	  non-standard allocator interface where a handle, not a pointer, is
-	  returned by an alloc().  This handle must be mapped in order to
-	  access the allocated space.
-
-config PGTABLE_MAPPING
-	bool "Use page table mapping to access object in zsmalloc"
-	depends on ZSMALLOC
-	help
-	  By default, zsmalloc uses a copy-based object mapping method to
-	  access allocations that span two pages. However, if a particular
-	  architecture (ex, ARM) performs VM mapping faster than copying,
-	  then you should select this. This causes zsmalloc to use page table
-	  mapping rather than copying for object mapping.
-
-	  You can check speed with zsmalloc benchmark[1].
-	  [1] https://github.com/spartacus06/zsmalloc
diff --git a/drivers/staging/zsmalloc/Makefile b/drivers/staging/zsmalloc/Makefile
deleted file mode 100644
index b134848a590d..000000000000
--- a/drivers/staging/zsmalloc/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-zsmalloc-y 		:= zsmalloc-main.o
-
-obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
diff --git a/drivers/staging/zsmalloc/zsmalloc.h b/include/linux/zsmalloc.h
similarity index 100%
rename from drivers/staging/zsmalloc/zsmalloc.h
rename to include/linux/zsmalloc.h
diff --git a/mm/Kconfig b/mm/Kconfig
index 723bbe04a0b0..2d9f1504d75e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -552,3 +552,28 @@ config MEM_SOFT_DIRTY
 	  it can be cleared by hands.
 
 	  See Documentation/vm/soft-dirty.txt for more details.
+
+config ZSMALLOC
+	bool "Memory allocator for compressed pages"
+	depends on MMU
+	default n
+	help
+	  zsmalloc is a slab-based memory allocator designed to store
+	  compressed RAM pages.  zsmalloc uses virtual memory mapping
+	  in order to reduce fragmentation.  However, this results in a
+	  non-standard allocator interface where a handle, not a pointer, is
+	  returned by an alloc().  This handle must be mapped in order to
+	  access the allocated space.
+
+config PGTABLE_MAPPING
+	bool "Use page table mapping to access object in zsmalloc"
+	depends on ZSMALLOC
+	help
+	  By default, zsmalloc uses a copy-based object mapping method to
+	  access allocations that span two pages. However, if a particular
+	  architecture (ex, ARM) performs VM mapping faster than copying,
+	  then you should select this. This causes zsmalloc to use page table
+	  mapping rather than copying for object mapping.
+
+	  You can check speed with zsmalloc benchmark[1].
+	  [1] https://github.com/spartacus06/zsmalloc
diff --git a/mm/Makefile b/mm/Makefile
index 305d10acd081..310c90a09264 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -60,3 +60,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
 obj-$(CONFIG_ZBUD)	+= zbud.o
+obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/mm/zsmalloc.c
similarity index 99%
rename from drivers/staging/zsmalloc/zsmalloc-main.c
rename to mm/zsmalloc.c
index 7660c87d8b2a..5d42adfcb67b 100644
--- a/drivers/staging/zsmalloc/zsmalloc-main.c
+++ b/mm/zsmalloc.c
@@ -90,8 +90,7 @@
 #include <linux/hardirq.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
-
-#include "zsmalloc.h"
+#include <linux/zsmalloc.h>
 
 /*
  * This must be power of 2 and greater than of equal to sizeof(link_free).

From cd67e10ac6997c6d1e1504e3c111b693bfdbc148 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 30 Jan 2014 15:45:52 -0800
Subject: [PATCH 07/25] zram: promote zram from staging

Zram has lived in staging for a LONG LONG time and have been
fixed/improved by many contributors so code is clean and stable now.  Of
course, there are lots of product using zram in real practice.

The major TV companys have used zram as swap since two years ago and
recently our production team released android smart phone with zram
which is used as swap, too and recently Android Kitkat start to use zram
for small memory smart phone.  And there was a report Google released
their ChromeOS with zram, too and cyanogenmod have been used zram long
time ago.  And I heard some disto have used zram block device for tmpfs.
In addition, I saw many report from many other peoples.  For example,
Lubuntu start to use it.

The benefit of zram is very clear.  With my experience, one of the
benefit was to remove jitter of video application with backgroud memory
pressure.  It would be effect of efficient memory usage by compression
but more issue is whether swap is there or not in the system.  Recent
mobile platforms have used JAVA so there are many anonymous pages.  But
embedded system normally are reluctant to use eMMC or SDCard as swap
because there is wear-leveling and latency issues so if we do not use
swap, it means we can't reclaim anoymous pages and at last, we could
encounter OOM kill.  :(

Although we have real storage as swap, it was a problem, too.  Because
it sometime ends up making system very unresponsible caused by slow swap
storage performance.

Quote from Luigi on Google
 "Since Chrome OS was mentioned: the main reason why we don't use swap
  to a disk (rotating or SSD) is because it doesn't degrade gracefully
  and leads to a bad interactive experience.  Generally we prefer to
  manage RAM at a higher level, by transparently killing and restarting
  processes.  But we noticed that zram is fast enough to be competitive
  with the latter, and it lets us make more efficient use of the
  available RAM.  " and he announced.
http://www.spinics.net/lists/linux-mm/msg57717.html

Other uses case is to use zram for block device.  Zram is block device
so anyone can format the block device and mount on it so some guys on
the internet start zram as /var/tmp.
http://forums.gentoo.org/viewtopic-t-838198-start-0.html

Let's promote zram and enhance/maintain it instead of removing.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Acked-by: Nitin Gupta <ngupta@vflare.org>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: Bob Liu <bob.liu@oracle.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Luigi Semenzato <semenzato@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Seth Jennings <sjenning@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 {drivers/staging/zram => Documentation/blockdev}/zram.txt | 0
 drivers/block/Kconfig                                     | 2 ++
 drivers/block/Makefile                                    | 1 +
 drivers/{staging => block}/zram/Kconfig                   | 0
 drivers/{staging => block}/zram/Makefile                  | 0
 drivers/{staging => block}/zram/zram_drv.c                | 0
 drivers/{staging => block}/zram/zram_drv.h                | 0
 drivers/staging/Kconfig                                   | 2 --
 drivers/staging/Makefile                                  | 1 -
 9 files changed, 3 insertions(+), 3 deletions(-)
 rename {drivers/staging/zram => Documentation/blockdev}/zram.txt (100%)
 rename drivers/{staging => block}/zram/Kconfig (100%)
 rename drivers/{staging => block}/zram/Makefile (100%)
 rename drivers/{staging => block}/zram/zram_drv.c (100%)
 rename drivers/{staging => block}/zram/zram_drv.h (100%)

diff --git a/drivers/staging/zram/zram.txt b/Documentation/blockdev/zram.txt
similarity index 100%
rename from drivers/staging/zram/zram.txt
rename to Documentation/blockdev/zram.txt
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 9ffa90c6201c..014a1cfc41c5 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -108,6 +108,8 @@ source "drivers/block/paride/Kconfig"
 
 source "drivers/block/mtip32xx/Kconfig"
 
+source "drivers/block/zram/Kconfig"
+
 config BLK_CPQ_DA
 	tristate "Compaq SMART2 support"
 	depends on PCI && VIRT_TO_BUS && 0
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 816d979c3266..02b688d1438d 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)	+= mtip32xx/
 
 obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
 obj-$(CONFIG_BLK_DEV_NULL_BLK)	+= null_blk.o
+obj-$(CONFIG_ZRAM) += zram/
 
 nvme-y		:= nvme-core.o nvme-scsi.o
 skd-y		:= skd_main.o
diff --git a/drivers/staging/zram/Kconfig b/drivers/block/zram/Kconfig
similarity index 100%
rename from drivers/staging/zram/Kconfig
rename to drivers/block/zram/Kconfig
diff --git a/drivers/staging/zram/Makefile b/drivers/block/zram/Makefile
similarity index 100%
rename from drivers/staging/zram/Makefile
rename to drivers/block/zram/Makefile
diff --git a/drivers/staging/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
similarity index 100%
rename from drivers/staging/zram/zram_drv.c
rename to drivers/block/zram/zram_drv.c
diff --git a/drivers/staging/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
similarity index 100%
rename from drivers/staging/zram/zram_drv.h
rename to drivers/block/zram/zram_drv.h
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 120d2fa9e531..040a51525b42 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -76,8 +76,6 @@ source "drivers/staging/sep/Kconfig"
 
 source "drivers/staging/iio/Kconfig"
 
-source "drivers/staging/zram/Kconfig"
-
 source "drivers/staging/wlags49_h2/Kconfig"
 
 source "drivers/staging/wlags49_h25/Kconfig"
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index cb19d0afa0da..dea056bf7ff2 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -32,7 +32,6 @@ obj-$(CONFIG_VT6656)		+= vt6656/
 obj-$(CONFIG_VME_BUS)		+= vme/
 obj-$(CONFIG_DX_SEP)            += sep/
 obj-$(CONFIG_IIO)		+= iio/
-obj-$(CONFIG_ZRAM)		+= zram/
 obj-$(CONFIG_WLAGS49_H2)	+= wlags49_h2/
 obj-$(CONFIG_WLAGS49_H25)	+= wlags49_h25/
 obj-$(CONFIG_FB_SM7XX)		+= sm7xxfb/

From 49061236a9c2e18b31617cef10d27ba136068bac Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 30 Jan 2014 15:45:54 -0800
Subject: [PATCH 08/25] zram: remove old private project comment

Remove the old private compcache project address so upcoming patches
should be sent to LKML because we Linux kernel community will take care.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/blockdev/zram.txt | 6 ------
 drivers/block/zram/Kconfig      | 1 -
 drivers/block/zram/zram_drv.c   | 1 -
 drivers/block/zram/zram_drv.h   | 1 -
 4 files changed, 9 deletions(-)

diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 765d790ae831..2eccddffa6c8 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -1,8 +1,6 @@
 zram: Compressed RAM based block devices
 ----------------------------------------
 
-Project home: http://compcache.googlecode.com/
-
 * Introduction
 
 The zram module creates RAM based block devices named /dev/zram<id>
@@ -69,9 +67,5 @@ Following shows a typical sequence of steps for using zram.
 	resets the disksize to zero. You must set the disksize again
 	before reusing the device.
 
-Please report any problems at:
- - Mailing list: linux-mm-cc at laptop dot org
- - Issue tracker: http://code.google.com/p/compcache/issues/list
-
 Nitin Gupta
 ngupta@vflare.org
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index 983314c41349..3450be850399 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -14,7 +14,6 @@ config ZRAM
 	  disks and maybe many more.
 
 	  See zram.txt for more information.
-	  Project home: <https://compcache.googlecode.com/>
 
 config ZRAM_DEBUG
 	bool "Compressed RAM block device debug support"
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 108f2733106d..134d605836ca 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -9,7 +9,6 @@
  * Released under the terms of 3-clause BSD License
  * Released under the terms of GNU General Public License Version 2.0
  *
- * Project home: http://compcache.googlecode.com
  */
 
 #define KMSG_COMPONENT "zram"
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index d8f6596513c3..92f70e8f457c 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -9,7 +9,6 @@
  * Released under the terms of 3-clause BSD License
  * Released under the terms of GNU General Public License Version 2.0
  *
- * Project home: http://compcache.googlecode.com
  */
 
 #ifndef _ZRAM_DRV_H_

From 7bfb3de8a1b3bebc2dc68d381efe27448c0584c5 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 30 Jan 2014 15:45:55 -0800
Subject: [PATCH 09/25] zram: add copyright

Add my copyright to the zram source code which I maintain.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 1 +
 drivers/block/zram/zram_drv.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 134d605836ca..f9711c520269 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -2,6 +2,7 @@
  * Compressed RAM block device
  *
  * Copyright (C) 2008, 2009, 2010  Nitin Gupta
+ *               2012, 2013 Minchan Kim
  *
  * This code is released using a dual license strategy: BSD/GPL
  * You can choose the licence that better fits your requirements.
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 92f70e8f457c..0e46953c08e9 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -2,6 +2,7 @@
  * Compressed RAM block device
  *
  * Copyright (C) 2008, 2009, 2010  Nitin Gupta
+ *               2012, 2013 Minchan Kim
  *
  * This code is released using a dual license strategy: BSD/GPL
  * You can choose the licence that better fits your requirements.

From 31fc00bb788ffde7d8d861d8b2bba798ab445992 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 30 Jan 2014 15:45:55 -0800
Subject: [PATCH 10/25] zsmalloc: add copyright

Add my copyright to the zsmalloc source code which I maintain.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/zsmalloc.h | 1 +
 mm/zsmalloc.c            | 1 +
 2 files changed, 2 insertions(+)

diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index c2eb174b97ee..e44d634e7fb7 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -2,6 +2,7 @@
  * zsmalloc memory allocator
  *
  * Copyright (C) 2011  Nitin Gupta
+ * Copyright (C) 2012, 2013 Minchan Kim
  *
  * This code is released using a dual license strategy: BSD/GPL
  * You can choose the license that better fits your requirements.
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 5d42adfcb67b..c03ca5e9fe15 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2,6 +2,7 @@
  * zsmalloc memory allocator
  *
  * Copyright (C) 2011  Nitin Gupta
+ * Copyright (C) 2012, 2013 Minchan Kim
  *
  * This code is released using a dual license strategy: BSD/GPL
  * You can choose the license that better fits your requirements.

From 6920f2cc9ed498fabb38368afaa751c411f672fb Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 30 Jan 2014 15:45:56 -0800
Subject: [PATCH 11/25] zram: add zram maintainers

Add maintainer information for zram into the MAINTAINERS file.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index a31a6e3e199f..6f76058a686b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9740,6 +9740,14 @@ T:	Mercurial http://linuxtv.org/hg/v4l-dvb
 S:	Odd Fixes
 F:	drivers/media/pci/zoran/
 
+ZRAM COMPRESSED RAM BLOCK DEVICE DRVIER
+M:	Minchan Kim <minchan@kernel.org>
+M:	Nitin Gupta <ngupta@vflare.org>
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+F:	drivers/block/zram/
+F:	Documentation/blockdev/zram.txt
+
 ZS DECSTATION Z85C30 SERIAL DRIVER
 M:	"Maciej W. Rozycki" <macro@linux-mips.org>
 S:	Maintained

From eae70d06846199afc97524ed986b910836c0abe5 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 30 Jan 2014 15:45:57 -0800
Subject: [PATCH 12/25] zsmalloc: add maintainers

tAdd adds maintainer information for zsmalloc into the MAINTAINERS file.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 6f76058a686b..b5795f031b3e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9753,6 +9753,14 @@ M:	"Maciej W. Rozycki" <macro@linux-mips.org>
 S:	Maintained
 F:	drivers/tty/serial/zs.*
 
+ZSMALLOC COMPRESSED SLAB MEMORY ALLOCATOR
+M:	Minchan Kim <minchan@kernel.org>
+M:	Nitin Gupta <ngupta@vflare.org>
+L:	linux-mm@kvack.org
+S:	Maintained
+F:	mm/zsmalloc.c
+F:	include/linux/zsmalloc.h
+
 ZSWAP COMPRESSED SWAP CACHING
 M:	Seth Jennings <sjenning@linux.vnet.ibm.com>
 L:	linux-mm@kvack.org

From da4a04126baa3be03bc566d4a2ee0944c5e783d0 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 30 Jan 2014 15:45:58 -0800
Subject: [PATCH 13/25] zram: fix race between reset and flushing pending work

Dan and Sergey reported that there is a racy between reset and flushing
of pending work so that it could make oops by freeing zram->meta in
reset while zram_slot_free can access zram->meta if new request is
adding during the race window.

This patch moves flush after taking init_lock so it prevents new request
so that it closes the race.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Jerome Marchand <jmarchan@redhat.com>
Tested-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index f9711c520269..213dfc10b783 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -553,14 +553,14 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
 	size_t index;
 	struct zram_meta *meta;
 
-	flush_work(&zram->free_work);
-
 	down_write(&zram->init_lock);
 	if (!zram->init_done) {
 		up_write(&zram->init_lock);
 		return;
 	}
 
+	flush_work(&zram->free_work);
+
 	meta = zram->meta;
 	zram->init_done = 0;
 

From 9b353db16d18f87242337e3e61a948c023505a65 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 30 Jan 2014 15:46:00 -0800
Subject: [PATCH 14/25] zram: delay pending free request in read path

Sergey reported we don't need to handle pending free request every I/O
so that this patch removes it in read path while we remain it in write
path.

Let's consider below example.

Swap subsystem ask to zram "A" block free by swap_slot_free_notify but
zram had been pended it without real freeing.  Swap subsystem allocates
"A" block for new data but request pended for a long time just handled
and zram blindly free new data on the "A" block.  :(

That's why we couldn't remove handle pending free request right before
zram-write.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Reported-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Tested-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Jerome Marchand <jmarchan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 213dfc10b783..d87c7e98fb11 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -535,7 +535,6 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 
 	if (rw == READ) {
 		down_read(&zram->lock);
-		handle_pending_slot_free(zram);
 		ret = zram_bvec_read(zram, bvec, index, offset, bio);
 		up_read(&zram->lock);
 	} else {

From 874e3cddc33f0c0f9cc08ad2b73fa0cbe7dfaa63 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 30 Jan 2014 15:46:01 -0800
Subject: [PATCH 15/25] zram: remove unnecessary free

Commit a0c516cbfc74 ("zram: don't grab mutex in zram_slot_free_noity")
introduced pending zram slot free in zram's write path in case of
missing slot free by memory allocation failure in zram_slot_free_notify
but it is not necessary because we have already freed the slot right
before overwriting.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Jerome Marchand <jmarchan@redhat.com>
Tested-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index d87c7e98fb11..ebfddd852272 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -441,14 +441,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 		goto out;
 	}
 
-	/*
-	 * zram_slot_free_notify could miss free so that let's
-	 * double check.
-	 */
-	if (unlikely(meta->table[index].handle ||
-			zram_test_flag(meta, index, ZRAM_ZERO)))
-		zram_free_page(zram, index);
-
 	ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen,
 			       meta->compress_workmem);
 

From deb0bdeb2f3d6b81d37fc778316dae46b6daab56 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 30 Jan 2014 15:46:02 -0800
Subject: [PATCH 16/25] zram: use atomic operation for stat

Some of fields in zram->stats are protected by zram->lock which is
rather coarse-grained so let's use atomic operation without explict
locking.

This patch is ready for removing dependency of zram->lock in read path
which is very coarse-grained rw_semaphore.  Of course, this patch adds
new atomic operation so it might make slow but my 12CPU test couldn't
spot any regression.  All gain/lose is marginal within stddev.

  iozone -t -T -l 12 -u 12 -r 16K -s 60M -I +Z -V 0

  ==Initial write                ==Initial write
  records: 50                    records: 50
  avg:  412875.17                avg:  415638.23
  std:   38543.12 (9.34%)        std:   36601.11 (8.81%)
  max:  521262.03                max:  502976.72
  min:  343263.13                min:  351389.12
  ==Rewrite                      ==Rewrite
  records: 50                    records: 50
  avg:  416640.34                avg:  397914.33
  std:   60798.92 (14.59%)       std:   46150.42 (11.60%)
  max:  543057.07                max:  522669.17
  min:  304071.67                min:  316588.77
  ==Read                         ==Read
  records: 50                    records: 50
  avg: 4147338.63                avg: 4070736.51
  std:  179333.25 (4.32%)        std:  223499.89 (5.49%)
  max: 4459295.28                max: 4539514.44
  min: 3753057.53                min: 3444686.31
  ==Re-read                      ==Re-read
  records: 50                    records: 50
  avg: 4096706.71                avg: 4117218.57
  std:  229735.04 (5.61%)        std:  171676.25 (4.17%)
  max: 4430012.09                max: 4459263.94
  min: 2987217.80                min: 3666904.28
  ==Reverse Read                 ==Reverse Read
  records: 50                    records: 50
  avg: 4062763.83                avg: 4078508.32
  std:  186208.46 (4.58%)        std:  172684.34 (4.23%)
  max: 4401358.78                max: 4424757.22
  min: 3381625.00                min: 3679359.94
  ==Stride read                  ==Stride read
  records: 50                    records: 50
  avg: 4094933.49                avg: 4082170.22
  std:  185710.52 (4.54%)        std:  196346.68 (4.81%)
  max: 4478241.25                max: 4460060.97
  min: 3732593.23                min: 3584125.78
  ==Random read                  ==Random read
  records: 50                    records: 50
  avg: 4031070.04                avg: 4074847.49
  std:  192065.51 (4.76%)        std:  206911.33 (5.08%)
  max: 4356931.16                max: 4399442.56
  min: 3481619.62                min: 3548372.44
  ==Mixed workload               ==Mixed workload
  records: 50                    records: 50
  avg:  149925.73                avg:  149675.54
  std:    7701.26 (5.14%)        std:    6902.09 (4.61%)
  max:  191301.56                max:  175162.05
  min:  133566.28                min:  137762.87
  ==Random write                 ==Random write
  records: 50                    records: 50
  avg:  404050.11                avg:  393021.47
  std:   58887.57 (14.57%)       std:   42813.70 (10.89%)
  max:  601798.09                max:  524533.43
  min:  325176.99                min:  313255.34
  ==Pwrite                       ==Pwrite
  records: 50                    records: 50
  avg:  411217.70                avg:  411237.96
  std:   43114.99 (10.48%)       std:   33136.29 (8.06%)
  max:  530766.79                max:  471899.76
  min:  320786.84                min:  317906.94
  ==Pread                        ==Pread
  records: 50                    records: 50
  avg: 4154908.65                avg: 4087121.92
  std:  151272.08 (3.64%)        std:  219505.04 (5.37%)
  max: 4459478.12                max: 4435857.38
  min: 3730512.41                min: 3101101.67

Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Tested-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 20 ++++++++++----------
 drivers/block/zram/zram_drv.h | 16 ++++++----------
 2 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index ebfddd852272..9ab884999420 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -104,7 +104,7 @@ static ssize_t zero_pages_show(struct device *dev,
 {
 	struct zram *zram = dev_to_zram(dev);
 
-	return sprintf(buf, "%u\n", zram->stats.pages_zero);
+	return sprintf(buf, "%u\n", atomic_read(&zram->stats.pages_zero));
 }
 
 static ssize_t orig_data_size_show(struct device *dev,
@@ -113,7 +113,7 @@ static ssize_t orig_data_size_show(struct device *dev,
 	struct zram *zram = dev_to_zram(dev);
 
 	return sprintf(buf, "%llu\n",
-		(u64)(zram->stats.pages_stored) << PAGE_SHIFT);
+		(u64)(atomic_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
 }
 
 static ssize_t compr_data_size_show(struct device *dev,
@@ -293,21 +293,21 @@ static void zram_free_page(struct zram *zram, size_t index)
 		 */
 		if (zram_test_flag(meta, index, ZRAM_ZERO)) {
 			zram_clear_flag(meta, index, ZRAM_ZERO);
-			zram->stats.pages_zero--;
+			atomic_dec(&zram->stats.pages_zero);
 		}
 		return;
 	}
 
 	if (unlikely(size > max_zpage_size))
-		zram->stats.bad_compress--;
+		atomic_dec(&zram->stats.bad_compress);
 
 	zs_free(meta->mem_pool, handle);
 
 	if (size <= PAGE_SIZE / 2)
-		zram->stats.good_compress--;
+		atomic_dec(&zram->stats.good_compress);
 
 	atomic64_sub(meta->table[index].size, &zram->stats.compr_size);
-	zram->stats.pages_stored--;
+	atomic_dec(&zram->stats.pages_stored);
 
 	meta->table[index].handle = 0;
 	meta->table[index].size = 0;
@@ -435,7 +435,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 		/* Free memory associated with this sector now. */
 		zram_free_page(zram, index);
 
-		zram->stats.pages_zero++;
+		atomic_inc(&zram->stats.pages_zero);
 		zram_set_flag(meta, index, ZRAM_ZERO);
 		ret = 0;
 		goto out;
@@ -456,7 +456,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 	}
 
 	if (unlikely(clen > max_zpage_size)) {
-		zram->stats.bad_compress++;
+		atomic_inc(&zram->stats.bad_compress);
 		clen = PAGE_SIZE;
 		src = NULL;
 		if (is_partial_io(bvec))
@@ -493,9 +493,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 
 	/* Update stats */
 	atomic64_add(clen, &zram->stats.compr_size);
-	zram->stats.pages_stored++;
+	atomic_inc(&zram->stats.pages_stored);
 	if (clen <= PAGE_SIZE / 2)
-		zram->stats.good_compress++;
+		atomic_inc(&zram->stats.good_compress);
 
 out:
 	if (is_partial_io(bvec))
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 0e46953c08e9..81b0170de369 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -68,10 +68,6 @@ struct table {
 	u8 flags;
 } __aligned(4);
 
-/*
- * All 64bit fields should only be manipulated by 64bit atomic accessors.
- * All modifications to 32bit counter should be protected by zram->lock.
- */
 struct zram_stats {
 	atomic64_t compr_size;	/* compressed size of pages stored */
 	atomic64_t num_reads;	/* failed + successful */
@@ -80,10 +76,10 @@ struct zram_stats {
 	atomic64_t failed_writes;	/* can happen when memory is too low */
 	atomic64_t invalid_io;	/* non-page-aligned I/O requests */
 	atomic64_t notify_free;	/* no. of swap slot free notifications */
-	u32 pages_zero;		/* no. of zero filled pages */
-	u32 pages_stored;	/* no. of pages currently stored */
-	u32 good_compress;	/* % of pages with compression ratio<=50% */
-	u32 bad_compress;	/* % of pages with compression ratio>=75% */
+	atomic_t pages_zero;		/* no. of zero filled pages */
+	atomic_t pages_stored;	/* no. of pages currently stored */
+	atomic_t good_compress;	/* % of pages with compression ratio<=50% */
+	atomic_t bad_compress;	/* % of pages with compression ratio>=75% */
 };
 
 struct zram_meta {
@@ -101,8 +97,8 @@ struct zram_slot_free {
 struct zram {
 	struct zram_meta *meta;
 	struct rw_semaphore lock; /* protect compression buffers, table,
-				   * 32bit stat counters against concurrent
-				   * notifications, reads and writes */
+				   * reads and writes
+				   */
 
 	struct work_struct free_work;  /* handle pending free request */
 	struct zram_slot_free *slot_free_rq; /* list head of free request */

From 92967471b67163bb1654e9b7fe99449ab70a4aaa Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 30 Jan 2014 15:46:03 -0800
Subject: [PATCH 17/25] zram: introduce zram->tb_lock

Currently, the zram table is protected by zram->lock but it's rather
coarse-grained lock and it makes hard for scalibility.

Let's use own rwlock instead of depending on zram->lock.  This patch
adds new locking so obviously, it would make slow but this patch is just
prepartion for removing coarse-grained rw_semaphore(ie, zram->lock)
which is hurdle about zram scalability.

Final patch in this patchset series will remove the lock from read-path
and change rw_semaphore with mutex in write path.  With bonus, we could
drop pending slot free mess in next patch.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Tested-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 26 +++++++++++++++++++++-----
 drivers/block/zram/zram_drv.h |  3 ++-
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9ab884999420..24e6426cf258 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -140,6 +140,7 @@ static ssize_t mem_used_total_show(struct device *dev,
 	return sprintf(buf, "%llu\n", val);
 }
 
+/* flag operations needs meta->tb_lock */
 static int zram_test_flag(struct zram_meta *meta, u32 index,
 			enum zram_pageflags flag)
 {
@@ -228,6 +229,7 @@ static struct zram_meta *zram_meta_alloc(u64 disksize)
 		goto free_table;
 	}
 
+	rwlock_init(&meta->tb_lock);
 	return meta;
 
 free_table:
@@ -280,6 +282,7 @@ static void handle_zero_page(struct bio_vec *bvec)
 	flush_dcache_page(page);
 }
 
+/* NOTE: caller should hold meta->tb_lock with write-side */
 static void zram_free_page(struct zram *zram, size_t index)
 {
 	struct zram_meta *meta = zram->meta;
@@ -319,20 +322,26 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
 	size_t clen = PAGE_SIZE;
 	unsigned char *cmem;
 	struct zram_meta *meta = zram->meta;
-	unsigned long handle = meta->table[index].handle;
+	unsigned long handle;
+	u16 size;
+
+	read_lock(&meta->tb_lock);
+	handle = meta->table[index].handle;
+	size = meta->table[index].size;
 
 	if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
+		read_unlock(&meta->tb_lock);
 		clear_page(mem);
 		return 0;
 	}
 
 	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
-	if (meta->table[index].size == PAGE_SIZE)
+	if (size == PAGE_SIZE)
 		copy_page(mem, cmem);
 	else
-		ret = lzo1x_decompress_safe(cmem, meta->table[index].size,
-						mem, &clen);
+		ret = lzo1x_decompress_safe(cmem, size,	mem, &clen);
 	zs_unmap_object(meta->mem_pool, handle);
+	read_unlock(&meta->tb_lock);
 
 	/* Should NEVER happen. Return bio error if it does. */
 	if (unlikely(ret != LZO_E_OK)) {
@@ -353,11 +362,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
 	struct zram_meta *meta = zram->meta;
 	page = bvec->bv_page;
 
+	read_lock(&meta->tb_lock);
 	if (unlikely(!meta->table[index].handle) ||
 			zram_test_flag(meta, index, ZRAM_ZERO)) {
+		read_unlock(&meta->tb_lock);
 		handle_zero_page(bvec);
 		return 0;
 	}
+	read_unlock(&meta->tb_lock);
 
 	if (is_partial_io(bvec))
 		/* Use  a temporary buffer to decompress the page */
@@ -433,10 +445,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 	if (page_zero_filled(uncmem)) {
 		kunmap_atomic(user_mem);
 		/* Free memory associated with this sector now. */
+		write_lock(&zram->meta->tb_lock);
 		zram_free_page(zram, index);
+		zram_set_flag(meta, index, ZRAM_ZERO);
+		write_unlock(&zram->meta->tb_lock);
 
 		atomic_inc(&zram->stats.pages_zero);
-		zram_set_flag(meta, index, ZRAM_ZERO);
 		ret = 0;
 		goto out;
 	}
@@ -486,10 +500,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 	 * Free memory associated with this sector
 	 * before overwriting unused sectors.
 	 */
+	write_lock(&zram->meta->tb_lock);
 	zram_free_page(zram, index);
 
 	meta->table[index].handle = handle;
 	meta->table[index].size = clen;
+	write_unlock(&zram->meta->tb_lock);
 
 	/* Update stats */
 	atomic64_add(clen, &zram->stats.compr_size);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 81b0170de369..c3f453f04974 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -83,6 +83,7 @@ struct zram_stats {
 };
 
 struct zram_meta {
+	rwlock_t tb_lock;	/* protect table */
 	void *compress_workmem;
 	void *compress_buffer;
 	struct table *table;
@@ -96,7 +97,7 @@ struct zram_slot_free {
 
 struct zram {
 	struct zram_meta *meta;
-	struct rw_semaphore lock; /* protect compression buffers, table,
+	struct rw_semaphore lock; /* protect compression buffers,
 				   * reads and writes
 				   */
 

From f614a9f48dedd2b80d1dc8bae8094842fcdb39dd Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 30 Jan 2014 15:46:04 -0800
Subject: [PATCH 18/25] zram: remove workqueue for freeing removed pending slot

Commit a0c516cbfc74 ("zram: don't grab mutex in zram_slot_free_noity")
introduced free request pending code to avoid scheduling by mutex under
spinlock and it was a mess which made code lenghty and increased
overhead.

Now, we don't need zram->lock any more to free slot so this patch
reverts it and then, tb_lock should protect it.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Tested-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 54 ++++-------------------------------
 drivers/block/zram/zram_drv.h | 10 -------
 2 files changed, 6 insertions(+), 58 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 24e6426cf258..f1a3c958d84b 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -522,20 +522,6 @@ out:
 	return ret;
 }
 
-static void handle_pending_slot_free(struct zram *zram)
-{
-	struct zram_slot_free *free_rq;
-
-	spin_lock(&zram->slot_free_lock);
-	while (zram->slot_free_rq) {
-		free_rq = zram->slot_free_rq;
-		zram->slot_free_rq = free_rq->next;
-		zram_free_page(zram, free_rq->index);
-		kfree(free_rq);
-	}
-	spin_unlock(&zram->slot_free_lock);
-}
-
 static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 			int offset, struct bio *bio, int rw)
 {
@@ -547,7 +533,6 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 		up_read(&zram->lock);
 	} else {
 		down_write(&zram->lock);
-		handle_pending_slot_free(zram);
 		ret = zram_bvec_write(zram, bvec, index, offset);
 		up_write(&zram->lock);
 	}
@@ -566,8 +551,6 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
 		return;
 	}
 
-	flush_work(&zram->free_work);
-
 	meta = zram->meta;
 	zram->init_done = 0;
 
@@ -769,40 +752,19 @@ error:
 	bio_io_error(bio);
 }
 
-static void zram_slot_free(struct work_struct *work)
-{
-	struct zram *zram;
-
-	zram = container_of(work, struct zram, free_work);
-	down_write(&zram->lock);
-	handle_pending_slot_free(zram);
-	up_write(&zram->lock);
-}
-
-static void add_slot_free(struct zram *zram, struct zram_slot_free *free_rq)
-{
-	spin_lock(&zram->slot_free_lock);
-	free_rq->next = zram->slot_free_rq;
-	zram->slot_free_rq = free_rq;
-	spin_unlock(&zram->slot_free_lock);
-}
-
 static void zram_slot_free_notify(struct block_device *bdev,
 				unsigned long index)
 {
 	struct zram *zram;
-	struct zram_slot_free *free_rq;
+	struct zram_meta *meta;
 
 	zram = bdev->bd_disk->private_data;
+	meta = zram->meta;
+
+	write_lock(&meta->tb_lock);
+	zram_free_page(zram, index);
+	write_unlock(&meta->tb_lock);
 	atomic64_inc(&zram->stats.notify_free);
-
-	free_rq = kmalloc(sizeof(struct zram_slot_free), GFP_ATOMIC);
-	if (!free_rq)
-		return;
-
-	free_rq->index = index;
-	add_slot_free(zram, free_rq);
-	schedule_work(&zram->free_work);
 }
 
 static const struct block_device_operations zram_devops = {
@@ -849,10 +811,6 @@ static int create_device(struct zram *zram, int device_id)
 	init_rwsem(&zram->lock);
 	init_rwsem(&zram->init_lock);
 
-	INIT_WORK(&zram->free_work, zram_slot_free);
-	spin_lock_init(&zram->slot_free_lock);
-	zram->slot_free_rq = NULL;
-
 	zram->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!zram->queue) {
 		pr_err("Error allocating disk queue for device %d\n",
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index c3f453f04974..d876300da6c9 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -90,20 +90,11 @@ struct zram_meta {
 	struct zs_pool *mem_pool;
 };
 
-struct zram_slot_free {
-	unsigned long index;
-	struct zram_slot_free *next;
-};
-
 struct zram {
 	struct zram_meta *meta;
 	struct rw_semaphore lock; /* protect compression buffers,
 				   * reads and writes
 				   */
-
-	struct work_struct free_work;  /* handle pending free request */
-	struct zram_slot_free *slot_free_rq; /* list head of free request */
-
 	struct request_queue *queue;
 	struct gendisk *disk;
 	int init_done;
@@ -114,7 +105,6 @@ struct zram {
 	 * we can store in a disk.
 	 */
 	u64 disksize;	/* bytes */
-	spinlock_t slot_free_lock;
 
 	struct zram_stats stats;
 };

From e46e33152eb82b8e2db7ffb3790a2a2653c34513 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 30 Jan 2014 15:46:06 -0800
Subject: [PATCH 19/25] zram: remove zram->lock in read path and change it with
 mutex

Finally, we separated zram->lock dependency from 32bit stat/ table
handling so there is no reason to use rw_semaphore between read and
write path so this patch removes the lock from read path totally and
changes rw_semaphore with mutex.  So, we could do

old:

  read-read: OK
  read-write: NO
  write-write: NO

Now:

  read-read: OK
  read-write: OK
  write-write: NO

The below data proves mixed workload performs well 11 times and there is
also enhance on write-write path because current rw-semaphore doesn't
support SPIN_ON_OWNER.  It's side effect but anyway good thing for us.

Write-related tests perform better (from 61% to 1058%) but read path has
good/bad(from -2.22% to 1.45%) but they are all marginal within stddev.

  CPU 12
  iozone -t -T -l 12 -u 12 -r 16K -s 60M -I +Z -V 0

  ==Initial write                ==Initial write
  records: 10                    records: 10
  avg:  516189.16                avg:  839907.96
  std:   22486.53 (4.36%)        std:   47902.17 (5.70%)
  max:  546970.60                max:  909910.35
  min:  481131.54                min:  751148.38
  ==Rewrite                      ==Rewrite
  records: 10                    records: 10
  avg:  509527.98                avg: 1050156.37
  std:   45799.94 (8.99%)        std:   40695.44 (3.88%)
  max:  611574.27                max: 1111929.26
  min:  443679.95                min:  980409.62
  ==Read                         ==Read
  records: 10                    records: 10
  avg: 4408624.17                avg: 4472546.76
  std:  281152.61 (6.38%)        std:  163662.78 (3.66%)
  max: 4867888.66                max: 4727351.03
  min: 4058347.69                min: 4126520.88
  ==Re-read                      ==Re-read
  records: 10                    records: 10
  avg: 4462147.53                avg: 4363257.75
  std:  283546.11 (6.35%)        std:  247292.63 (5.67%)
  max: 4912894.44                max: 4677241.75
  min: 4131386.50                min: 4035235.84
  ==Reverse Read                 ==Reverse Read
  records: 10                    records: 10
  avg: 4565865.97                avg: 4485818.08
  std:  313395.63 (6.86%)        std:  248470.10 (5.54%)
  max: 5232749.16                max: 4789749.94
  min: 4185809.62                min: 3963081.34
  ==Stride read                  ==Stride read
  records: 10                    records: 10
  avg: 4515981.80                avg: 4418806.01
  std:  211192.32 (4.68%)        std:  212837.97 (4.82%)
  max: 4889287.28                max: 4686967.22
  min: 4210362.00                min: 4083041.84
  ==Random read                  ==Random read
  records: 10                    records: 10
  avg: 4410525.23                avg: 4387093.18
  std:  236693.22 (5.37%)        std:  235285.23 (5.36%)
  max: 4713698.47                max: 4669760.62
  min: 4057163.62                min: 3952002.16
  ==Mixed workload               ==Mixed workload
  records: 10                    records: 10
  avg:  243234.25                avg: 2818677.27
  std:   28505.07 (11.72%)       std:  195569.70 (6.94%)
  max:  288905.23                max: 3126478.11
  min:  212473.16                min: 2484150.69
  ==Random write                 ==Random write
  records: 10                    records: 10
  avg:  555887.07                avg: 1053057.79
  std:   70841.98 (12.74%)       std:   35195.36 (3.34%)
  max:  683188.28                max: 1096125.73
  min:  437299.57                min:  992481.93
  ==Pwrite                       ==Pwrite
  records: 10                    records: 10
  avg:  501745.93                avg:  810363.09
  std:   16373.54 (3.26%)        std:   19245.01 (2.37%)
  max:  518724.52                max:  833359.70
  min:  464208.73                min:  765501.87
  ==Pread                        ==Pread
  records: 10                    records: 10
  avg: 4539894.60                avg: 4457680.58
  std:  197094.66 (4.34%)        std:  188965.60 (4.24%)
  max: 4877170.38                max: 4689905.53
  min: 4226326.03                min: 4095739.72

Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Tested-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 17 ++++++++---------
 drivers/block/zram/zram_drv.h |  4 +---
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index f1a3c958d84b..011e55d820b1 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -230,6 +230,7 @@ static struct zram_meta *zram_meta_alloc(u64 disksize)
 	}
 
 	rwlock_init(&meta->tb_lock);
+	mutex_init(&meta->buffer_lock);
 	return meta;
 
 free_table:
@@ -412,6 +413,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 	struct page *page;
 	unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
 	struct zram_meta *meta = zram->meta;
+	bool locked = false;
 
 	page = bvec->bv_page;
 	src = meta->compress_buffer;
@@ -431,6 +433,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 			goto out;
 	}
 
+	mutex_lock(&meta->buffer_lock);
+	locked = true;
 	user_mem = kmap_atomic(page);
 
 	if (is_partial_io(bvec)) {
@@ -457,7 +461,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 
 	ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen,
 			       meta->compress_workmem);
-
 	if (!is_partial_io(bvec)) {
 		kunmap_atomic(user_mem);
 		user_mem = NULL;
@@ -514,6 +517,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 		atomic_inc(&zram->stats.good_compress);
 
 out:
+	if (locked)
+		mutex_unlock(&meta->buffer_lock);
 	if (is_partial_io(bvec))
 		kfree(uncmem);
 
@@ -527,15 +532,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 {
 	int ret;
 
-	if (rw == READ) {
-		down_read(&zram->lock);
+	if (rw == READ)
 		ret = zram_bvec_read(zram, bvec, index, offset, bio);
-		up_read(&zram->lock);
-	} else {
-		down_write(&zram->lock);
+	else
 		ret = zram_bvec_write(zram, bvec, index, offset);
-		up_write(&zram->lock);
-	}
 
 	return ret;
 }
@@ -808,7 +808,6 @@ static int create_device(struct zram *zram, int device_id)
 {
 	int ret = -ENOMEM;
 
-	init_rwsem(&zram->lock);
 	init_rwsem(&zram->init_lock);
 
 	zram->queue = blk_alloc_queue(GFP_KERNEL);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index d876300da6c9..ad8aa35bae00 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -88,13 +88,11 @@ struct zram_meta {
 	void *compress_buffer;
 	struct table *table;
 	struct zs_pool *mem_pool;
+	struct mutex buffer_lock; /* protect compress buffers */
 };
 
 struct zram {
 	struct zram_meta *meta;
-	struct rw_semaphore lock; /* protect compression buffers,
-				   * reads and writes
-				   */
 	struct request_queue *queue;
 	struct gendisk *disk;
 	int init_done;

From 8790c71a18e5d2d93532ae250bcf5eddbba729cd Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 30 Jan 2014 15:46:08 -0800
Subject: [PATCH 20/25] mm/mempolicy.c: fix mempolicy printing in numa_maps

As a result of commit 5606e3877ad8 ("mm: numa: Migrate on reference
policy"), /proc/<pid>/numa_maps prints the mempolicy for any <pid> as
"prefer:N" for the local node, N, of the process reading the file.

This should only be printed when the mempolicy of <pid> is
MPOL_PREFERRED for node N.

If the process is actually only using the default mempolicy for local
node allocation, make sure "default" is printed as expected.

Signed-off-by: David Rientjes <rientjes@google.com>
Reported-by: Robert Lippert <rlippert@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: <stable@vger.kernel.org>	[3.7+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 873de7e542bc..ae3c8f3595d4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2930,7 +2930,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	unsigned short mode = MPOL_DEFAULT;
 	unsigned short flags = 0;
 
-	if (pol && pol != &default_policy) {
+	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
 		mode = pol->mode;
 		flags = pol->flags;
 	}

From a03208652dad18232e9ec3432df69f9c19c856ec Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Thu, 30 Jan 2014 15:46:09 -0800
Subject: [PATCH 21/25] mm/slub.c: fix page->_count corruption (again)

Commit abca7c496584 ("mm: fix slab->page _count corruption when using
slub") notes that we can not _set_ a page->counters directly, except
when using a real double-cmpxchg.  Doing so can lose updates to
->_count.

That is an absolute rule:

        You may not *set* page->counters except via a cmpxchg.

Commit abca7c496584 fixed this for the folks who have the slub
cmpxchg_double code turned off at compile time, but it left the bad case
alone.  It can still be reached, and the same bug triggered in two
cases:

1. Turning on slub debugging at runtime, which is available on
   the distro kernels that I looked at.
2. On 64-bit CPUs with no CMPXCHG16B (some early AMD x86-64
   cpus, evidently)

There are at least 3 ways we could fix this:

1. Take all of the exising calls to cmpxchg_double_slab() and
   __cmpxchg_double_slab() and convert them to take an old, new
   and target 'struct page'.
2. Do (1), but with the newly-introduced 'slub_data'.
3. Do some magic inside the two cmpxchg...slab() functions to
   pull the counters out of new_counters and only set those
   fields in page->{inuse,frozen,objects}.

I've done (2) as well, but it's a bunch more code.  This patch is an
attempt at (3).  This was the most straightforward and foolproof way
that I could think to do this.

This would also technically allow us to get rid of the ugly

#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
       defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)

in 'struct page', but leaving it alone has the added benefit that
'counters' stays 'unsigned' instead of 'unsigned long', so all the
copies that the slub code does stay a bit smaller.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 545a170ebf9f..2b1a6970e46f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -355,6 +355,21 @@ static __always_inline void slab_unlock(struct page *page)
 	__bit_spin_unlock(PG_locked, &page->flags);
 }
 
+static inline void set_page_slub_counters(struct page *page, unsigned long counters_new)
+{
+	struct page tmp;
+	tmp.counters = counters_new;
+	/*
+	 * page->counters can cover frozen/inuse/objects as well
+	 * as page->_count.  If we assign to ->counters directly
+	 * we run the risk of losing updates to page->_count, so
+	 * be careful and only assign to the fields we need.
+	 */
+	page->frozen  = tmp.frozen;
+	page->inuse   = tmp.inuse;
+	page->objects = tmp.objects;
+}
+
 /* Interrupts must be disabled (for the fallback code to work right) */
 static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 		void *freelist_old, unsigned long counters_old,
@@ -376,7 +391,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
 		if (page->freelist == freelist_old &&
 					page->counters == counters_old) {
 			page->freelist = freelist_new;
-			page->counters = counters_new;
+			set_page_slub_counters(page, counters_new);
 			slab_unlock(page);
 			return 1;
 		}
@@ -415,7 +430,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 		if (page->freelist == freelist_old &&
 					page->counters == counters_old) {
 			page->freelist = freelist_new;
-			page->counters = counters_new;
+			set_page_slub_counters(page, counters_new);
 			slab_unlock(page);
 			local_irq_restore(flags);
 			return 1;

From 24f91eba18bbfdb27e71a1aae5b3a61b67fcd091 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Thu, 30 Jan 2014 15:46:10 -0800
Subject: [PATCH 22/25] mm: don't lose the SOFT_DIRTY flag on mprotect

The SOFT_DIRTY bit shows that the content of memory was changed after a
defined point in the past.  mprotect() doesn't change the content of
memory, so it must not change the SOFT_DIRTY bit.

This bug causes a malfunction: on the first iteration all pages are
dumped.  On other iterations only pages with the SOFT_DIRTY bit are
dumped.  So if the SOFT_DIRTY bit is cleared from a page by mistake, the
page is not dumped and its content will be restored incorrectly.

This patch does nothing with _PAGE_SWP_SOFT_DIRTY, becase pte_modify()
is called only for present pages.

Fixes commit 0f8975ec4db2 ("mm: soft-dirty bits for user memory changes
tracking").

Signed-off-by: Andrey Vagin <avagin@openvz.org>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/pgtable_types.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index a83aa44bb1fb..1aa9ccd43223 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -121,7 +121,8 @@
 
 /* Set of bits not changed in pte_modify */
 #define _PAGE_CHG_MASK	(PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |		\
-			 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
+			 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY |	\
+			 _PAGE_SOFT_DIRTY)
 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
 #define _PAGE_CACHE_MASK	(_PAGE_PCD | _PAGE_PWT)

From 778c14affaf94a9e4953179d3e13a544ccce7707 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 30 Jan 2014 15:46:11 -0800
Subject: [PATCH 23/25] mm, oom: base root bonus on current usage

A 3% of system memory bonus is sometimes too excessive in comparison to
other processes.

With commit a63d83f427fb ("oom: badness heuristic rewrite"), the OOM
killer tries to avoid killing privileged tasks by subtracting 3% of
overall memory (system or cgroup) from their per-task consumption.  But
as a result, all root tasks that consume less than 3% of overall memory
are considered equal, and so it only takes 33+ privileged tasks pushing
the system out of memory for the OOM killer to do something stupid and
kill dhclient or other root-owned processes.  For example, on a 32G
machine it can't tell the difference between the 1M agetty and the 10G
fork bomb member.

The changelog describes this 3% boost as the equivalent to the global
overcommit limit being 3% higher for privileged tasks, but this is not
the same as discounting 3% of overall memory from _every privileged task
individually_ during OOM selection.

Replace the 3% of system memory bonus with a 3% of current memory usage
bonus.

By giving root tasks a bonus that is proportional to their actual size,
they remain comparable even when relatively small.  In the example
above, the OOM killer will discount the 1M agetty's 256 badness points
down to 179, and the 10G fork bomb's 262144 points down to 183500 points
and make the right choice, instead of discounting both to 0 and killing
agetty because it's first in the task list.

Signed-off-by: David Rientjes <rientjes@google.com>
Reported-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt | 4 ++--
 mm/oom_kill.c                      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 31f76178c987..f00bee144add 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1386,8 +1386,8 @@ may allocate from based on an estimation of its current memory and swap use.
 For example, if a task is using all allowed memory, its badness score will be
 1000.  If it is using half of its allowed memory, its score will be 500.
 
-There is an additional factor included in the badness score: root
-processes are given 3% extra memory over other tasks.
+There is an additional factor included in the badness score: the current memory
+and swap usage is discounted by 3% for root processes.
 
 The amount of "allowed" memory depends on the context in which the oom killer
 was called.  If it is due to the memory assigned to the allocating task's cpuset
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 37b1b1903fb2..3291e82d4352 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -178,7 +178,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * implementation used by LSMs.
 	 */
 	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
-		adj -= 30;
+		points -= (points * 3) / 100;
 
 	/* Normalize to oom_score_adj units */
 	adj *= totalpages / 1000;

From 46bf16c44b90791445975463da671521fc430cae Mon Sep 17 00:00:00 2001
From: Richard Yao <ryao@gentoo.org>
Date: Thu, 30 Jan 2014 15:46:12 -0800
Subject: [PATCH 24/25] Documentation/filesystems/vfs.txt: update
 file_operations documentation

->readv, ->writev and ->sendfile have been removed while ->show_fdinfo
has been added. The documentation should reflect this.

Signed-off-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/vfs.txt | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index deb48b5fd883..c53784c119c8 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -782,7 +782,7 @@ struct file_operations
 ----------------------
 
 This describes how the VFS can manipulate an open file. As of kernel
-3.5, the following members are defined:
+3.12, the following members are defined:
 
 struct file_operations {
 	struct module *owner;
@@ -803,9 +803,6 @@ struct file_operations {
 	int (*aio_fsync) (struct kiocb *, int datasync);
 	int (*fasync) (int, struct file *, int);
 	int (*lock) (struct file *, int, struct file_lock *);
-	ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
-	ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
-	ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *);
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 	int (*check_flags)(int);
@@ -814,6 +811,7 @@ struct file_operations {
 	ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int);
 	int (*setlease)(struct file *, long arg, struct file_lock **);
 	long (*fallocate)(struct file *, int mode, loff_t offset, loff_t len);
+	int (*show_fdinfo)(struct seq_file *m, struct file *f);
 };
 
 Again, all methods are called without any locks being held, unless
@@ -864,12 +862,6 @@ otherwise noted.
   lock: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW
   	commands
 
-  readv: called by the readv(2) system call
-
-  writev: called by the writev(2) system call
-
-  sendfile: called by the sendfile(2) system call
-
   get_unmapped_area: called by the mmap(2) system call
 
   check_flags: called by the fcntl(2) system call for F_SETFL command

From 7c094fd698de2f333fa39b6da213f880d40b9bfe Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 30 Jan 2014 15:46:14 -0800
Subject: [PATCH 25/25] memcg: fix mutex not unlocked on
 memcg_create_kmem_cache fail path

Commit 842e2873697e ("memcg: get rid of kmem_cache_dup()") introduced a
mutex for memcg_create_kmem_cache() to protect the tmp_name buffer that
holds the memcg name.  It failed to unlock the mutex if this buffer
could not be allocated.

This patch fixes the issue by appropriately unlocking the mutex if the
allocation fails.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Glauber Costa <glommer@parallels.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 19d5d4274e22..53385cd4e6f0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3400,7 +3400,7 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
 static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
 						  struct kmem_cache *s)
 {
-	struct kmem_cache *new;
+	struct kmem_cache *new = NULL;
 	static char *tmp_name = NULL;
 	static DEFINE_MUTEX(mutex);	/* protects tmp_name */
 
@@ -3416,7 +3416,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
 	if (!tmp_name) {
 		tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
 		if (!tmp_name)
-			return NULL;
+			goto out;
 	}
 
 	rcu_read_lock();
@@ -3426,12 +3426,11 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
 
 	new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
 				      (s->flags & ~SLAB_PANIC), s->ctor, s);
-
 	if (new)
 		new->allocflags |= __GFP_KMEMCG;
 	else
 		new = s;
-
+out:
 	mutex_unlock(&mutex);
 	return new;
 }