From 07bacb3826d613bb651297a201dd2df8dd4fdee5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 30 Jan 2014 15:45:44 -0800 Subject: [PATCH 01/25] memblock, bootmem: restore goal for alloc_low Now we have memblock_virt_alloc_low to replace original bootmem api in swiotlb. But we should not use BOOTMEM_LOW_LIMIT for arch that does not support CONFIG_NOBOOTMEM, as old api take 0. | #define alloc_bootmem_low(x) \ | __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) |#define alloc_bootmem_low_pages_nopanic(x) \ | __alloc_bootmem_low_nopanic(x, PAGE_SIZE, 0) and we have #define BOOTMEM_LOW_LIMIT __pa(MAX_DMA_ADDRESS) for CONFIG_NOBOOTMEM. Restore goal to 0 to fix ia64 crash, that Tony found. Signed-off-by: Yinghai Lu Reported-by: Tony Luck Tested-by: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index b388223bd4a9..db51fe4fe317 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -264,7 +264,7 @@ static inline void * __init memblock_virt_alloc_low( { if (!align) align = SMP_CACHE_BYTES; - return __alloc_bootmem_low(size, align, BOOTMEM_LOW_LIMIT); + return __alloc_bootmem_low(size, align, 0); } static inline void * __init memblock_virt_alloc_low_nopanic( @@ -272,7 +272,7 @@ static inline void * __init memblock_virt_alloc_low_nopanic( { if (!align) align = SMP_CACHE_BYTES; - return __alloc_bootmem_low_nopanic(size, align, BOOTMEM_LOW_LIMIT); + return __alloc_bootmem_low_nopanic(size, align, 0); } static inline void * __init memblock_virt_alloc_from_nopanic( From 54f5968db9e09de8779b5a5174719af51f1da199 Mon Sep 17 00:00:00 2001 From: Levente Kurusa Date: Thu, 30 Jan 2014 15:45:45 -0800 Subject: [PATCH 02/25] drivers/video/backlight/lcd.c: call put_device if device_register fails Currently we kfree the container of the device which failed to register. This is wrong as the last reference is not given up with a put_device call. Also, now that we have put_device() callen, we no longer need the kfree as the new_ld->dev.release function will take care of kfreeing the associated memory. Signed-off-by: Levente Kurusa Acked-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/lcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c index 93cf15efc717..7de847df224f 100644 --- a/drivers/video/backlight/lcd.c +++ b/drivers/video/backlight/lcd.c @@ -228,7 +228,7 @@ struct lcd_device *lcd_device_register(const char *name, struct device *parent, rc = device_register(&new_ld->dev); if (rc) { - kfree(new_ld); + put_device(&new_ld->dev); return ERR_PTR(rc); } From 0c692d07842a67d9aa6b8266a80e4ac460a5c1a2 Mon Sep 17 00:00:00 2001 From: Levente Kurusa Date: Thu, 30 Jan 2014 15:45:46 -0800 Subject: [PATCH 03/25] drivers/net/phy/mdio_bus.c: call put_device on device_register() failure It is required to call put_device() if device_register() fails, so that we give up the last reference to the device. Calling put_device allows for mdiobus_release to be executed, kfreeing the bus. Signed-off-by: Levente Kurusa Cc: Greg Kroah-Hartman Cc: Grant Likely Cc: David Daney Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/phy/mdio_bus.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 930694d3a13f..71e49000fbf3 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -150,6 +150,7 @@ int mdiobus_register(struct mii_bus *bus) err = device_register(&bus->dev); if (err) { pr_err("mii_bus %s failed to register\n", bus->id); + put_device(&bus->dev); return -EINVAL; } From 6897fc22ea01b562b55c6168592bcbd3ee62b006 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jan 2014 15:45:47 -0800 Subject: [PATCH 04/25] kernel: use lockless list for smp_call_function_single Make smp_call_function_single and friends more efficient by using a lockless list. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 5 +---- include/linux/smp.h | 6 ++++- kernel/smp.c | 51 +++++++++++------------------------------- 3 files changed, 19 insertions(+), 43 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0375654adb28..8678c4322b44 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -95,10 +95,7 @@ enum rq_cmd_type_bits { * as well! */ struct request { - union { - struct list_head queuelist; - struct llist_node ll_list; - }; + struct list_head queuelist; union { struct call_single_data csd; struct work_struct mq_flush_data; diff --git a/include/linux/smp.h b/include/linux/smp.h index 5da22ee42e16..3834f43f9993 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -11,12 +11,16 @@ #include #include #include +#include extern void cpu_idle(void); typedef void (*smp_call_func_t)(void *info); struct call_single_data { - struct list_head list; + union { + struct list_head list; + struct llist_node llist; + }; smp_call_func_t func; void *info; u16 flags; diff --git a/kernel/smp.c b/kernel/smp.c index bd9f94028838..4ad913e7c253 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -28,12 +28,7 @@ struct call_function_data { static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); -struct call_single_queue { - struct list_head list; - raw_spinlock_t lock; -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -85,12 +80,8 @@ void __init call_function_init(void) void *cpu = (void *)(long)smp_processor_id(); int i; - for_each_possible_cpu(i) { - struct call_single_queue *q = &per_cpu(call_single_queue, i); - - raw_spin_lock_init(&q->lock); - INIT_LIST_HEAD(&q->list); - } + for_each_possible_cpu(i) + init_llist_head(&per_cpu(call_single_queue, i)); hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); register_cpu_notifier(&hotplug_cfd_notifier); @@ -141,18 +132,9 @@ static void csd_unlock(struct call_single_data *csd) */ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) { - struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); - unsigned long flags; - int ipi; - if (wait) csd->flags |= CSD_FLAG_WAIT; - raw_spin_lock_irqsave(&dst->lock, flags); - ipi = list_empty(&dst->list); - list_add_tail(&csd->list, &dst->list); - raw_spin_unlock_irqrestore(&dst->lock, flags); - /* * The list addition should be visible before sending the IPI * handler locks the list to pull the entry off it because of @@ -164,7 +146,7 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) * locking and barrier primitives. Generic code isn't really * equipped to do the right thing... */ - if (ipi) + if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) arch_send_call_function_single_ipi(cpu); if (wait) @@ -177,27 +159,26 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) */ void generic_smp_call_function_single_interrupt(void) { - struct call_single_queue *q = &__get_cpu_var(call_single_queue); - LIST_HEAD(list); + struct llist_node *entry, *next; /* * Shouldn't receive this interrupt on a cpu that is not yet online. */ WARN_ON_ONCE(!cpu_online(smp_processor_id())); - raw_spin_lock(&q->lock); - list_replace_init(&q->list, &list); - raw_spin_unlock(&q->lock); + entry = llist_del_all(&__get_cpu_var(call_single_queue)); + entry = llist_reverse_order(entry); - while (!list_empty(&list)) { + while (entry) { struct call_single_data *csd; - csd = list_entry(list.next, struct call_single_data, list); - list_del(&csd->list); + next = entry->next; + csd = llist_entry(entry, struct call_single_data, llist); csd->func(csd->info); - csd_unlock(csd); + + entry = next; } } @@ -411,17 +392,11 @@ void smp_call_function_many(const struct cpumask *mask, for_each_cpu(cpu, cfd->cpumask) { struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); - struct call_single_queue *dst = - &per_cpu(call_single_queue, cpu); - unsigned long flags; csd_lock(csd); csd->func = func; csd->info = info; - - raw_spin_lock_irqsave(&dst->lock, flags); - list_add_tail(&csd->list, &dst->list); - raw_spin_unlock_irqrestore(&dst->lock, flags); + llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); } /* Send a message to all CPUs in the map */ From 73f945505b9bf798d8c3ee830cb330dd6d7fb4c7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 30 Jan 2014 15:45:48 -0800 Subject: [PATCH 05/25] kernel/smp.c: remove cpumask_ipi After commit 9a46ad6d6df3 ("smp: make smp_call_function_many() use logic similar to smp_call_function_single()"), cfd->cpumask is accessed only in smp_call_function_many(). So there is no more need to copy it into cfd->cpumask_ipi before putting csd into the list. The cpumask_ipi field is obsolete and can be removed. Signed-off-by: Roman Gushchin Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Wang YanQing Cc: Xie XiuQi Cc: Shaohua Li Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index 4ad913e7c253..ffee35bef179 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -23,7 +23,6 @@ enum { struct call_function_data { struct call_single_data __percpu *csd; cpumask_var_t cpumask; - cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); @@ -42,14 +41,8 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return notifier_from_errno(-ENOMEM); - if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, - cpu_to_node(cpu))) { - free_cpumask_var(cfd->cpumask); - return notifier_from_errno(-ENOMEM); - } cfd->csd = alloc_percpu(struct call_single_data); if (!cfd->csd) { - free_cpumask_var(cfd->cpumask_ipi); free_cpumask_var(cfd->cpumask); return notifier_from_errno(-ENOMEM); } @@ -62,7 +55,6 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD: case CPU_DEAD_FROZEN: free_cpumask_var(cfd->cpumask); - free_cpumask_var(cfd->cpumask_ipi); free_percpu(cfd->csd); break; #endif @@ -383,13 +375,6 @@ void smp_call_function_many(const struct cpumask *mask, if (unlikely(!cpumask_weight(cfd->cpumask))) return; - /* - * After we put an entry into the list, cfd->cpumask may be cleared - * again when another CPU sends another IPI for a SMP function call, so - * cfd->cpumask will be zero. - */ - cpumask_copy(cfd->cpumask_ipi, cfd->cpumask); - for_each_cpu(cpu, cfd->cpumask) { struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); @@ -400,7 +385,7 @@ void smp_call_function_many(const struct cpumask *mask, } /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(cfd->cpumask_ipi); + arch_send_call_function_ipi_mask(cfd->cpumask); if (wait) { for_each_cpu(cpu, cfd->cpumask) { From bcf1647d0899666f0fb90d176abf63bae22abb7c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:50 -0800 Subject: [PATCH 06/25] zsmalloc: move it under mm This patch moves zsmalloc under mm directory. Before that, description will explain why we have needed custom allocator. Zsmalloc is a new slab-based memory allocator for storing compressed pages. It is designed for low fragmentation and high allocation success rate on large object, but <= PAGE_SIZE allocations. zsmalloc differs from the kernel slab allocator in two primary ways to achieve these design goals. zsmalloc never requires high order page allocations to back slabs, or "size classes" in zsmalloc terms. Instead it allows multiple single-order pages to be stitched together into a "zspage" which backs the slab. This allows for higher allocation success rate under memory pressure. Also, zsmalloc allows objects to span page boundaries within the zspage. This allows for lower fragmentation than could be had with the kernel slab allocator for objects between PAGE_SIZE/2 and PAGE_SIZE. With the kernel slab allocator, if a page compresses to 60% of it original size, the memory savings gained through compression is lost in fragmentation because another object of the same size can't be stored in the leftover space. This ability to span pages results in zsmalloc allocations not being directly addressable by the user. The user is given an non-dereferencable handle in response to an allocation request. That handle must be mapped, using zs_map_object(), which returns a pointer to the mapped region that can be used. The mapping is necessary since the object data may reside in two different noncontigious pages. The zsmalloc fulfills the allocation needs for zram perfectly [sjenning@linux.vnet.ibm.com: borrow Seth's quote] Signed-off-by: Minchan Kim Acked-by: Nitin Gupta Reviewed-by: Konrad Rzeszutek Wilk Cc: Bob Liu Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: Jens Axboe Cc: Luigi Semenzato Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/staging/Kconfig | 2 -- drivers/staging/Makefile | 1 - drivers/staging/zram/zram_drv.h | 3 +-- drivers/staging/zsmalloc/Kconfig | 24 ------------------ drivers/staging/zsmalloc/Makefile | 3 --- .../zsmalloc => include/linux}/zsmalloc.h | 0 mm/Kconfig | 25 +++++++++++++++++++ mm/Makefile | 1 + .../zsmalloc/zsmalloc-main.c => mm/zsmalloc.c | 3 +-- 9 files changed, 28 insertions(+), 34 deletions(-) delete mode 100644 drivers/staging/zsmalloc/Kconfig delete mode 100644 drivers/staging/zsmalloc/Makefile rename {drivers/staging/zsmalloc => include/linux}/zsmalloc.h (100%) rename drivers/staging/zsmalloc/zsmalloc-main.c => mm/zsmalloc.c (99%) diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 4bb6b11166b3..120d2fa9e531 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -76,8 +76,6 @@ source "drivers/staging/sep/Kconfig" source "drivers/staging/iio/Kconfig" -source "drivers/staging/zsmalloc/Kconfig" - source "drivers/staging/zram/Kconfig" source "drivers/staging/wlags49_h2/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 9f07e5e16094..cb19d0afa0da 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_VME_BUS) += vme/ obj-$(CONFIG_DX_SEP) += sep/ obj-$(CONFIG_IIO) += iio/ obj-$(CONFIG_ZRAM) += zram/ -obj-$(CONFIG_ZSMALLOC) += zsmalloc/ obj-$(CONFIG_WLAGS49_H2) += wlags49_h2/ obj-$(CONFIG_WLAGS49_H25) += wlags49_h25/ obj-$(CONFIG_FB_SM7XX) += sm7xxfb/ diff --git a/drivers/staging/zram/zram_drv.h b/drivers/staging/zram/zram_drv.h index 97a3acf6ab76..d8f6596513c3 100644 --- a/drivers/staging/zram/zram_drv.h +++ b/drivers/staging/zram/zram_drv.h @@ -17,8 +17,7 @@ #include #include - -#include "../zsmalloc/zsmalloc.h" +#include /* * Some arbitrary value. This is just to catch diff --git a/drivers/staging/zsmalloc/Kconfig b/drivers/staging/zsmalloc/Kconfig deleted file mode 100644 index 9d1f2a24ad62..000000000000 --- a/drivers/staging/zsmalloc/Kconfig +++ /dev/null @@ -1,24 +0,0 @@ -config ZSMALLOC - bool "Memory allocator for compressed pages" - depends on MMU - default n - help - zsmalloc is a slab-based memory allocator designed to store - compressed RAM pages. zsmalloc uses virtual memory mapping - in order to reduce fragmentation. However, this results in a - non-standard allocator interface where a handle, not a pointer, is - returned by an alloc(). This handle must be mapped in order to - access the allocated space. - -config PGTABLE_MAPPING - bool "Use page table mapping to access object in zsmalloc" - depends on ZSMALLOC - help - By default, zsmalloc uses a copy-based object mapping method to - access allocations that span two pages. However, if a particular - architecture (ex, ARM) performs VM mapping faster than copying, - then you should select this. This causes zsmalloc to use page table - mapping rather than copying for object mapping. - - You can check speed with zsmalloc benchmark[1]. - [1] https://github.com/spartacus06/zsmalloc diff --git a/drivers/staging/zsmalloc/Makefile b/drivers/staging/zsmalloc/Makefile deleted file mode 100644 index b134848a590d..000000000000 --- a/drivers/staging/zsmalloc/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -zsmalloc-y := zsmalloc-main.o - -obj-$(CONFIG_ZSMALLOC) += zsmalloc.o diff --git a/drivers/staging/zsmalloc/zsmalloc.h b/include/linux/zsmalloc.h similarity index 100% rename from drivers/staging/zsmalloc/zsmalloc.h rename to include/linux/zsmalloc.h diff --git a/mm/Kconfig b/mm/Kconfig index 723bbe04a0b0..2d9f1504d75e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -552,3 +552,28 @@ config MEM_SOFT_DIRTY it can be cleared by hands. See Documentation/vm/soft-dirty.txt for more details. + +config ZSMALLOC + bool "Memory allocator for compressed pages" + depends on MMU + default n + help + zsmalloc is a slab-based memory allocator designed to store + compressed RAM pages. zsmalloc uses virtual memory mapping + in order to reduce fragmentation. However, this results in a + non-standard allocator interface where a handle, not a pointer, is + returned by an alloc(). This handle must be mapped in order to + access the allocated space. + +config PGTABLE_MAPPING + bool "Use page table mapping to access object in zsmalloc" + depends on ZSMALLOC + help + By default, zsmalloc uses a copy-based object mapping method to + access allocations that span two pages. However, if a particular + architecture (ex, ARM) performs VM mapping faster than copying, + then you should select this. This causes zsmalloc to use page table + mapping rather than copying for object mapping. + + You can check speed with zsmalloc benchmark[1]. + [1] https://github.com/spartacus06/zsmalloc diff --git a/mm/Makefile b/mm/Makefile index 305d10acd081..310c90a09264 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -60,3 +60,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZBUD) += zbud.o +obj-$(CONFIG_ZSMALLOC) += zsmalloc.o diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/mm/zsmalloc.c similarity index 99% rename from drivers/staging/zsmalloc/zsmalloc-main.c rename to mm/zsmalloc.c index 7660c87d8b2a..5d42adfcb67b 100644 --- a/drivers/staging/zsmalloc/zsmalloc-main.c +++ b/mm/zsmalloc.c @@ -90,8 +90,7 @@ #include #include #include - -#include "zsmalloc.h" +#include /* * This must be power of 2 and greater than of equal to sizeof(link_free). From cd67e10ac6997c6d1e1504e3c111b693bfdbc148 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:52 -0800 Subject: [PATCH 07/25] zram: promote zram from staging Zram has lived in staging for a LONG LONG time and have been fixed/improved by many contributors so code is clean and stable now. Of course, there are lots of product using zram in real practice. The major TV companys have used zram as swap since two years ago and recently our production team released android smart phone with zram which is used as swap, too and recently Android Kitkat start to use zram for small memory smart phone. And there was a report Google released their ChromeOS with zram, too and cyanogenmod have been used zram long time ago. And I heard some disto have used zram block device for tmpfs. In addition, I saw many report from many other peoples. For example, Lubuntu start to use it. The benefit of zram is very clear. With my experience, one of the benefit was to remove jitter of video application with backgroud memory pressure. It would be effect of efficient memory usage by compression but more issue is whether swap is there or not in the system. Recent mobile platforms have used JAVA so there are many anonymous pages. But embedded system normally are reluctant to use eMMC or SDCard as swap because there is wear-leveling and latency issues so if we do not use swap, it means we can't reclaim anoymous pages and at last, we could encounter OOM kill. :( Although we have real storage as swap, it was a problem, too. Because it sometime ends up making system very unresponsible caused by slow swap storage performance. Quote from Luigi on Google "Since Chrome OS was mentioned: the main reason why we don't use swap to a disk (rotating or SSD) is because it doesn't degrade gracefully and leads to a bad interactive experience. Generally we prefer to manage RAM at a higher level, by transparently killing and restarting processes. But we noticed that zram is fast enough to be competitive with the latter, and it lets us make more efficient use of the available RAM. " and he announced. http://www.spinics.net/lists/linux-mm/msg57717.html Other uses case is to use zram for block device. Zram is block device so anyone can format the block device and mount on it so some guys on the internet start zram as /var/tmp. http://forums.gentoo.org/viewtopic-t-838198-start-0.html Let's promote zram and enhance/maintain it instead of removing. Signed-off-by: Minchan Kim Reviewed-by: Konrad Rzeszutek Wilk Acked-by: Nitin Gupta Acked-by: Pekka Enberg Cc: Bob Liu Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: Jens Axboe Cc: Luigi Semenzato Cc: Mel Gorman Cc: Rik van Riel Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- {drivers/staging/zram => Documentation/blockdev}/zram.txt | 0 drivers/block/Kconfig | 2 ++ drivers/block/Makefile | 1 + drivers/{staging => block}/zram/Kconfig | 0 drivers/{staging => block}/zram/Makefile | 0 drivers/{staging => block}/zram/zram_drv.c | 0 drivers/{staging => block}/zram/zram_drv.h | 0 drivers/staging/Kconfig | 2 -- drivers/staging/Makefile | 1 - 9 files changed, 3 insertions(+), 3 deletions(-) rename {drivers/staging/zram => Documentation/blockdev}/zram.txt (100%) rename drivers/{staging => block}/zram/Kconfig (100%) rename drivers/{staging => block}/zram/Makefile (100%) rename drivers/{staging => block}/zram/zram_drv.c (100%) rename drivers/{staging => block}/zram/zram_drv.h (100%) diff --git a/drivers/staging/zram/zram.txt b/Documentation/blockdev/zram.txt similarity index 100% rename from drivers/staging/zram/zram.txt rename to Documentation/blockdev/zram.txt diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 9ffa90c6201c..014a1cfc41c5 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -108,6 +108,8 @@ source "drivers/block/paride/Kconfig" source "drivers/block/mtip32xx/Kconfig" +source "drivers/block/zram/Kconfig" + config BLK_CPQ_DA tristate "Compaq SMART2 support" depends on PCI && VIRT_TO_BUS && 0 diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 816d979c3266..02b688d1438d 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o +obj-$(CONFIG_ZRAM) += zram/ nvme-y := nvme-core.o nvme-scsi.o skd-y := skd_main.o diff --git a/drivers/staging/zram/Kconfig b/drivers/block/zram/Kconfig similarity index 100% rename from drivers/staging/zram/Kconfig rename to drivers/block/zram/Kconfig diff --git a/drivers/staging/zram/Makefile b/drivers/block/zram/Makefile similarity index 100% rename from drivers/staging/zram/Makefile rename to drivers/block/zram/Makefile diff --git a/drivers/staging/zram/zram_drv.c b/drivers/block/zram/zram_drv.c similarity index 100% rename from drivers/staging/zram/zram_drv.c rename to drivers/block/zram/zram_drv.c diff --git a/drivers/staging/zram/zram_drv.h b/drivers/block/zram/zram_drv.h similarity index 100% rename from drivers/staging/zram/zram_drv.h rename to drivers/block/zram/zram_drv.h diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 120d2fa9e531..040a51525b42 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -76,8 +76,6 @@ source "drivers/staging/sep/Kconfig" source "drivers/staging/iio/Kconfig" -source "drivers/staging/zram/Kconfig" - source "drivers/staging/wlags49_h2/Kconfig" source "drivers/staging/wlags49_h25/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index cb19d0afa0da..dea056bf7ff2 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_VT6656) += vt6656/ obj-$(CONFIG_VME_BUS) += vme/ obj-$(CONFIG_DX_SEP) += sep/ obj-$(CONFIG_IIO) += iio/ -obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_WLAGS49_H2) += wlags49_h2/ obj-$(CONFIG_WLAGS49_H25) += wlags49_h25/ obj-$(CONFIG_FB_SM7XX) += sm7xxfb/ From 49061236a9c2e18b31617cef10d27ba136068bac Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:54 -0800 Subject: [PATCH 08/25] zram: remove old private project comment Remove the old private compcache project address so upcoming patches should be sent to LKML because we Linux kernel community will take care. Signed-off-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/blockdev/zram.txt | 6 ------ drivers/block/zram/Kconfig | 1 - drivers/block/zram/zram_drv.c | 1 - drivers/block/zram/zram_drv.h | 1 - 4 files changed, 9 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 765d790ae831..2eccddffa6c8 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -1,8 +1,6 @@ zram: Compressed RAM based block devices ---------------------------------------- -Project home: http://compcache.googlecode.com/ - * Introduction The zram module creates RAM based block devices named /dev/zram @@ -69,9 +67,5 @@ Following shows a typical sequence of steps for using zram. resets the disksize to zero. You must set the disksize again before reusing the device. -Please report any problems at: - - Mailing list: linux-mm-cc at laptop dot org - - Issue tracker: http://code.google.com/p/compcache/issues/list - Nitin Gupta ngupta@vflare.org diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 983314c41349..3450be850399 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -14,7 +14,6 @@ config ZRAM disks and maybe many more. See zram.txt for more information. - Project home: config ZRAM_DEBUG bool "Compressed RAM block device debug support" diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 108f2733106d..134d605836ca 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -9,7 +9,6 @@ * Released under the terms of 3-clause BSD License * Released under the terms of GNU General Public License Version 2.0 * - * Project home: http://compcache.googlecode.com */ #define KMSG_COMPONENT "zram" diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index d8f6596513c3..92f70e8f457c 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -9,7 +9,6 @@ * Released under the terms of 3-clause BSD License * Released under the terms of GNU General Public License Version 2.0 * - * Project home: http://compcache.googlecode.com */ #ifndef _ZRAM_DRV_H_ From 7bfb3de8a1b3bebc2dc68d381efe27448c0584c5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:55 -0800 Subject: [PATCH 09/25] zram: add copyright Add my copyright to the zram source code which I maintain. Signed-off-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 1 + drivers/block/zram/zram_drv.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 134d605836ca..f9711c520269 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2,6 +2,7 @@ * Compressed RAM block device * * Copyright (C) 2008, 2009, 2010 Nitin Gupta + * 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the licence that better fits your requirements. diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 92f70e8f457c..0e46953c08e9 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -2,6 +2,7 @@ * Compressed RAM block device * * Copyright (C) 2008, 2009, 2010 Nitin Gupta + * 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the licence that better fits your requirements. From 31fc00bb788ffde7d8d861d8b2bba798ab445992 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:55 -0800 Subject: [PATCH 10/25] zsmalloc: add copyright Add my copyright to the zsmalloc source code which I maintain. Signed-off-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zsmalloc.h | 1 + mm/zsmalloc.c | 1 + 2 files changed, 2 insertions(+) diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index c2eb174b97ee..e44d634e7fb7 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -2,6 +2,7 @@ * zsmalloc memory allocator * * Copyright (C) 2011 Nitin Gupta + * Copyright (C) 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the license that better fits your requirements. diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5d42adfcb67b..c03ca5e9fe15 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2,6 +2,7 @@ * zsmalloc memory allocator * * Copyright (C) 2011 Nitin Gupta + * Copyright (C) 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the license that better fits your requirements. From 6920f2cc9ed498fabb38368afaa751c411f672fb Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:56 -0800 Subject: [PATCH 11/25] zram: add zram maintainers Add maintainer information for zram into the MAINTAINERS file. Signed-off-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a31a6e3e199f..6f76058a686b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9740,6 +9740,14 @@ T: Mercurial http://linuxtv.org/hg/v4l-dvb S: Odd Fixes F: drivers/media/pci/zoran/ +ZRAM COMPRESSED RAM BLOCK DEVICE DRVIER +M: Minchan Kim +M: Nitin Gupta +L: linux-kernel@vger.kernel.org +S: Maintained +F: drivers/block/zram/ +F: Documentation/blockdev/zram.txt + ZS DECSTATION Z85C30 SERIAL DRIVER M: "Maciej W. Rozycki" S: Maintained From eae70d06846199afc97524ed986b910836c0abe5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:57 -0800 Subject: [PATCH 12/25] zsmalloc: add maintainers tAdd adds maintainer information for zsmalloc into the MAINTAINERS file. Signed-off-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6f76058a686b..b5795f031b3e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9753,6 +9753,14 @@ M: "Maciej W. Rozycki" S: Maintained F: drivers/tty/serial/zs.* +ZSMALLOC COMPRESSED SLAB MEMORY ALLOCATOR +M: Minchan Kim +M: Nitin Gupta +L: linux-mm@kvack.org +S: Maintained +F: mm/zsmalloc.c +F: include/linux/zsmalloc.h + ZSWAP COMPRESSED SWAP CACHING M: Seth Jennings L: linux-mm@kvack.org From da4a04126baa3be03bc566d4a2ee0944c5e783d0 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:58 -0800 Subject: [PATCH 13/25] zram: fix race between reset and flushing pending work Dan and Sergey reported that there is a racy between reset and flushing of pending work so that it could make oops by freeing zram->meta in reset while zram_slot_free can access zram->meta if new request is adding during the race window. This patch moves flush after taking init_lock so it prevents new request so that it closes the race. Signed-off-by: Minchan Kim Reported-by: Dan Carpenter Cc: Nitin Gupta Cc: Jerome Marchand Tested-by: Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f9711c520269..213dfc10b783 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -553,14 +553,14 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) size_t index; struct zram_meta *meta; - flush_work(&zram->free_work); - down_write(&zram->init_lock); if (!zram->init_done) { up_write(&zram->init_lock); return; } + flush_work(&zram->free_work); + meta = zram->meta; zram->init_done = 0; From 9b353db16d18f87242337e3e61a948c023505a65 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:46:00 -0800 Subject: [PATCH 14/25] zram: delay pending free request in read path Sergey reported we don't need to handle pending free request every I/O so that this patch removes it in read path while we remain it in write path. Let's consider below example. Swap subsystem ask to zram "A" block free by swap_slot_free_notify but zram had been pended it without real freeing. Swap subsystem allocates "A" block for new data but request pended for a long time just handled and zram blindly free new data on the "A" block. :( That's why we couldn't remove handle pending free request right before zram-write. Signed-off-by: Minchan Kim Reported-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Cc: Nitin Gupta Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 213dfc10b783..d87c7e98fb11 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -535,7 +535,6 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, if (rw == READ) { down_read(&zram->lock); - handle_pending_slot_free(zram); ret = zram_bvec_read(zram, bvec, index, offset, bio); up_read(&zram->lock); } else { From 874e3cddc33f0c0f9cc08ad2b73fa0cbe7dfaa63 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:46:01 -0800 Subject: [PATCH 15/25] zram: remove unnecessary free Commit a0c516cbfc74 ("zram: don't grab mutex in zram_slot_free_noity") introduced pending zram slot free in zram's write path in case of missing slot free by memory allocation failure in zram_slot_free_notify but it is not necessary because we have already freed the slot right before overwriting. Signed-off-by: Minchan Kim Cc: Nitin Gupta Cc: Jerome Marchand Tested-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d87c7e98fb11..ebfddd852272 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -441,14 +441,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - /* - * zram_slot_free_notify could miss free so that let's - * double check. - */ - if (unlikely(meta->table[index].handle || - zram_test_flag(meta, index, ZRAM_ZERO))) - zram_free_page(zram, index); - ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen, meta->compress_workmem); From deb0bdeb2f3d6b81d37fc778316dae46b6daab56 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:46:02 -0800 Subject: [PATCH 16/25] zram: use atomic operation for stat Some of fields in zram->stats are protected by zram->lock which is rather coarse-grained so let's use atomic operation without explict locking. This patch is ready for removing dependency of zram->lock in read path which is very coarse-grained rw_semaphore. Of course, this patch adds new atomic operation so it might make slow but my 12CPU test couldn't spot any regression. All gain/lose is marginal within stddev. iozone -t -T -l 12 -u 12 -r 16K -s 60M -I +Z -V 0 ==Initial write ==Initial write records: 50 records: 50 avg: 412875.17 avg: 415638.23 std: 38543.12 (9.34%) std: 36601.11 (8.81%) max: 521262.03 max: 502976.72 min: 343263.13 min: 351389.12 ==Rewrite ==Rewrite records: 50 records: 50 avg: 416640.34 avg: 397914.33 std: 60798.92 (14.59%) std: 46150.42 (11.60%) max: 543057.07 max: 522669.17 min: 304071.67 min: 316588.77 ==Read ==Read records: 50 records: 50 avg: 4147338.63 avg: 4070736.51 std: 179333.25 (4.32%) std: 223499.89 (5.49%) max: 4459295.28 max: 4539514.44 min: 3753057.53 min: 3444686.31 ==Re-read ==Re-read records: 50 records: 50 avg: 4096706.71 avg: 4117218.57 std: 229735.04 (5.61%) std: 171676.25 (4.17%) max: 4430012.09 max: 4459263.94 min: 2987217.80 min: 3666904.28 ==Reverse Read ==Reverse Read records: 50 records: 50 avg: 4062763.83 avg: 4078508.32 std: 186208.46 (4.58%) std: 172684.34 (4.23%) max: 4401358.78 max: 4424757.22 min: 3381625.00 min: 3679359.94 ==Stride read ==Stride read records: 50 records: 50 avg: 4094933.49 avg: 4082170.22 std: 185710.52 (4.54%) std: 196346.68 (4.81%) max: 4478241.25 max: 4460060.97 min: 3732593.23 min: 3584125.78 ==Random read ==Random read records: 50 records: 50 avg: 4031070.04 avg: 4074847.49 std: 192065.51 (4.76%) std: 206911.33 (5.08%) max: 4356931.16 max: 4399442.56 min: 3481619.62 min: 3548372.44 ==Mixed workload ==Mixed workload records: 50 records: 50 avg: 149925.73 avg: 149675.54 std: 7701.26 (5.14%) std: 6902.09 (4.61%) max: 191301.56 max: 175162.05 min: 133566.28 min: 137762.87 ==Random write ==Random write records: 50 records: 50 avg: 404050.11 avg: 393021.47 std: 58887.57 (14.57%) std: 42813.70 (10.89%) max: 601798.09 max: 524533.43 min: 325176.99 min: 313255.34 ==Pwrite ==Pwrite records: 50 records: 50 avg: 411217.70 avg: 411237.96 std: 43114.99 (10.48%) std: 33136.29 (8.06%) max: 530766.79 max: 471899.76 min: 320786.84 min: 317906.94 ==Pread ==Pread records: 50 records: 50 avg: 4154908.65 avg: 4087121.92 std: 151272.08 (3.64%) std: 219505.04 (5.37%) max: 4459478.12 max: 4435857.38 min: 3730512.41 min: 3101101.67 Signed-off-by: Minchan Kim Cc: Nitin Gupta Tested-by: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 20 ++++++++++---------- drivers/block/zram/zram_drv.h | 16 ++++++---------- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index ebfddd852272..9ab884999420 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -104,7 +104,7 @@ static ssize_t zero_pages_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", zram->stats.pages_zero); + return sprintf(buf, "%u\n", atomic_read(&zram->stats.pages_zero)); } static ssize_t orig_data_size_show(struct device *dev, @@ -113,7 +113,7 @@ static ssize_t orig_data_size_show(struct device *dev, struct zram *zram = dev_to_zram(dev); return sprintf(buf, "%llu\n", - (u64)(zram->stats.pages_stored) << PAGE_SHIFT); + (u64)(atomic_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } static ssize_t compr_data_size_show(struct device *dev, @@ -293,21 +293,21 @@ static void zram_free_page(struct zram *zram, size_t index) */ if (zram_test_flag(meta, index, ZRAM_ZERO)) { zram_clear_flag(meta, index, ZRAM_ZERO); - zram->stats.pages_zero--; + atomic_dec(&zram->stats.pages_zero); } return; } if (unlikely(size > max_zpage_size)) - zram->stats.bad_compress--; + atomic_dec(&zram->stats.bad_compress); zs_free(meta->mem_pool, handle); if (size <= PAGE_SIZE / 2) - zram->stats.good_compress--; + atomic_dec(&zram->stats.good_compress); atomic64_sub(meta->table[index].size, &zram->stats.compr_size); - zram->stats.pages_stored--; + atomic_dec(&zram->stats.pages_stored); meta->table[index].handle = 0; meta->table[index].size = 0; @@ -435,7 +435,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, /* Free memory associated with this sector now. */ zram_free_page(zram, index); - zram->stats.pages_zero++; + atomic_inc(&zram->stats.pages_zero); zram_set_flag(meta, index, ZRAM_ZERO); ret = 0; goto out; @@ -456,7 +456,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } if (unlikely(clen > max_zpage_size)) { - zram->stats.bad_compress++; + atomic_inc(&zram->stats.bad_compress); clen = PAGE_SIZE; src = NULL; if (is_partial_io(bvec)) @@ -493,9 +493,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, /* Update stats */ atomic64_add(clen, &zram->stats.compr_size); - zram->stats.pages_stored++; + atomic_inc(&zram->stats.pages_stored); if (clen <= PAGE_SIZE / 2) - zram->stats.good_compress++; + atomic_inc(&zram->stats.good_compress); out: if (is_partial_io(bvec)) diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 0e46953c08e9..81b0170de369 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -68,10 +68,6 @@ struct table { u8 flags; } __aligned(4); -/* - * All 64bit fields should only be manipulated by 64bit atomic accessors. - * All modifications to 32bit counter should be protected by zram->lock. - */ struct zram_stats { atomic64_t compr_size; /* compressed size of pages stored */ atomic64_t num_reads; /* failed + successful */ @@ -80,10 +76,10 @@ struct zram_stats { atomic64_t failed_writes; /* can happen when memory is too low */ atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ - u32 pages_zero; /* no. of zero filled pages */ - u32 pages_stored; /* no. of pages currently stored */ - u32 good_compress; /* % of pages with compression ratio<=50% */ - u32 bad_compress; /* % of pages with compression ratio>=75% */ + atomic_t pages_zero; /* no. of zero filled pages */ + atomic_t pages_stored; /* no. of pages currently stored */ + atomic_t good_compress; /* % of pages with compression ratio<=50% */ + atomic_t bad_compress; /* % of pages with compression ratio>=75% */ }; struct zram_meta { @@ -101,8 +97,8 @@ struct zram_slot_free { struct zram { struct zram_meta *meta; struct rw_semaphore lock; /* protect compression buffers, table, - * 32bit stat counters against concurrent - * notifications, reads and writes */ + * reads and writes + */ struct work_struct free_work; /* handle pending free request */ struct zram_slot_free *slot_free_rq; /* list head of free request */ From 92967471b67163bb1654e9b7fe99449ab70a4aaa Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:46:03 -0800 Subject: [PATCH 17/25] zram: introduce zram->tb_lock Currently, the zram table is protected by zram->lock but it's rather coarse-grained lock and it makes hard for scalibility. Let's use own rwlock instead of depending on zram->lock. This patch adds new locking so obviously, it would make slow but this patch is just prepartion for removing coarse-grained rw_semaphore(ie, zram->lock) which is hurdle about zram scalability. Final patch in this patchset series will remove the lock from read-path and change rw_semaphore with mutex in write path. With bonus, we could drop pending slot free mess in next patch. Signed-off-by: Minchan Kim Cc: Nitin Gupta Tested-by: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 26 +++++++++++++++++++++----- drivers/block/zram/zram_drv.h | 3 ++- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 9ab884999420..24e6426cf258 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -140,6 +140,7 @@ static ssize_t mem_used_total_show(struct device *dev, return sprintf(buf, "%llu\n", val); } +/* flag operations needs meta->tb_lock */ static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { @@ -228,6 +229,7 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) goto free_table; } + rwlock_init(&meta->tb_lock); return meta; free_table: @@ -280,6 +282,7 @@ static void handle_zero_page(struct bio_vec *bvec) flush_dcache_page(page); } +/* NOTE: caller should hold meta->tb_lock with write-side */ static void zram_free_page(struct zram *zram, size_t index) { struct zram_meta *meta = zram->meta; @@ -319,20 +322,26 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) size_t clen = PAGE_SIZE; unsigned char *cmem; struct zram_meta *meta = zram->meta; - unsigned long handle = meta->table[index].handle; + unsigned long handle; + u16 size; + + read_lock(&meta->tb_lock); + handle = meta->table[index].handle; + size = meta->table[index].size; if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { + read_unlock(&meta->tb_lock); clear_page(mem); return 0; } cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); - if (meta->table[index].size == PAGE_SIZE) + if (size == PAGE_SIZE) copy_page(mem, cmem); else - ret = lzo1x_decompress_safe(cmem, meta->table[index].size, - mem, &clen); + ret = lzo1x_decompress_safe(cmem, size, mem, &clen); zs_unmap_object(meta->mem_pool, handle); + read_unlock(&meta->tb_lock); /* Should NEVER happen. Return bio error if it does. */ if (unlikely(ret != LZO_E_OK)) { @@ -353,11 +362,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, struct zram_meta *meta = zram->meta; page = bvec->bv_page; + read_lock(&meta->tb_lock); if (unlikely(!meta->table[index].handle) || zram_test_flag(meta, index, ZRAM_ZERO)) { + read_unlock(&meta->tb_lock); handle_zero_page(bvec); return 0; } + read_unlock(&meta->tb_lock); if (is_partial_io(bvec)) /* Use a temporary buffer to decompress the page */ @@ -433,10 +445,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, if (page_zero_filled(uncmem)) { kunmap_atomic(user_mem); /* Free memory associated with this sector now. */ + write_lock(&zram->meta->tb_lock); zram_free_page(zram, index); + zram_set_flag(meta, index, ZRAM_ZERO); + write_unlock(&zram->meta->tb_lock); atomic_inc(&zram->stats.pages_zero); - zram_set_flag(meta, index, ZRAM_ZERO); ret = 0; goto out; } @@ -486,10 +500,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, * Free memory associated with this sector * before overwriting unused sectors. */ + write_lock(&zram->meta->tb_lock); zram_free_page(zram, index); meta->table[index].handle = handle; meta->table[index].size = clen; + write_unlock(&zram->meta->tb_lock); /* Update stats */ atomic64_add(clen, &zram->stats.compr_size); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 81b0170de369..c3f453f04974 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -83,6 +83,7 @@ struct zram_stats { }; struct zram_meta { + rwlock_t tb_lock; /* protect table */ void *compress_workmem; void *compress_buffer; struct table *table; @@ -96,7 +97,7 @@ struct zram_slot_free { struct zram { struct zram_meta *meta; - struct rw_semaphore lock; /* protect compression buffers, table, + struct rw_semaphore lock; /* protect compression buffers, * reads and writes */ From f614a9f48dedd2b80d1dc8bae8094842fcdb39dd Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:46:04 -0800 Subject: [PATCH 18/25] zram: remove workqueue for freeing removed pending slot Commit a0c516cbfc74 ("zram: don't grab mutex in zram_slot_free_noity") introduced free request pending code to avoid scheduling by mutex under spinlock and it was a mess which made code lenghty and increased overhead. Now, we don't need zram->lock any more to free slot so this patch reverts it and then, tb_lock should protect it. Signed-off-by: Minchan Kim Cc: Nitin Gupta Tested-by: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 54 ++++------------------------------- drivers/block/zram/zram_drv.h | 10 ------- 2 files changed, 6 insertions(+), 58 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 24e6426cf258..f1a3c958d84b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -522,20 +522,6 @@ out: return ret; } -static void handle_pending_slot_free(struct zram *zram) -{ - struct zram_slot_free *free_rq; - - spin_lock(&zram->slot_free_lock); - while (zram->slot_free_rq) { - free_rq = zram->slot_free_rq; - zram->slot_free_rq = free_rq->next; - zram_free_page(zram, free_rq->index); - kfree(free_rq); - } - spin_unlock(&zram->slot_free_lock); -} - static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio, int rw) { @@ -547,7 +533,6 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, up_read(&zram->lock); } else { down_write(&zram->lock); - handle_pending_slot_free(zram); ret = zram_bvec_write(zram, bvec, index, offset); up_write(&zram->lock); } @@ -566,8 +551,6 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) return; } - flush_work(&zram->free_work); - meta = zram->meta; zram->init_done = 0; @@ -769,40 +752,19 @@ error: bio_io_error(bio); } -static void zram_slot_free(struct work_struct *work) -{ - struct zram *zram; - - zram = container_of(work, struct zram, free_work); - down_write(&zram->lock); - handle_pending_slot_free(zram); - up_write(&zram->lock); -} - -static void add_slot_free(struct zram *zram, struct zram_slot_free *free_rq) -{ - spin_lock(&zram->slot_free_lock); - free_rq->next = zram->slot_free_rq; - zram->slot_free_rq = free_rq; - spin_unlock(&zram->slot_free_lock); -} - static void zram_slot_free_notify(struct block_device *bdev, unsigned long index) { struct zram *zram; - struct zram_slot_free *free_rq; + struct zram_meta *meta; zram = bdev->bd_disk->private_data; + meta = zram->meta; + + write_lock(&meta->tb_lock); + zram_free_page(zram, index); + write_unlock(&meta->tb_lock); atomic64_inc(&zram->stats.notify_free); - - free_rq = kmalloc(sizeof(struct zram_slot_free), GFP_ATOMIC); - if (!free_rq) - return; - - free_rq->index = index; - add_slot_free(zram, free_rq); - schedule_work(&zram->free_work); } static const struct block_device_operations zram_devops = { @@ -849,10 +811,6 @@ static int create_device(struct zram *zram, int device_id) init_rwsem(&zram->lock); init_rwsem(&zram->init_lock); - INIT_WORK(&zram->free_work, zram_slot_free); - spin_lock_init(&zram->slot_free_lock); - zram->slot_free_rq = NULL; - zram->queue = blk_alloc_queue(GFP_KERNEL); if (!zram->queue) { pr_err("Error allocating disk queue for device %d\n", diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index c3f453f04974..d876300da6c9 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -90,20 +90,11 @@ struct zram_meta { struct zs_pool *mem_pool; }; -struct zram_slot_free { - unsigned long index; - struct zram_slot_free *next; -}; - struct zram { struct zram_meta *meta; struct rw_semaphore lock; /* protect compression buffers, * reads and writes */ - - struct work_struct free_work; /* handle pending free request */ - struct zram_slot_free *slot_free_rq; /* list head of free request */ - struct request_queue *queue; struct gendisk *disk; int init_done; @@ -114,7 +105,6 @@ struct zram { * we can store in a disk. */ u64 disksize; /* bytes */ - spinlock_t slot_free_lock; struct zram_stats stats; }; From e46e33152eb82b8e2db7ffb3790a2a2653c34513 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:46:06 -0800 Subject: [PATCH 19/25] zram: remove zram->lock in read path and change it with mutex Finally, we separated zram->lock dependency from 32bit stat/ table handling so there is no reason to use rw_semaphore between read and write path so this patch removes the lock from read path totally and changes rw_semaphore with mutex. So, we could do old: read-read: OK read-write: NO write-write: NO Now: read-read: OK read-write: OK write-write: NO The below data proves mixed workload performs well 11 times and there is also enhance on write-write path because current rw-semaphore doesn't support SPIN_ON_OWNER. It's side effect but anyway good thing for us. Write-related tests perform better (from 61% to 1058%) but read path has good/bad(from -2.22% to 1.45%) but they are all marginal within stddev. CPU 12 iozone -t -T -l 12 -u 12 -r 16K -s 60M -I +Z -V 0 ==Initial write ==Initial write records: 10 records: 10 avg: 516189.16 avg: 839907.96 std: 22486.53 (4.36%) std: 47902.17 (5.70%) max: 546970.60 max: 909910.35 min: 481131.54 min: 751148.38 ==Rewrite ==Rewrite records: 10 records: 10 avg: 509527.98 avg: 1050156.37 std: 45799.94 (8.99%) std: 40695.44 (3.88%) max: 611574.27 max: 1111929.26 min: 443679.95 min: 980409.62 ==Read ==Read records: 10 records: 10 avg: 4408624.17 avg: 4472546.76 std: 281152.61 (6.38%) std: 163662.78 (3.66%) max: 4867888.66 max: 4727351.03 min: 4058347.69 min: 4126520.88 ==Re-read ==Re-read records: 10 records: 10 avg: 4462147.53 avg: 4363257.75 std: 283546.11 (6.35%) std: 247292.63 (5.67%) max: 4912894.44 max: 4677241.75 min: 4131386.50 min: 4035235.84 ==Reverse Read ==Reverse Read records: 10 records: 10 avg: 4565865.97 avg: 4485818.08 std: 313395.63 (6.86%) std: 248470.10 (5.54%) max: 5232749.16 max: 4789749.94 min: 4185809.62 min: 3963081.34 ==Stride read ==Stride read records: 10 records: 10 avg: 4515981.80 avg: 4418806.01 std: 211192.32 (4.68%) std: 212837.97 (4.82%) max: 4889287.28 max: 4686967.22 min: 4210362.00 min: 4083041.84 ==Random read ==Random read records: 10 records: 10 avg: 4410525.23 avg: 4387093.18 std: 236693.22 (5.37%) std: 235285.23 (5.36%) max: 4713698.47 max: 4669760.62 min: 4057163.62 min: 3952002.16 ==Mixed workload ==Mixed workload records: 10 records: 10 avg: 243234.25 avg: 2818677.27 std: 28505.07 (11.72%) std: 195569.70 (6.94%) max: 288905.23 max: 3126478.11 min: 212473.16 min: 2484150.69 ==Random write ==Random write records: 10 records: 10 avg: 555887.07 avg: 1053057.79 std: 70841.98 (12.74%) std: 35195.36 (3.34%) max: 683188.28 max: 1096125.73 min: 437299.57 min: 992481.93 ==Pwrite ==Pwrite records: 10 records: 10 avg: 501745.93 avg: 810363.09 std: 16373.54 (3.26%) std: 19245.01 (2.37%) max: 518724.52 max: 833359.70 min: 464208.73 min: 765501.87 ==Pread ==Pread records: 10 records: 10 avg: 4539894.60 avg: 4457680.58 std: 197094.66 (4.34%) std: 188965.60 (4.24%) max: 4877170.38 max: 4689905.53 min: 4226326.03 min: 4095739.72 Signed-off-by: Minchan Kim Cc: Nitin Gupta Tested-by: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 17 ++++++++--------- drivers/block/zram/zram_drv.h | 4 +--- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f1a3c958d84b..011e55d820b1 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -230,6 +230,7 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) } rwlock_init(&meta->tb_lock); + mutex_init(&meta->buffer_lock); return meta; free_table: @@ -412,6 +413,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, struct page *page; unsigned char *user_mem, *cmem, *src, *uncmem = NULL; struct zram_meta *meta = zram->meta; + bool locked = false; page = bvec->bv_page; src = meta->compress_buffer; @@ -431,6 +433,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } + mutex_lock(&meta->buffer_lock); + locked = true; user_mem = kmap_atomic(page); if (is_partial_io(bvec)) { @@ -457,7 +461,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen, meta->compress_workmem); - if (!is_partial_io(bvec)) { kunmap_atomic(user_mem); user_mem = NULL; @@ -514,6 +517,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, atomic_inc(&zram->stats.good_compress); out: + if (locked) + mutex_unlock(&meta->buffer_lock); if (is_partial_io(bvec)) kfree(uncmem); @@ -527,15 +532,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, { int ret; - if (rw == READ) { - down_read(&zram->lock); + if (rw == READ) ret = zram_bvec_read(zram, bvec, index, offset, bio); - up_read(&zram->lock); - } else { - down_write(&zram->lock); + else ret = zram_bvec_write(zram, bvec, index, offset); - up_write(&zram->lock); - } return ret; } @@ -808,7 +808,6 @@ static int create_device(struct zram *zram, int device_id) { int ret = -ENOMEM; - init_rwsem(&zram->lock); init_rwsem(&zram->init_lock); zram->queue = blk_alloc_queue(GFP_KERNEL); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index d876300da6c9..ad8aa35bae00 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -88,13 +88,11 @@ struct zram_meta { void *compress_buffer; struct table *table; struct zs_pool *mem_pool; + struct mutex buffer_lock; /* protect compress buffers */ }; struct zram { struct zram_meta *meta; - struct rw_semaphore lock; /* protect compression buffers, - * reads and writes - */ struct request_queue *queue; struct gendisk *disk; int init_done; From 8790c71a18e5d2d93532ae250bcf5eddbba729cd Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 30 Jan 2014 15:46:08 -0800 Subject: [PATCH 20/25] mm/mempolicy.c: fix mempolicy printing in numa_maps As a result of commit 5606e3877ad8 ("mm: numa: Migrate on reference policy"), /proc//numa_maps prints the mempolicy for any as "prefer:N" for the local node, N, of the process reading the file. This should only be printed when the mempolicy of is MPOL_PREFERRED for node N. If the process is actually only using the default mempolicy for local node allocation, make sure "default" is printed as expected. Signed-off-by: David Rientjes Reported-by: Robert Lippert Cc: Peter Zijlstra Acked-by: Mel Gorman Cc: Ingo Molnar Cc: [3.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 873de7e542bc..ae3c8f3595d4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2930,7 +2930,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; - if (pol && pol != &default_policy) { + if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { mode = pol->mode; flags = pol->flags; } From a03208652dad18232e9ec3432df69f9c19c856ec Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 30 Jan 2014 15:46:09 -0800 Subject: [PATCH 21/25] mm/slub.c: fix page->_count corruption (again) Commit abca7c496584 ("mm: fix slab->page _count corruption when using slub") notes that we can not _set_ a page->counters directly, except when using a real double-cmpxchg. Doing so can lose updates to ->_count. That is an absolute rule: You may not *set* page->counters except via a cmpxchg. Commit abca7c496584 fixed this for the folks who have the slub cmpxchg_double code turned off at compile time, but it left the bad case alone. It can still be reached, and the same bug triggered in two cases: 1. Turning on slub debugging at runtime, which is available on the distro kernels that I looked at. 2. On 64-bit CPUs with no CMPXCHG16B (some early AMD x86-64 cpus, evidently) There are at least 3 ways we could fix this: 1. Take all of the exising calls to cmpxchg_double_slab() and __cmpxchg_double_slab() and convert them to take an old, new and target 'struct page'. 2. Do (1), but with the newly-introduced 'slub_data'. 3. Do some magic inside the two cmpxchg...slab() functions to pull the counters out of new_counters and only set those fields in page->{inuse,frozen,objects}. I've done (2) as well, but it's a bunch more code. This patch is an attempt at (3). This was the most straightforward and foolproof way that I could think to do this. This would also technically allow us to get rid of the ugly #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) in 'struct page', but leaving it alone has the added benefit that 'counters' stays 'unsigned' instead of 'unsigned long', so all the copies that the slub code does stay a bit smaller. Signed-off-by: Dave Hansen Cc: Christoph Lameter Cc: Pekka Enberg Cc: Matt Mackall Cc: Pravin B Shelar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 545a170ebf9f..2b1a6970e46f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -355,6 +355,21 @@ static __always_inline void slab_unlock(struct page *page) __bit_spin_unlock(PG_locked, &page->flags); } +static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) +{ + struct page tmp; + tmp.counters = counters_new; + /* + * page->counters can cover frozen/inuse/objects as well + * as page->_count. If we assign to ->counters directly + * we run the risk of losing updates to page->_count, so + * be careful and only assign to the fields we need. + */ + page->frozen = tmp.frozen; + page->inuse = tmp.inuse; + page->objects = tmp.objects; +} + /* Interrupts must be disabled (for the fallback code to work right) */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, @@ -376,7 +391,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - page->counters = counters_new; + set_page_slub_counters(page, counters_new); slab_unlock(page); return 1; } @@ -415,7 +430,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - page->counters = counters_new; + set_page_slub_counters(page, counters_new); slab_unlock(page); local_irq_restore(flags); return 1; From 24f91eba18bbfdb27e71a1aae5b3a61b67fcd091 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Thu, 30 Jan 2014 15:46:10 -0800 Subject: [PATCH 22/25] mm: don't lose the SOFT_DIRTY flag on mprotect The SOFT_DIRTY bit shows that the content of memory was changed after a defined point in the past. mprotect() doesn't change the content of memory, so it must not change the SOFT_DIRTY bit. This bug causes a malfunction: on the first iteration all pages are dumped. On other iterations only pages with the SOFT_DIRTY bit are dumped. So if the SOFT_DIRTY bit is cleared from a page by mistake, the page is not dumped and its content will be restored incorrectly. This patch does nothing with _PAGE_SWP_SOFT_DIRTY, becase pte_modify() is called only for present pages. Fixes commit 0f8975ec4db2 ("mm: soft-dirty bits for user memory changes tracking"). Signed-off-by: Andrey Vagin Acked-by: Cyrill Gorcunov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Pavel Emelyanov Cc: Borislav Petkov Cc: Wen Congyang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index a83aa44bb1fb..1aa9ccd43223 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -121,7 +121,8 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ - _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ + _PAGE_SOFT_DIRTY) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) From 778c14affaf94a9e4953179d3e13a544ccce7707 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 30 Jan 2014 15:46:11 -0800 Subject: [PATCH 23/25] mm, oom: base root bonus on current usage A 3% of system memory bonus is sometimes too excessive in comparison to other processes. With commit a63d83f427fb ("oom: badness heuristic rewrite"), the OOM killer tries to avoid killing privileged tasks by subtracting 3% of overall memory (system or cgroup) from their per-task consumption. But as a result, all root tasks that consume less than 3% of overall memory are considered equal, and so it only takes 33+ privileged tasks pushing the system out of memory for the OOM killer to do something stupid and kill dhclient or other root-owned processes. For example, on a 32G machine it can't tell the difference between the 1M agetty and the 10G fork bomb member. The changelog describes this 3% boost as the equivalent to the global overcommit limit being 3% higher for privileged tasks, but this is not the same as discounting 3% of overall memory from _every privileged task individually_ during OOM selection. Replace the 3% of system memory bonus with a 3% of current memory usage bonus. By giving root tasks a bonus that is proportional to their actual size, they remain comparable even when relatively small. In the example above, the OOM killer will discount the 1M agetty's 256 badness points down to 179, and the 10G fork bomb's 262144 points down to 183500 points and make the right choice, instead of discounting both to 0 and killing agetty because it's first in the task list. Signed-off-by: David Rientjes Reported-by: Johannes Weiner Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 4 ++-- mm/oom_kill.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 31f76178c987..f00bee144add 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1386,8 +1386,8 @@ may allocate from based on an estimation of its current memory and swap use. For example, if a task is using all allowed memory, its badness score will be 1000. If it is using half of its allowed memory, its score will be 500. -There is an additional factor included in the badness score: root -processes are given 3% extra memory over other tasks. +There is an additional factor included in the badness score: the current memory +and swap usage is discounted by 3% for root processes. The amount of "allowed" memory depends on the context in which the oom killer was called. If it is due to the memory assigned to the allocating task's cpuset diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 37b1b1903fb2..3291e82d4352 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -178,7 +178,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * implementation used by LSMs. */ if (has_capability_noaudit(p, CAP_SYS_ADMIN)) - adj -= 30; + points -= (points * 3) / 100; /* Normalize to oom_score_adj units */ adj *= totalpages / 1000; From 46bf16c44b90791445975463da671521fc430cae Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 30 Jan 2014 15:46:12 -0800 Subject: [PATCH 24/25] Documentation/filesystems/vfs.txt: update file_operations documentation ->readv, ->writev and ->sendfile have been removed while ->show_fdinfo has been added. The documentation should reflect this. Signed-off-by: Richard Yao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/vfs.txt | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index deb48b5fd883..c53784c119c8 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -782,7 +782,7 @@ struct file_operations ---------------------- This describes how the VFS can manipulate an open file. As of kernel -3.5, the following members are defined: +3.12, the following members are defined: struct file_operations { struct module *owner; @@ -803,9 +803,6 @@ struct file_operations { int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); - ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *); - ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); - ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); @@ -814,6 +811,7 @@ struct file_operations { ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long arg, struct file_lock **); long (*fallocate)(struct file *, int mode, loff_t offset, loff_t len); + int (*show_fdinfo)(struct seq_file *m, struct file *f); }; Again, all methods are called without any locks being held, unless @@ -864,12 +862,6 @@ otherwise noted. lock: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW commands - readv: called by the readv(2) system call - - writev: called by the writev(2) system call - - sendfile: called by the sendfile(2) system call - get_unmapped_area: called by the mmap(2) system call check_flags: called by the fcntl(2) system call for F_SETFL command From 7c094fd698de2f333fa39b6da213f880d40b9bfe Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 30 Jan 2014 15:46:14 -0800 Subject: [PATCH 25/25] memcg: fix mutex not unlocked on memcg_create_kmem_cache fail path Commit 842e2873697e ("memcg: get rid of kmem_cache_dup()") introduced a mutex for memcg_create_kmem_cache() to protect the tmp_name buffer that holds the memcg name. It failed to unlock the mutex if this buffer could not be allocated. This patch fixes the issue by appropriately unlocking the mutex if the allocation fails. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: Glauber Costa Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 19d5d4274e22..53385cd4e6f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3400,7 +3400,7 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *s) { - struct kmem_cache *new; + struct kmem_cache *new = NULL; static char *tmp_name = NULL; static DEFINE_MUTEX(mutex); /* protects tmp_name */ @@ -3416,7 +3416,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, if (!tmp_name) { tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); if (!tmp_name) - return NULL; + goto out; } rcu_read_lock(); @@ -3426,12 +3426,11 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, (s->flags & ~SLAB_PANIC), s->ctor, s); - if (new) new->allocflags |= __GFP_KMEMCG; else new = s; - +out: mutex_unlock(&mutex); return new; }