From 685469e5bf9d31ccd9212be86f861a18fc213d05 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 Feb 2018 12:10:41 -0800 Subject: [PATCH 1/8] percpu: add Dennis Zhou as a percpu co-maintainer Dennis rewrote the percpu area allocator some months ago, understands most of the code base and has been responsive with the bug reports and questions. Let's add him as a co-maintainer. Signed-off-by: Tejun Heo Acked-by: Christopher Lameter --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 3bdc260e36b7..f384e22546d3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10832,6 +10832,7 @@ F: drivers/platform/x86/peaq-wmi.c PER-CPU MEMORY ALLOCATOR M: Tejun Heo M: Christoph Lameter +M: Dennis Zhou T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu.git S: Maintained F: include/linux/percpu*.h From 15d9f3d116c02a485441d758d9ca0a2e4f3b30be Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Thu, 15 Feb 2018 10:08:14 -0600 Subject: [PATCH 2/8] percpu: match chunk allocator declarations with definitions At some point the function declaration parameters got out of sync with the function definitions in percpu-vm.c and percpu-km.c. This patch makes them match again. Signed-off-by: Dennis Zhou Acked-by: Christoph Lameter Signed-off-by: Tejun Heo --- mm/percpu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 50e7fdf84055..e1ea41002173 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1277,8 +1277,10 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, * pcpu_addr_to_page - translate address to physical address * pcpu_verify_alloc_info - check alloc_info is acceptable during init */ -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size); -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size); +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end); +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end); static struct pcpu_chunk *pcpu_create_chunk(void); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); From 47504ee04b9241548ae2c28be7d0b01cff3b7aa6 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 16 Feb 2018 12:07:19 -0600 Subject: [PATCH 3/8] percpu: add __GFP_NORETRY semantics to the percpu balancing path Percpu memory using the vmalloc area based chunk allocator lazily populates chunks by first requesting the full virtual address space required for the chunk and subsequently adding pages as allocations come through. To ensure atomic allocations can succeed, a workqueue item is used to maintain a minimum number of empty pages. In certain scenarios, such as reported in [1], it is possible that physical memory becomes quite scarce which can result in either a rather long time spent trying to find free pages or worse, a kernel panic. This patch adds support for __GFP_NORETRY and __GFP_NOWARN passing them through to the underlying allocators. This should prevent any unnecessary panics potentially caused by the workqueue item. The passing of gfp around is as additional flags rather than a full set of flags. The next patch will change these to caller passed semantics. V2: Added const modifier to gfp flags in the balance path. Removed an extra whitespace. 
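To make the "additional flags" semantics concrete, here is a sketch (illustrative only; it simply mirrors the pcpu_mem_zalloc() hunk further below): the caller-supplied gfp carries only extra hints and is OR'd on top of the allocator's own GFP_KERNEL.

static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
	/* gfp carries only extra hints, e.g. __GFP_NORETRY | __GFP_NOWARN */
	if (size <= PAGE_SIZE)
		return kzalloc(size, gfp | GFP_KERNEL);

	/* vzalloc() has no gfp-taking variant, so use __vmalloc() directly */
	return __vmalloc(size, gfp | GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
}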
[1] https://lkml.org/lkml/2018/2/12/551 Signed-off-by: Dennis Zhou Suggested-by: Daniel Borkmann Reported-by: syzbot+adb03f3f0bb57ce3acda@syzkaller.appspotmail.com Acked-by: Christoph Lameter Signed-off-by: Tejun Heo --- mm/percpu-km.c | 8 ++++---- mm/percpu-vm.c | 18 +++++++++++------- mm/percpu.c | 44 +++++++++++++++++++++++++++----------------- 3 files changed, 42 insertions(+), 28 deletions(-) diff --git a/mm/percpu-km.c b/mm/percpu-km.c index d2a76642c4ae..0d88d7bd5706 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -34,7 +34,7 @@ #include static int pcpu_populate_chunk(struct pcpu_chunk *chunk, - int page_start, int page_end) + int page_start, int page_end, gfp_t gfp) { return 0; } @@ -45,18 +45,18 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, /* nada */ } -static struct pcpu_chunk *pcpu_create_chunk(void) +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) { const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; struct pcpu_chunk *chunk; struct page *pages; int i; - chunk = pcpu_alloc_chunk(); + chunk = pcpu_alloc_chunk(gfp); if (!chunk) return NULL; - pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages)); + pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages)); if (!pages) { pcpu_free_chunk(chunk); return NULL; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 9158e5a81391..0af71eb2fff0 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void) lockdep_assert_held(&pcpu_alloc_mutex); if (!pages) - pages = pcpu_mem_zalloc(pages_size); + pages = pcpu_mem_zalloc(pages_size, 0); return pages; } @@ -73,18 +73,21 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() * @page_start: page index of the first page to be allocated * @page_end: page index of the last page to be allocated + 1 + * @gfp: allocation flags passed to the underlying allocator * * Allocate pages [@page_start,@page_end) into @pages for all units. * The allocation is for @chunk. Percpu core doesn't care about the * content of @pages and will pass it verbatim to pcpu_map_pages(). */ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, - struct page **pages, int page_start, int page_end) + struct page **pages, int page_start, int page_end, + gfp_t gfp) { - const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM; unsigned int cpu, tcpu; int i; + gfp |= GFP_KERNEL | __GFP_HIGHMEM; + for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; @@ -262,6 +265,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, * @chunk: chunk of interest * @page_start: the start page * @page_end: the end page + * @gfp: allocation flags passed to the underlying memory allocator * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. @@ -270,7 +274,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, * pcpu_alloc_mutex, does GFP_KERNEL allocation. 
*/ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, - int page_start, int page_end) + int page_start, int page_end, gfp_t gfp) { struct page **pages; @@ -278,7 +282,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, if (!pages) return -ENOMEM; - if (pcpu_alloc_pages(chunk, pages, page_start, page_end)) + if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp)) return -ENOMEM; if (pcpu_map_pages(chunk, pages, page_start, page_end)) { @@ -325,12 +329,12 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, pcpu_free_pages(chunk, pages, page_start, page_end); } -static struct pcpu_chunk *pcpu_create_chunk(void) +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; struct vm_struct **vms; - chunk = pcpu_alloc_chunk(); + chunk = pcpu_alloc_chunk(gfp); if (!chunk) return NULL; diff --git a/mm/percpu.c b/mm/percpu.c index e1ea41002173..f97443d488a8 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -447,10 +447,12 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, /** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate + * @gfp: allocation flags * * Allocate @size bytes. If @size is smaller than PAGE_SIZE, - * kzalloc() is used; otherwise, vzalloc() is used. The returned - * memory is always zeroed. + * kzalloc() is used; otherwise, the equivalent of vzalloc() is used. + * This is to facilitate passing through whitelisted flags. The + * returned memory is always zeroed. * * CONTEXT: * Does GFP_KERNEL allocation. @@ -458,15 +460,16 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ -static void *pcpu_mem_zalloc(size_t size) +static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) { if (WARN_ON_ONCE(!slab_is_available())) return NULL; if (size <= PAGE_SIZE) - return kzalloc(size, GFP_KERNEL); + return kzalloc(size, gfp | GFP_KERNEL); else - return vzalloc(size); + return __vmalloc(size, gfp | GFP_KERNEL | __GFP_ZERO, + PAGE_KERNEL); } /** @@ -1154,12 +1157,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, return chunk; } -static struct pcpu_chunk *pcpu_alloc_chunk(void) +static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; int region_bits; - chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size); + chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp); if (!chunk) return NULL; @@ -1168,17 +1171,17 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) region_bits = pcpu_chunk_map_bits(chunk); chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) * - sizeof(chunk->alloc_map[0])); + sizeof(chunk->alloc_map[0]), gfp); if (!chunk->alloc_map) goto alloc_map_fail; chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) * - sizeof(chunk->bound_map[0])); + sizeof(chunk->bound_map[0]), gfp); if (!chunk->bound_map) goto bound_map_fail; chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) * - sizeof(chunk->md_blocks[0])); + sizeof(chunk->md_blocks[0]), gfp); if (!chunk->md_blocks) goto md_blocks_fail; @@ -1278,10 +1281,10 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, * pcpu_verify_alloc_info - check alloc_info is acceptable during init */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, - int page_start, int page_end); + int page_start, int page_end, gfp_t gfp); static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end); -static struct pcpu_chunk *pcpu_create_chunk(void); +static 
struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); @@ -1423,7 +1426,7 @@ restart: } if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { - chunk = pcpu_create_chunk(); + chunk = pcpu_create_chunk(0); if (!chunk) { err = "failed to allocate new chunk"; goto fail; @@ -1452,7 +1455,7 @@ area_found: page_start, page_end) { WARN_ON(chunk->immutable); - ret = pcpu_populate_chunk(chunk, rs, re); + ret = pcpu_populate_chunk(chunk, rs, re, 0); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { @@ -1563,10 +1566,17 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * - * Reclaim all fully free chunks except for the first one. + * Reclaim all fully free chunks except for the first one. This is also + * responsible for maintaining the pool of empty populated pages. However, + * it is possible that this is called when physical memory is scarce causing + * OOM killer to be triggered. We should avoid doing so until an actual + * allocation causes the failure as it is possible that requests can be + * serviced from already backed regions. */ static void pcpu_balance_workfn(struct work_struct *work) { + /* gfp flags passed to underlying allocators */ + const gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; LIST_HEAD(to_free); struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; @@ -1647,7 +1657,7 @@ retry_pop: chunk->nr_pages) { int nr = min(re - rs, nr_to_pop); - ret = pcpu_populate_chunk(chunk, rs, rs + nr); + ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp); if (!ret) { nr_to_pop -= nr; spin_lock_irq(&pcpu_lock); @@ -1664,7 +1674,7 @@ retry_pop: if (nr_to_pop) { /* ran out of chunks to populate, create a new one and retry */ - chunk = pcpu_create_chunk(); + chunk = pcpu_create_chunk(gfp); if (chunk) { spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); From 554fef1c39ee148623a496e04569dabb11463406 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 16 Feb 2018 12:09:58 -0600 Subject: [PATCH 4/8] percpu: allow select gfp to be passed to underlying allocators The prior patch added support for passing gfp flags through to the underlying allocators. This patch allows users to pass along gfp flags (currently only __GFP_NORETRY and __GFP_NOWARN) to the underlying allocators. This should allow users to decide if they are ok with failing allocations recovering in a more graceful way. Additionally, gfp passing was done as additional flags in the previous patch. Instead, change this to caller passed semantics. GFP_KERNEL is also removed as the default flag. It continues to be used for internally caused underlying percpu allocations. V2: Removed gfp_percpu_mask in favor of doing it inline. Removed GFP_KERNEL as a default flag for __alloc_percpu_gfp. 
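For example (an illustrative sketch only; struct foo_stats and the error handling below are made up), a caller that prefers a fast, quiet failure over entering reclaim or the OOM killer can now ask for it directly:

	struct foo_stats {
		u64 hits;
		u64 misses;
	};

	struct foo_stats __percpu *stats;

	stats = alloc_percpu_gfp(struct foo_stats,
				 GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
	if (!stats)
		return -ENOMEM;	/* caller falls back or degrades gracefully */

	/* ... later: free_percpu(stats); */

Only GFP_KERNEL, __GFP_NORETRY and __GFP_NOWARN are whitelisted through to the backing page/vmalloc allocations; the remaining bits keep influencing only percpu-internal behaviour (e.g. a request without GFP_KERNEL is treated as atomic).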
Signed-off-by: Dennis Zhou Suggested-by: Daniel Borkmann Acked-by: Christoph Lameter Signed-off-by: Tejun Heo --- mm/percpu-km.c | 2 +- mm/percpu-vm.c | 4 ++-- mm/percpu.c | 16 +++++++--------- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 0d88d7bd5706..38de70ab1a0d 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -56,7 +56,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) if (!chunk) return NULL; - pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages)); + pages = alloc_pages(gfp, order_base_2(nr_pages)); if (!pages) { pcpu_free_chunk(chunk); return NULL; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 0af71eb2fff0..d8078de912de 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void) lockdep_assert_held(&pcpu_alloc_mutex); if (!pages) - pages = pcpu_mem_zalloc(pages_size, 0); + pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL); return pages; } @@ -86,7 +86,7 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, unsigned int cpu, tcpu; int i; - gfp |= GFP_KERNEL | __GFP_HIGHMEM; + gfp |= __GFP_HIGHMEM; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { diff --git a/mm/percpu.c b/mm/percpu.c index f97443d488a8..fa3f854634a1 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -454,9 +454,6 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, * This is to facilitate passing through whitelisted flags. The * returned memory is always zeroed. * - * CONTEXT: - * Does GFP_KERNEL allocation. - * * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ @@ -466,10 +463,9 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) return NULL; if (size <= PAGE_SIZE) - return kzalloc(size, gfp | GFP_KERNEL); + return kzalloc(size, gfp); else - return __vmalloc(size, gfp | GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL); } /** @@ -1344,6 +1340,8 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, gfp_t gfp) { + /* whitelisted flags that can be passed to the backing allocators */ + gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; bool do_warn = !(gfp & __GFP_NOWARN); static int warn_limit = 10; @@ -1426,7 +1424,7 @@ restart: } if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { - chunk = pcpu_create_chunk(0); + chunk = pcpu_create_chunk(pcpu_gfp); if (!chunk) { err = "failed to allocate new chunk"; goto fail; @@ -1455,7 +1453,7 @@ area_found: page_start, page_end) { WARN_ON(chunk->immutable); - ret = pcpu_populate_chunk(chunk, rs, re, 0); + ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { @@ -1576,7 +1574,7 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) static void pcpu_balance_workfn(struct work_struct *work) { /* gfp flags passed to underlying allocators */ - const gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; + const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; LIST_HEAD(to_free); struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; From accd4f36a7d11c2d54544007eb65e10604dcf2f5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 23 Feb 2018 08:12:42 -0800 Subject: [PATCH 5/8] percpu: add a schedule point in pcpu_balance_workfn() When a large BPF percpu map is destroyed, I have seen 
pcpu_balance_workfn() holding the CPU for hundreds of milliseconds. On a KASAN config with 112 hyperthreads, the average time to destroy a chunk is ~4 ms. [ 2489.841376] destroy chunk 1 in 4148689 ns ... [ 2490.093428] destroy chunk 32 in 4072718 ns Add a cond_resched() after each chunk destruction so that other work can make progress. Signed-off-by: Eric Dumazet Signed-off-by: Tejun Heo --- mm/percpu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/percpu.c b/mm/percpu.c index fa3f854634a1..36e7b65ba6cf 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1610,6 +1610,7 @@ static void pcpu_balance_workfn(struct work_struct *work) spin_unlock_irq(&pcpu_lock); } pcpu_destroy_chunk(chunk); + cond_resched(); } /* From 71546d100422bcc2c543dadeb9328728997cd23a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Mar 2018 08:27:26 -0700 Subject: [PATCH 6/8] percpu: include linux/sched.h for cond_resched() The microblaze build broke due to a missing declaration for the cond_resched() invocation added recently. Let's include linux/sched.h explicitly. Signed-off-by: Tejun Heo Reported-by: kbuild test robot --- mm/percpu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/percpu.c b/mm/percpu.c index 36e7b65ba6cf..15a398c00791 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -80,6 +80,7 @@ #include #include #include +#include <linux/sched.h> #include #include From f52ba1fef7b92e74d58efef8eae7b6f48c6d218d Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Mar 2018 18:32:10 +0300 Subject: [PATCH 7/8] mm: Allow to kill tasks doing pcpu_alloc() and waiting for pcpu_balance_workfn() In case of a memory deficit and a low number of free percpu pages, pcpu_balance_workfn() takes pcpu_alloc_mutex for a long time (as it makes memory allocations itself and waits for memory reclaim). If tasks doing pcpu_alloc() are chosen by the OOM killer, they can't exit because they are waiting for the mutex. The patch makes pcpu_alloc() care about the killing signal and use mutex_lock_killable() when the GFP flags allow it. This guarantees that a task does not miss SIGKILL from the OOM killer. Signed-off-by: Kirill Tkhai Signed-off-by: Tejun Heo --- mm/percpu.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 15a398c00791..9297098519a6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1373,8 +1373,17 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, return NULL; } - if (!is_atomic) - mutex_lock(&pcpu_alloc_mutex); + if (!is_atomic) { + /* + * pcpu_balance_workfn() allocates memory under this mutex, + * and it may wait for memory reclaim. Allow current task + * to become OOM victim, in case of memory pressure. + */ + if (gfp & __GFP_NOFAIL) + mutex_lock(&pcpu_alloc_mutex); + else if (mutex_lock_killable(&pcpu_alloc_mutex)) + return NULL; + } spin_lock_irqsave(&pcpu_lock, flags); From b3a5d111994450909158929560906f2c1c6c1d85 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Mar 2018 12:45:12 -0700 Subject: [PATCH 8/8] percpu_ref: Update doc to dissuade users from depending on internal RCU grace periods percpu_ref internally uses sched-RCU to implement the percpu -> atomic mode switching and the documentation suggested that this could be depended upon. This doesn't seem like a good idea. * percpu_ref uses sched-RCU, which has different grace periods than regular RCU. Users may combine percpu_ref with regular RCU usage and incorrectly believe that regular RCU grace periods are performed by percpu_ref. This can lead to, for example, use-after-free due to premature freeing. * percpu_ref has a grace period when switching from percpu to atomic mode.
It doesn't have one between the last put and release. This distinction is subtle and can lead to surprising bugs. * percpu_ref allows starting in and switching to atomic mode manually for debugging and other purposes. This means that there may not be any grace periods from kill to release. This patch makes it clear that the grace periods are an internal implementation detail of percpu_ref and can't be depended upon by the users. Signed-off-by: Tejun Heo Cc: Kent Overstreet Cc: Linus Torvalds Signed-off-by: Tejun Heo --- include/linux/percpu-refcount.h | 18 ++++++++++++------ lib/percpu-refcount.c | 2 ++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 864d167a1073..009cdf3d65b6 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -30,10 +30,14 @@ * calls io_destroy() or the process exits. * * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it - * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove - * the kioctx from the proccess's list of kioctxs - after that, there can't be - * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop - * the initial ref with percpu_ref_put(). + * removes the kioctx from the process's table of kioctxs and kills percpu_ref. + * After that, there can't be any new users of the kioctx (from lookup_ioctx()) + * and it's then safe to drop the initial ref with percpu_ref_put(). + * + * Note that the free path, free_ioctx(), needs to go through explicit call_rcu() + * to synchronize with RCU protected lookup_ioctx(). percpu_ref operations don't + * imply RCU grace periods of any kind and if a user wants to combine percpu_ref + * with RCU protection, it must be done explicitly. * * Code that does a two stage shutdown like this often needs some kind of * explicit synchronization to ensure the initial refcount can only be dropped @@ -113,8 +117,10 @@ void percpu_ref_reinit(struct percpu_ref *ref); * Must be used to drop the initial ref on a percpu refcount; must be called * precisely once before shutdown. * - * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the - * percpu counters and dropping the initial ref. + * Switches @ref into atomic mode before gathering up the percpu counters + * and dropping the initial ref. + * + * There are no implied RCU grace periods between kill and release. */ static inline void percpu_ref_kill(struct percpu_ref *ref) { diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 30e7dd88148b..9f96fa7bc000 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -322,6 +322,8 @@ EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu); * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is in the * process of switching to atomic mode by percpu_ref_switch_to_atomic(). + * + * There are no implied RCU grace periods between kill and release. */ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill)
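As a usage note following the updated documentation (a minimal sketch with made-up names such as struct foo and foo_lookup(); not taken from any existing driver): if lookups of an object are protected by regular RCU, the release callback must provide its own grace period, because percpu_ref does not imply one between kill and release.

struct foo {
	struct percpu_ref ref;
	struct rcu_head rcu;
	/* ... object payload ... */
};

static void foo_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct foo, rcu));
}

/* called by percpu_ref once the last reference is dropped */
static void foo_release(struct percpu_ref *ref)
{
	struct foo *foo = container_of(ref, struct foo, ref);

	/*
	 * RCU readers that found @foo via foo_lookup() may still hold a
	 * pointer to it; defer the actual free with an explicit call_rcu()
	 * rather than relying on any grace period inside percpu_ref.
	 */
	call_rcu(&foo->rcu, foo_free_rcu);
}

/* init:     percpu_ref_init(&foo->ref, foo_release, 0, GFP_KERNEL);      */
/* teardown: unlink @foo from the lookup structure, then                  */
/*           percpu_ref_kill(&foo->ref);                                  */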