From cd6a884d09558014d0df5400d78b5c7f40f0a62e Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Sat, 15 Jul 2017 22:23:06 -0400 Subject: [PATCH 01/27] percpu: pcpu-stats change void buffer to int buffer Changes the use of a void buffer to an int buffer for clarity. Signed-off-by: Dennis Zhou Signed-off-by: Tejun Heo --- mm/percpu-stats.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index 03524a56eeff..0d810443785f 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -49,7 +49,7 @@ static int find_max_map_used(void) * the beginning of the chunk to the last allocation. */ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, - void *buffer) + int *buffer) { int i, s_index, last_alloc, alloc_sign, as_len; int *alloc_sizes, *p; @@ -113,7 +113,7 @@ static int percpu_stats_show(struct seq_file *m, void *v) { struct pcpu_chunk *chunk; int slot, max_map_used; - void *buffer; + int *buffer; alloc_buffer: spin_lock_irq(&pcpu_lock); From 02459164a27e6cf9577fd12fc3ebed78f8fa767f Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Sat, 15 Jul 2017 22:23:07 -0400 Subject: [PATCH 02/27] percpu: change the format for percpu_stats output This makes the debugfs output for percpu_stats a little easier to read by changing the spacing of the output to be consistent. Signed-off-by: Dennis Zhou Signed-off-by: Tejun Heo --- mm/percpu-stats.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index 0d810443785f..fa0f5de26c68 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -18,7 +18,7 @@ #include "percpu-internal.h" #define P(X, Y) \ - seq_printf(m, " %-24s: %8lld\n", X, (long long int)Y) + seq_printf(m, " %-20s: %12lld\n", X, (long long int)Y) struct percpu_stats pcpu_stats; struct pcpu_alloc_info pcpu_stats_ai; @@ -134,7 +134,7 @@ alloc_buffer: } #define PL(X) \ - seq_printf(m, " %-24s: %8lld\n", #X, (long long int)pcpu_stats_ai.X) + seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X) seq_printf(m, "Percpu Memory Statistics\n" @@ -151,7 +151,7 @@ alloc_buffer: #undef PL #define PU(X) \ - seq_printf(m, " %-18s: %14llu\n", #X, (unsigned long long)pcpu_stats.X) + seq_printf(m, " %-20s: %12llu\n", #X, (unsigned long long)pcpu_stats.X) seq_printf(m, "Global Stats:\n" From 6b9b6f39946c4a7a98fac31ee91bac43cd2b73a9 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Sat, 15 Jul 2017 22:23:08 -0400 Subject: [PATCH 03/27] percpu: expose pcpu_nr_empty_pop_pages in pcpu_stats Percpu memory holds a minimum threshold of pages that are populated in order to serve atomic percpu memory requests. This change makes it easier to verify that there are a minimum number of populated pages lying around. 
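The buffer that patch 01 retypes holds a snapshot of a chunk's area map: an array of ints containing byte offsets, where bit 0 of each entry flags whether the area starting at that offset is allocated, which is why an int pointer is the natural type. A minimal user-space sketch of decoding that encoding, with invented offsets and not the kernel's chunk_map_stats() itself:

#include <stdio.h>

int main(void)
{
    /* offsets 0, 16 and 64 plus an end marker; areas at 0 and 64 are in use */
    int map[] = { 0 | 1, 16, 64 | 1, 1024 | 1 };
    int nr_areas = 3;

    for (int i = 0; i < nr_areas; i++) {
        int start = map[i] & ~1;
        int size = (map[i + 1] & ~1) - start;

        printf("area @%4d: %4d bytes, %s\n", start, size,
               (map[i] & 1) ? "allocated" : "free");
    }
    return 0;
}

The size of an area falls out as the difference between consecutive offsets with the flag bit masked off, which is the arithmetic chunk_map_stats() performs on this buffer.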
Signed-off-by: Dennis Zhou Signed-off-by: Tejun Heo --- mm/percpu-internal.h | 1 + mm/percpu-stats.c | 1 + mm/percpu.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index cd2442e13d8f..c9158a48ca05 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -36,6 +36,7 @@ extern spinlock_t pcpu_lock; extern struct list_head *pcpu_slot; extern int pcpu_nr_slots; +extern int pcpu_nr_empty_pop_pages; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index fa0f5de26c68..44e561d3ab47 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -164,6 +164,7 @@ alloc_buffer: PU(nr_max_chunks); PU(min_alloc_size); PU(max_alloc_size); + P("empty_pop_pages", pcpu_nr_empty_pop_pages); seq_putc(m, '\n'); #undef PU diff --git a/mm/percpu.c b/mm/percpu.c index bd4130a69bbc..9ec5fd498f1d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -160,7 +160,7 @@ static LIST_HEAD(pcpu_map_extend_chunks); * The number of empty populated pages, protected by pcpu_lock. The * reserved chunk doesn't contribute to the count. */ -static int pcpu_nr_empty_pop_pages; +int pcpu_nr_empty_pop_pages; /* * Balance work is used to populate or destroy chunks asynchronously. We From 9c01516278ef872190fdda95aafaa8effdf6649a Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Sat, 15 Jul 2017 22:23:09 -0400 Subject: [PATCH 04/27] percpu: update the header comment and pcpu_build_alloc_info comments The header comment for percpu memory is a little hard to parse and is not super clear about how the first chunk is managed. This adds a little more clarity to the situation. There is also quite a bit of tricky logic in the pcpu_build_alloc_info. This adds a restructure of a comment to add a little more information. Unfortunately, you will still have to piece together a handful of other comments too, but should help direct you to the meaningful comments. Signed-off-by: Dennis Zhou Signed-off-by: Tejun Heo --- mm/percpu.c | 56 +++++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 9ec5fd498f1d..29244fb076ba 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -4,36 +4,35 @@ * Copyright (C) 2009 SUSE Linux Products GmbH * Copyright (C) 2009 Tejun Heo * - * This file is released under the GPLv2. + * This file is released under the GPLv2 license. * - * This is percpu allocator which can handle both static and dynamic - * areas. Percpu areas are allocated in chunks. Each chunk is - * consisted of boot-time determined number of units and the first - * chunk is used for static percpu variables in the kernel image - * (special boot time alloc/init handling necessary as these areas - * need to be brought up before allocation services are running). - * Unit grows as necessary and all units grow or shrink in unison. - * When a chunk is filled up, another chunk is allocated. + * The percpu allocator handles both static and dynamic areas. Percpu + * areas are allocated in chunks which are divided into units. There is + * a 1-to-1 mapping for units to possible cpus. These units are grouped + * based on NUMA properties of the machine. * * c0 c1 c2 * ------------------- ------------------- ------------ * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u * ------------------- ...... ------------------- .... ------------ * - * Allocation is done in offset-size areas of single unit space. 
Ie, - * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, - * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to - * cpus. On NUMA, the mapping can be non-linear and even sparse. - * Percpu access can be done by configuring percpu base registers - * according to cpu to unit mapping and pcpu_unit_size. + * Allocation is done by offsets into a unit's address space. Ie., an + * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0, + * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear + * and even sparse. Access is handled by configuring percpu base + * registers according to the cpu to unit mappings and offsetting the + * base address using pcpu_unit_size. * - * There are usually many small percpu allocations many of them being - * as small as 4 bytes. The allocator organizes chunks into lists - * according to free size and tries to allocate from the fullest one. - * Each chunk keeps the maximum contiguous area size hint which is - * guaranteed to be equal to or larger than the maximum contiguous - * area in the chunk. This helps the allocator not to iterate the - * chunk maps unnecessarily. + * There is special consideration for the first chunk which must handle + * the static percpu variables in the kernel image as allocation services + * are not online yet. In short, the first chunk is structure like so: + * + * + * + * The static data is copied from the original section managed by the + * linker. The reserved section, if non-zero, primarily manages static + * percpu variables from kernel modules. Finally, the dynamic section + * takes care of normal allocations. * * Allocation state in each chunk is kept using an array of integers * on chunk->map. A positive value in the map represents a free @@ -43,6 +42,12 @@ * Chunks can be determined from the address using the index field * in the page struct. The index field contains a pointer to the chunk. * + * These chunks are organized into lists according to free_size and + * tries to allocate from the fullest chunk first. Each chunk maintains + * a maximum contiguous area size hint which is guaranteed to be equal + * to or larger than the maximum contiguous area in the chunk. This + * helps prevent the allocator from iterating over chunks unnecessarily. + * * To use this allocator, arch code should do the following: * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate @@ -1842,6 +1847,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( */ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + /* determine the maximum # of units that can fit in an allocation */ alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; while (alloc_size % upa || (offset_in_page(alloc_size / upa))) @@ -1868,9 +1874,9 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( } /* - * Expand unit size until address space usage goes over 75% - * and then as much as possible without using more address - * space. + * Wasted space is caused by a ratio imbalance of upa to group_cnt. + * Expand the unit_size until we use >= 75% of the units allocated. + * Related to atom_size, which could be much larger than the unit_size. 
*/ last_allocs = INT_MAX; for (upa = max_upa; upa; upa--) { From fb29a2cc6b06e057ab7153a85190a96f66498ff4 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:01:58 -0400 Subject: [PATCH 05/27] percpu: setup_first_chunk enforce dynamic region must exist The first chunk is handled as a special case as it is composed of the static, reserved, and dynamic regions. The code handles each case individually. The next several patches will merge these code paths and lay the foundation for the bitmap allocator. This patch modifies logic to enforce that a dynamic region exists and changes the area map to account for that. This brings the logic closer to the dynamic chunk's init logic. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 29244fb076ba..3602d41a7dfe 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1598,6 +1598,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); + PCPU_SETUP_BUG_ON(!ai->dyn_size); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ @@ -1700,14 +1701,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ } - schunk->contig_hint = schunk->free_size; + schunk->contig_hint = schunk->free_size; schunk->map[0] = 1; schunk->map[1] = ai->static_size; - schunk->map_used = 1; - if (schunk->free_size) - schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size; - schunk->map[schunk->map_used] |= 1; + schunk->map[2] = (ai->static_size + schunk->free_size) | 1; + schunk->map_used = 2; schunk->has_reserved = true; /* init dynamic chunk if necessary */ From e22667056644086ca4a5b2986eb4fbf32e03ebab Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:01:59 -0400 Subject: [PATCH 06/27] percpu: introduce start_offset to pcpu_chunk The reserved chunk arithmetic uses a global variable pcpu_reserved_chunk_limit that is set in the first chunk init code to hide a portion of the area map. The bitmap allocator to come will eventually move the base_addr up and require both the reserved chunk and static chunk to maintain this offset. pcpu_reserved_chunk_limit is removed and start_offset is added. The first chunk that is circulated and is pcpu_first_chunk serves the dynamic region, the region following the reserved region. The reserved chunk address check will temporarily use the first chunk to identify its address range. A following patch will increase the base_addr and remove this. If there is no reserved chunk, this will check the static region and return false because those values should never be passed into the allocator. Lastly, when linking in the first chunk, make sure to count the right free region for the number of empty populated pages. 
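Once start_offset is recorded per chunk, the reserved-area check described above reduces to a range comparison against the first start_offset bytes of the chunk that serves the dynamic region. A user-space sketch of that comparison, with invented names (chunk_region, addr_below_start_offset) and sizes rather than the kernel's structures:

#include <stdbool.h>
#include <stdio.h>

struct chunk_region {
    char *base_addr;     /* base address shared with the static region */
    int start_offset;    /* bytes overlapping the preceding region(s) */
};

/* true iff addr falls in the static + reserved space below start_offset */
static bool addr_below_start_offset(const struct chunk_region *c, const char *addr)
{
    return addr >= c->base_addr && addr < c->base_addr + c->start_offset;
}

int main(void)
{
    static char unit[64 * 1024];                   /* stand-in for the mapped unit */
    struct chunk_region dyn = { unit, 32 * 1024 };

    printf("%d %d\n",
           addr_below_start_offset(&dyn, unit + 1024),     /* 1: static/reserved */
           addr_below_start_offset(&dyn, unit + 40000));   /* 0: dynamic */
    return 0;
}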
Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu-internal.h | 3 +++ mm/percpu.c | 21 ++++++++++----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index c9158a48ca05..92fc0121dfff 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -28,6 +28,9 @@ struct pcpu_chunk { contain reservation for static chunk. Dynamic chunk will contain reservation for static and reserved chunks. */ + int start_offset; /* the overlap with the previous + region to have a page aligned + base_addr */ int nr_populated; /* # of populated pages */ unsigned long populated[]; /* populated bitmap */ }; diff --git a/mm/percpu.c b/mm/percpu.c index 3602d41a7dfe..e94f0d18c421 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -145,13 +145,10 @@ struct pcpu_chunk *pcpu_first_chunk __ro_after_init; /* * Optional reserved chunk. This chunk reserves part of the first - * chunk and serves it for reserved allocations. The amount of - * reserved offset is in pcpu_reserved_chunk_limit. When reserved - * area doesn't exist, the following variables contain NULL and 0 - * respectively. + * chunk and serves it for reserved allocations. When the reserved + * region doesn't exist, the following variable is NULL. */ struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; -static int pcpu_reserved_chunk_limit __ro_after_init; DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ @@ -196,7 +193,7 @@ static bool pcpu_addr_in_reserved_chunk(void *addr) void *first_start = pcpu_first_chunk->base_addr; return addr >= first_start && - addr < first_start + pcpu_reserved_chunk_limit; + addr < first_start + pcpu_first_chunk->start_offset; } static int __pcpu_size_to_slot(int size) @@ -1687,6 +1684,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, INIT_LIST_HEAD(&schunk->list); INIT_LIST_HEAD(&schunk->map_extend_list); schunk->base_addr = base_addr; + schunk->start_offset = ai->static_size; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); schunk->immutable = true; @@ -1696,7 +1694,6 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (ai->reserved_size) { schunk->free_size = ai->reserved_size; pcpu_reserved_chunk = schunk; - pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ @@ -1704,7 +1701,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, schunk->contig_hint = schunk->free_size; schunk->map[0] = 1; - schunk->map[1] = ai->static_size; + schunk->map[1] = schunk->start_offset; schunk->map[2] = (ai->static_size + schunk->free_size) | 1; schunk->map_used = 2; schunk->has_reserved = true; @@ -1715,6 +1712,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, INIT_LIST_HEAD(&dchunk->list); INIT_LIST_HEAD(&dchunk->map_extend_list); dchunk->base_addr = base_addr; + dchunk->start_offset = ai->static_size + ai->reserved_size; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); dchunk->immutable = true; @@ -1723,16 +1721,17 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[0] = 1; - dchunk->map[1] = pcpu_reserved_chunk_limit; - dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1; + dchunk->map[1] = dchunk->start_offset; + dchunk->map[2] = (dchunk->start_offset + 
dchunk->free_size) | 1; dchunk->map_used = 2; dchunk->has_reserved = true; } /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; + i = (pcpu_first_chunk->start_offset) ? 1 : 0; pcpu_nr_empty_pop_pages += - pcpu_count_occupied_pages(pcpu_first_chunk, 1); + pcpu_count_occupied_pages(pcpu_first_chunk, i); pcpu_chunk_relocate(pcpu_first_chunk, -1); pcpu_stats_chunk_alloc(); From 4af1e6fbd8e46dc44a89edf215207fd7c8d5cdda Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:00 -0400 Subject: [PATCH 07/27] percpu: remove has_reserved from pcpu_chunk Prior this variable was used to manage statistics when the first chunk had a reserved region. The previous patch introduced start_offset to keep track of the offset by value rather than boolean. Therefore, has_reserved can be removed. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu-internal.h | 5 ----- mm/percpu-stats.c | 2 +- mm/percpu.c | 3 --- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 92fc0121dfff..c876b5b5bc18 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -23,11 +23,6 @@ struct pcpu_chunk { void *data; /* chunk data */ int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ - bool has_reserved; /* Indicates if chunk has reserved space - at the beginning. Reserved chunk will - contain reservation for static chunk. - Dynamic chunk will contain reservation - for static and reserved chunks. */ int start_offset; /* the overlap with the previous region to have a page aligned base_addr */ diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index 44e561d3ab47..32f3550ea099 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -58,7 +58,7 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0; alloc_sizes = buffer; - s_index = chunk->has_reserved ? 1 : 0; + s_index = (chunk->start_offset) ? 1 : 0; /* find last allocation */ last_alloc = -1; diff --git a/mm/percpu.c b/mm/percpu.c index e94f0d18c421..470e1a084a7c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -727,7 +727,6 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map[0] = 0; chunk->map[1] = pcpu_unit_size | 1; chunk->map_used = 1; - chunk->has_reserved = false; INIT_LIST_HEAD(&chunk->list); INIT_LIST_HEAD(&chunk->map_extend_list); @@ -1704,7 +1703,6 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, schunk->map[1] = schunk->start_offset; schunk->map[2] = (ai->static_size + schunk->free_size) | 1; schunk->map_used = 2; - schunk->has_reserved = true; /* init dynamic chunk if necessary */ if (dyn_size) { @@ -1724,7 +1722,6 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, dchunk->map[1] = dchunk->start_offset; dchunk->map[2] = (dchunk->start_offset + dchunk->free_size) | 1; dchunk->map_used = 2; - dchunk->has_reserved = true; } /* link the first chunk in */ From b9c39442ceffb202b98a88d492347eae125c5ba2 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:01 -0400 Subject: [PATCH 08/27] percpu: setup_first_chunk remove dyn_size and consolidate logic There is logic for setting variables in the static chunk init code that could be consolidated with the dynamic chunk init code. This combines this logic to setup for combining the allocation paths. reserved_size is used as the conditional as a dynamic region will always exist. 
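The consolidated assignment leans on the GNU "a ?: b" conditional that the kernel uses freely: it evaluates to a when a is non-zero and to b otherwise, without evaluating a twice. A stand-alone example of why reserved_size alone can act as the conditional (gcc/clang extension, values invented):

#include <stdio.h>

int main(void)
{
    int reserved_size = 0;      /* invented: no reserved region configured */
    int dyn_size = 20 << 10;    /* invented: a dynamic region always exists */

    /* serve the reserved region if there is one, else the dynamic region */
    int map_size = reserved_size ?: dyn_size;

    printf("map_size = %d\n", map_size);    /* prints 20480 */
    return 0;
}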
Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 470e1a084a7c..851aa8109788 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1562,8 +1562,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, { static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; - size_t dyn_size = ai->dyn_size; - size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; + size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; unsigned long *group_offsets; size_t *group_sizes; @@ -1690,14 +1689,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, bitmap_fill(schunk->populated, pcpu_unit_pages); schunk->nr_populated = pcpu_unit_pages; - if (ai->reserved_size) { - schunk->free_size = ai->reserved_size; - pcpu_reserved_chunk = schunk; - } else { - schunk->free_size = dyn_size; - dyn_size = 0; /* dynamic area covered */ - } - + schunk->free_size = ai->reserved_size ?: ai->dyn_size; schunk->contig_hint = schunk->free_size; schunk->map[0] = 1; schunk->map[1] = schunk->start_offset; @@ -1705,7 +1697,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, schunk->map_used = 2; /* init dynamic chunk if necessary */ - if (dyn_size) { + if (ai->reserved_size) { + pcpu_reserved_chunk = schunk; + dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); INIT_LIST_HEAD(&dchunk->map_extend_list); @@ -1717,7 +1711,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, bitmap_fill(dchunk->populated, pcpu_unit_pages); dchunk->nr_populated = pcpu_unit_pages; - dchunk->contig_hint = dchunk->free_size = dyn_size; + dchunk->contig_hint = dchunk->free_size = ai->dyn_size; dchunk->map[0] = 1; dchunk->map[1] = dchunk->start_offset; dchunk->map[2] = (dchunk->start_offset + dchunk->free_size) | 1; From 10edf5b0b6e238f9102c88df8b92ba7ce8fdcc46 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:02 -0400 Subject: [PATCH 09/27] percpu: unify allocation of schunk and dchunk Create a common allocator for first chunk initialization, pcpu_alloc_first_chunk. Comments for this function will be added in a later patch once the bitmap allocator is added. 
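Both call sites end up passing the helper a start_offset and a map_size and receiving the same three-entry area map: a hidden in-use head below start_offset, one free area for the region being served, and a terminating entry. A user-space sketch of that layout (show_first_chunk_map is a hypothetical name and the sizes are invented):

#include <stdio.h>

static void show_first_chunk_map(const char *name, int start_offset, int map_size)
{
    int map[3];

    map[0] = 1;                              /* offset 0, hidden head marked in use */
    map[1] = start_offset;                   /* served region, free */
    map[2] = (start_offset + map_size) | 1;  /* end of the region */

    printf("%-8s map = { %d, %d, %d }\n", name, map[0], map[1], map[2]);
}

int main(void)
{
    show_first_chunk_map("reserved", 24 << 10, 8 << 10);
    show_first_chunk_map("dynamic", 32 << 10, 28 << 10);
    return 0;
}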
Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu.c | 73 +++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 851aa8109788..2e785a77ce14 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -708,6 +708,36 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, pcpu_chunk_relocate(chunk, oslot); } +static struct pcpu_chunk * __init pcpu_alloc_first_chunk(void *base_addr, + int start_offset, + int map_size, + int *map, + int init_map_size) +{ + struct pcpu_chunk *chunk; + + chunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); + INIT_LIST_HEAD(&chunk->list); + INIT_LIST_HEAD(&chunk->map_extend_list); + chunk->base_addr = base_addr; + chunk->start_offset = start_offset; + chunk->map = map; + chunk->map_alloc = init_map_size; + + /* manage populated page bitmap */ + chunk->immutable = true; + bitmap_fill(chunk->populated, pcpu_unit_pages); + chunk->nr_populated = pcpu_unit_pages; + + chunk->contig_hint = chunk->free_size = map_size; + chunk->map[0] = 1; + chunk->map[1] = chunk->start_offset; + chunk->map[2] = (chunk->start_offset + chunk->free_size) | 1; + chunk->map_used = 2; + + return chunk; +} + static struct pcpu_chunk *pcpu_alloc_chunk(void) { struct pcpu_chunk *chunk; @@ -1570,6 +1600,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, unsigned int cpu; int *unit_map; int group, unit, i; + int map_size, start_offset; #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ @@ -1678,44 +1709,20 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * covers static area + reserved area (mostly used for module * static percpu allocation). */ - schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); - INIT_LIST_HEAD(&schunk->list); - INIT_LIST_HEAD(&schunk->map_extend_list); - schunk->base_addr = base_addr; - schunk->start_offset = ai->static_size; - schunk->map = smap; - schunk->map_alloc = ARRAY_SIZE(smap); - schunk->immutable = true; - bitmap_fill(schunk->populated, pcpu_unit_pages); - schunk->nr_populated = pcpu_unit_pages; - - schunk->free_size = ai->reserved_size ?: ai->dyn_size; - schunk->contig_hint = schunk->free_size; - schunk->map[0] = 1; - schunk->map[1] = schunk->start_offset; - schunk->map[2] = (ai->static_size + schunk->free_size) | 1; - schunk->map_used = 2; + start_offset = ai->static_size; + map_size = ai->reserved_size ?: ai->dyn_size; + schunk = pcpu_alloc_first_chunk(base_addr, start_offset, map_size, + smap, ARRAY_SIZE(smap)); /* init dynamic chunk if necessary */ if (ai->reserved_size) { pcpu_reserved_chunk = schunk; - dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); - INIT_LIST_HEAD(&dchunk->list); - INIT_LIST_HEAD(&dchunk->map_extend_list); - dchunk->base_addr = base_addr; - dchunk->start_offset = ai->static_size + ai->reserved_size; - dchunk->map = dmap; - dchunk->map_alloc = ARRAY_SIZE(dmap); - dchunk->immutable = true; - bitmap_fill(dchunk->populated, pcpu_unit_pages); - dchunk->nr_populated = pcpu_unit_pages; - - dchunk->contig_hint = dchunk->free_size = ai->dyn_size; - dchunk->map[0] = 1; - dchunk->map[1] = dchunk->start_offset; - dchunk->map[2] = (dchunk->start_offset + dchunk->free_size) | 1; - dchunk->map_used = 2; + start_offset = ai->static_size + ai->reserved_size; + map_size = ai->dyn_size; + dchunk = pcpu_alloc_first_chunk(base_addr, start_offset, + map_size, dmap, + ARRAY_SIZE(dmap)); } /* link the first chunk in */ From 
6b9d7c8e8ecf35dc9ba6763a45d81e54ee3ffcde Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:03 -0400 Subject: [PATCH 10/27] percpu: end chunk area maps page aligned for the populated bitmap The area map allocator manages the first chunk area by hiding all but the region it is responsible for serving in the area map. To align this with the populated page bitmap, end_offset is introduced to keep track of the delta to end page aligned. The area map is appended with the page aligned end when necessary to be in line with how the bitmap allocator requires the ending to be aligned with the LCM of PAGE_SIZE and the size of each bitmap block. percpu_stats is updated to ignore this region when present. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu-internal.h | 3 +++ mm/percpu-stats.c | 5 +++-- mm/percpu.c | 9 +++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index c876b5b5bc18..f02f31cea0e6 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -26,6 +26,9 @@ struct pcpu_chunk { int start_offset; /* the overlap with the previous region to have a page aligned base_addr */ + int end_offset; /* additional area required to + have the region end page + aligned */ int nr_populated; /* # of populated pages */ unsigned long populated[]; /* populated bitmap */ }; diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index 32f3550ea099..ffbdb96cdbeb 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -51,7 +51,7 @@ static int find_max_map_used(void) static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, int *buffer) { - int i, s_index, last_alloc, alloc_sign, as_len; + int i, s_index, e_index, last_alloc, alloc_sign, as_len; int *alloc_sizes, *p; /* statistics */ int sum_frag = 0, max_frag = 0; @@ -59,10 +59,11 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, alloc_sizes = buffer; s_index = (chunk->start_offset) ? 1 : 0; + e_index = chunk->map_used - ((chunk->end_offset) ? 
1 : 0); /* find last allocation */ last_alloc = -1; - for (i = chunk->map_used - 1; i >= s_index; i--) { + for (i = e_index - 1; i >= s_index; i--) { if (chunk->map[i] & 1) { last_alloc = i; break; diff --git a/mm/percpu.c b/mm/percpu.c index 2e785a77ce14..1d2c980fde3f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -715,12 +715,16 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(void *base_addr, int init_map_size) { struct pcpu_chunk *chunk; + int region_size; + + region_size = PFN_ALIGN(start_offset + map_size); chunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&chunk->list); INIT_LIST_HEAD(&chunk->map_extend_list); chunk->base_addr = base_addr; chunk->start_offset = start_offset; + chunk->end_offset = region_size - chunk->start_offset - map_size; chunk->map = map; chunk->map_alloc = init_map_size; @@ -735,6 +739,11 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(void *base_addr, chunk->map[2] = (chunk->start_offset + chunk->free_size) | 1; chunk->map_used = 2; + if (chunk->end_offset) { + /* hide the end of the bitmap */ + chunk->map[++chunk->map_used] = region_size | 1; + } + return chunk; } From 0c4169c3d11722a26773bdc0144c97fadd47d905 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:04 -0400 Subject: [PATCH 11/27] percpu: setup_first_chunk rename schunk/dchunk to chunk There is no need to have the static chunk and dynamic chunk be named separately as the allocations are sequential. This preemptively solves the misnomer problem with the base_addrs being moved up in the following patch. It also removes a ternary operation deciding the first chunk. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 1d2c980fde3f..e08ed61ea70a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1602,7 +1602,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; - struct pcpu_chunk *schunk, *dchunk = NULL; + struct pcpu_chunk *chunk; unsigned long *group_offsets; size_t *group_sizes; unsigned long *unit_off; @@ -1720,22 +1720,22 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ start_offset = ai->static_size; map_size = ai->reserved_size ?: ai->dyn_size; - schunk = pcpu_alloc_first_chunk(base_addr, start_offset, map_size, - smap, ARRAY_SIZE(smap)); + chunk = pcpu_alloc_first_chunk(base_addr, start_offset, map_size, smap, + ARRAY_SIZE(smap)); /* init dynamic chunk if necessary */ if (ai->reserved_size) { - pcpu_reserved_chunk = schunk; + pcpu_reserved_chunk = chunk; start_offset = ai->static_size + ai->reserved_size; map_size = ai->dyn_size; - dchunk = pcpu_alloc_first_chunk(base_addr, start_offset, - map_size, dmap, - ARRAY_SIZE(dmap)); + chunk = pcpu_alloc_first_chunk(base_addr, start_offset, + map_size, dmap, + ARRAY_SIZE(dmap)); } /* link the first chunk in */ - pcpu_first_chunk = dchunk ?: schunk; + pcpu_first_chunk = chunk; i = (pcpu_first_chunk->start_offset) ? 
1 : 0; pcpu_nr_empty_pop_pages += pcpu_count_occupied_pages(pcpu_first_chunk, i); From c0ebfdc3fefdef73131c7cb431ad8079f65c714a Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:05 -0400 Subject: [PATCH 12/27] percpu: modify base_addr to be region specific Originally, the first chunk was served by one or two chunks, each given a region they are responsible for. Despite this, the arithmetic was based off of the true base_addr of the chunk making it be overly inclusive. This patch moves the base_addr of chunks that are responsible for the first chunk. The base_addr must remain page aligned to keep the address alignment correct, so it is the beginning of the region served page aligned down. start_offset holds where the region served begins from this new base_addr. The corresponding percpu address checks are modified to be more specific as a result. The first chunk considers only the dynamic region and both first chunk and reserved chunk checks ignore the static region. The static region addresses should never be passed into the allocator. There is no impact here besides distinguishing the first chunk and making the checks specific. The percpu pointer to physical address is left intact as addresses are not given out in the non-allocated portion of percpu memory. nr_pages is added to pcpu_chunk to keep track of the size of the entire region served containing both start_offset and end_offset. This variable will be used to manage the bitmap allocator. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu-internal.h | 2 + mm/percpu.c | 155 +++++++++++++++++++++++++++++++------------ 2 files changed, 116 insertions(+), 41 deletions(-) diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index f02f31cea0e6..34cb9799e324 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -29,6 +29,8 @@ struct pcpu_chunk { int end_offset; /* additional area required to have the region end page aligned */ + + int nr_pages; /* # of pages served by this chunk */ int nr_populated; /* # of populated pages */ unsigned long populated[]; /* populated bitmap */ }; diff --git a/mm/percpu.c b/mm/percpu.c index e08ed61ea70a..7c9f0d3ad1b5 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -181,19 +181,55 @@ static void pcpu_schedule_balance_work(void) schedule_work(&pcpu_balance_work); } +/** + * pcpu_addr_in_first_chunk - address check for first chunk's dynamic region + * @addr: percpu address of interest + * + * The first chunk is considered to be the dynamic region of the first chunk. + * While the true first chunk is composed of the static, dynamic, and + * reserved regions, it is the chunk that serves the dynamic region that is + * circulated in the chunk slots. + * + * The reserved chunk has a separate check and the static region addresses + * should never be passed into the percpu allocator. + * + * RETURNS: + * True if the address is in the dynamic region of the first chunk. 
+ */ static bool pcpu_addr_in_first_chunk(void *addr) { - void *first_start = pcpu_first_chunk->base_addr; + void *start_addr = pcpu_first_chunk->base_addr + + pcpu_first_chunk->start_offset; + void *end_addr = pcpu_first_chunk->base_addr + + pcpu_first_chunk->nr_pages * PAGE_SIZE - + pcpu_first_chunk->end_offset; - return addr >= first_start && addr < first_start + pcpu_unit_size; + return addr >= start_addr && addr < end_addr; } +/** + * pcpu_addr_in_reserved_chunk - address check for reserved region + * + * The reserved region is a part of the first chunk and primarily serves + * static percpu variables from kernel modules. + * + * RETURNS: + * True if the address is in the reserved region. + */ static bool pcpu_addr_in_reserved_chunk(void *addr) { - void *first_start = pcpu_first_chunk->base_addr; + void *start_addr, *end_addr; - return addr >= first_start && - addr < first_start + pcpu_first_chunk->start_offset; + if (!pcpu_reserved_chunk) + return false; + + start_addr = pcpu_reserved_chunk->base_addr + + pcpu_reserved_chunk->start_offset; + end_addr = pcpu_reserved_chunk->base_addr + + pcpu_reserved_chunk->nr_pages * PAGE_SIZE - + pcpu_reserved_chunk->end_offset; + + return addr >= start_addr && addr < end_addr; } static int __pcpu_size_to_slot(int size) @@ -234,11 +270,16 @@ static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } +static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx) +{ + return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT); +} + static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { - return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + - (page_idx << PAGE_SHIFT); + return (unsigned long)chunk->base_addr + + pcpu_unit_page_offset(cpu, page_idx); } static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk, @@ -708,23 +749,34 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, pcpu_chunk_relocate(chunk, oslot); } -static struct pcpu_chunk * __init pcpu_alloc_first_chunk(void *base_addr, - int start_offset, +static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, int map_size, int *map, int init_map_size) { struct pcpu_chunk *chunk; - int region_size; + unsigned long aligned_addr; + int start_offset, region_size; + + /* region calculations */ + aligned_addr = tmp_addr & PAGE_MASK; + + start_offset = tmp_addr - aligned_addr; region_size = PFN_ALIGN(start_offset + map_size); + /* allocate chunk */ chunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); + INIT_LIST_HEAD(&chunk->list); INIT_LIST_HEAD(&chunk->map_extend_list); - chunk->base_addr = base_addr; + + chunk->base_addr = (void *)aligned_addr; chunk->start_offset = start_offset; chunk->end_offset = region_size - chunk->start_offset - map_size; + + chunk->nr_pages = pcpu_unit_pages; + chunk->map = map; chunk->map_alloc = init_map_size; @@ -734,10 +786,17 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(void *base_addr, chunk->nr_populated = pcpu_unit_pages; chunk->contig_hint = chunk->free_size = map_size; - chunk->map[0] = 1; - chunk->map[1] = chunk->start_offset; - chunk->map[2] = (chunk->start_offset + chunk->free_size) | 1; - chunk->map_used = 2; + + if (chunk->start_offset) { + /* hide the beginning of the bitmap */ + chunk->map[0] = 1; + chunk->map[1] = chunk->start_offset; + chunk->map_used = 1; + } + + /* set chunk's free region */ + chunk->map[++chunk->map_used] = + (chunk->start_offset + 
chunk->free_size) | 1; if (chunk->end_offset) { /* hide the end of the bitmap */ @@ -772,6 +831,8 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; + chunk->nr_pages = pcpu_unit_pages; + return chunk; } @@ -859,18 +920,21 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); * pcpu_chunk_addr_search - determine chunk containing specified address * @addr: address for which the chunk needs to be determined. * + * This is an internal function that handles all but static allocations. + * Static percpu address values should never be passed into the allocator. + * * RETURNS: * The address of the found chunk. */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { - /* is it in the first chunk? */ - if (pcpu_addr_in_first_chunk(addr)) { - /* is it in the reserved area? */ - if (pcpu_addr_in_reserved_chunk(addr)) - return pcpu_reserved_chunk; + /* is it in the dynamic region (first chunk)? */ + if (pcpu_addr_in_first_chunk(addr)) return pcpu_first_chunk; - } + + /* is it in the reserved region? */ + if (pcpu_addr_in_reserved_chunk(addr)) + return pcpu_reserved_chunk; /* * The address is relative to unit0 which might be unused and @@ -1401,10 +1465,16 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) * The following test on unit_low/high isn't strictly * necessary but will speed up lookups of addresses which * aren't in the first chunk. + * + * The address check is against full chunk sizes. pcpu_base_addr + * points to the beginning of the first chunk including the + * static region. Assumes good intent as the first chunk may + * not be full (ie. < pcpu_unit_pages in size). */ - first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0); - first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu, - pcpu_unit_pages); + first_low = (unsigned long)pcpu_base_addr + + pcpu_unit_page_offset(pcpu_low_unit_cpu, 0); + first_high = (unsigned long)pcpu_base_addr + + pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages); if ((unsigned long)addr >= first_low && (unsigned long)addr < first_high) { for_each_possible_cpu(cpu) { @@ -1586,12 +1656,13 @@ static void pcpu_dump_alloc_info(const char *lvl, * The caller should have mapped the first chunk at @base_addr and * copied static data to each unit. * - * If the first chunk ends up with both reserved and dynamic areas, it - * is served by two chunks - one to serve the core static and reserved - * areas and the other for the dynamic area. They share the same vm - * and page map but uses different area allocation map to stay away - * from each other. The latter chunk is circulated in the chunk slots - * and available for dynamic allocation like any other chunks. + * The first chunk will always contain a static and a dynamic region. + * However, the static region is not managed by any chunk. If the first + * chunk also contains a reserved region, it is served by two chunks - + * one for the reserved region and one for the dynamic region. They + * share the same vm, but use offset regions in the area allocation map. + * The chunk serving the dynamic region is circulated in the chunk slots + * and available for dynamic allocation like any other chunk. * * RETURNS: * 0 on success, -errno on failure. 
@@ -1609,7 +1680,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, unsigned int cpu; int *unit_map; int group, unit, i; - int map_size, start_offset; + int map_size; + unsigned long tmp_addr; #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ @@ -1712,25 +1784,26 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, INIT_LIST_HEAD(&pcpu_slot[i]); /* - * Initialize static chunk. If reserved_size is zero, the - * static chunk covers static area + dynamic allocation area - * in the first chunk. If reserved_size is not zero, it - * covers static area + reserved area (mostly used for module - * static percpu allocation). + * Initialize first chunk. + * If the reserved_size is non-zero, this initializes the reserved + * chunk. If the reserved_size is zero, the reserved chunk is NULL + * and the dynamic region is initialized here. The first chunk, + * pcpu_first_chunk, will always point to the chunk that serves + * the dynamic region. */ - start_offset = ai->static_size; + tmp_addr = (unsigned long)base_addr + ai->static_size; map_size = ai->reserved_size ?: ai->dyn_size; - chunk = pcpu_alloc_first_chunk(base_addr, start_offset, map_size, smap, + chunk = pcpu_alloc_first_chunk(tmp_addr, map_size, smap, ARRAY_SIZE(smap)); /* init dynamic chunk if necessary */ if (ai->reserved_size) { pcpu_reserved_chunk = chunk; - start_offset = ai->static_size + ai->reserved_size; + tmp_addr = (unsigned long)base_addr + ai->static_size + + ai->reserved_size; map_size = ai->dyn_size; - chunk = pcpu_alloc_first_chunk(base_addr, start_offset, - map_size, dmap, + chunk = pcpu_alloc_first_chunk(tmp_addr, map_size, dmap, ARRAY_SIZE(dmap)); } From 560f2c23666853b31acc32c892d44f5b14d258de Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:06 -0400 Subject: [PATCH 13/27] percpu: combine percpu address checks The percpu address checks for the reserved and dynamic region chunks are now specific to each region. The address checking logic can be combined taking advantage of the global references to the dynamic and static region chunks. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu.c | 51 +++++++++++---------------------------------------- 1 file changed, 11 insertions(+), 40 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 7c9f0d3ad1b5..5b1fcefdc386 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -182,52 +182,23 @@ static void pcpu_schedule_balance_work(void) } /** - * pcpu_addr_in_first_chunk - address check for first chunk's dynamic region - * @addr: percpu address of interest - * - * The first chunk is considered to be the dynamic region of the first chunk. - * While the true first chunk is composed of the static, dynamic, and - * reserved regions, it is the chunk that serves the dynamic region that is - * circulated in the chunk slots. - * - * The reserved chunk has a separate check and the static region addresses - * should never be passed into the percpu allocator. + * pcpu_addr_in_chunk - check if the address is served from this chunk + * @chunk: chunk of interest + * @addr: percpu address * * RETURNS: - * True if the address is in the dynamic region of the first chunk. + * True if the address is served from this chunk. 
*/ -static bool pcpu_addr_in_first_chunk(void *addr) -{ - void *start_addr = pcpu_first_chunk->base_addr + - pcpu_first_chunk->start_offset; - void *end_addr = pcpu_first_chunk->base_addr + - pcpu_first_chunk->nr_pages * PAGE_SIZE - - pcpu_first_chunk->end_offset; - - return addr >= start_addr && addr < end_addr; -} - -/** - * pcpu_addr_in_reserved_chunk - address check for reserved region - * - * The reserved region is a part of the first chunk and primarily serves - * static percpu variables from kernel modules. - * - * RETURNS: - * True if the address is in the reserved region. - */ -static bool pcpu_addr_in_reserved_chunk(void *addr) +static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr) { void *start_addr, *end_addr; - if (!pcpu_reserved_chunk) + if (!chunk) return false; - start_addr = pcpu_reserved_chunk->base_addr + - pcpu_reserved_chunk->start_offset; - end_addr = pcpu_reserved_chunk->base_addr + - pcpu_reserved_chunk->nr_pages * PAGE_SIZE - - pcpu_reserved_chunk->end_offset; + start_addr = chunk->base_addr + chunk->start_offset; + end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE - + chunk->end_offset; return addr >= start_addr && addr < end_addr; } @@ -929,11 +900,11 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { /* is it in the dynamic region (first chunk)? */ - if (pcpu_addr_in_first_chunk(addr)) + if (pcpu_addr_in_chunk(pcpu_first_chunk, addr)) return pcpu_first_chunk; /* is it in the reserved region? */ - if (pcpu_addr_in_reserved_chunk(addr)) + if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr)) return pcpu_reserved_chunk; /* From 8ab16c43ea79098f4126432c6b199a5d6ba24b6d Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:07 -0400 Subject: [PATCH 14/27] percpu: change the number of pages marked in the first_chunk pop bitmap The populated bitmap represents the state of the pages the chunk serves. Prior, the bitmap was marked completely used as the first chunk was allocated and immutable. This is misleading because the first chunk may not be completely filled. Additionally, with moving the base_addr up in the previous patch, the population check no longer corresponds to what was being checked. This patch modifies the population map to be only the number of pages the region serves and to make what it was checking correspond correctly again. The change is to remove any misunderstanding between the size of the populated bitmap and the actual size of it. The work function page iterators now use nr_pages for the check rather than pcpu_unit_pages because nr_populated is now chunk specific. Without this, the work function would try to populate the remainder of these chunks despite it not serving any more than nr_pages when nr_pages is set less than pcpu_unit_pages. 
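A compact user-space sketch of the sizing change, using simplified stand-ins for the kernel's bitmap helpers and invented page counts; the point is only that both the bitmap and the iteration bound are now nr_pages rather than pcpu_unit_pages:

#include <stdio.h>
#include <string.h>

#define BITS_PER_LONG    (8 * (int)sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
    int unit_pages = 64;    /* invented: pages in a full unit */
    int nr_pages = 9;       /* invented: pages this first chunk region spans */
    unsigned long populated[BITS_TO_LONGS(64)];
    int nr_populated = 0;

    memset(populated, 0, sizeof(populated));

    /* the first chunk region starts out fully populated ... */
    for (int i = 0; i < nr_pages; i++)
        populated[i / BITS_PER_LONG] |= 1UL << (i % BITS_PER_LONG);

    /* ... and later walks are bounded by nr_pages, not unit_pages */
    for (int i = 0; i < nr_pages; i++)
        if (populated[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
            nr_populated++;

    printf("%d/%d pages populated (unit has %d pages)\n",
           nr_populated, nr_pages, unit_pages);
    return 0;
}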
Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 5b1fcefdc386..773dafea181e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -737,7 +737,9 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, region_size = PFN_ALIGN(start_offset + map_size); /* allocate chunk */ - chunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); + chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(region_size >> PAGE_SHIFT), + 0); INIT_LIST_HEAD(&chunk->list); INIT_LIST_HEAD(&chunk->map_extend_list); @@ -746,15 +748,15 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, chunk->start_offset = start_offset; chunk->end_offset = region_size - chunk->start_offset - map_size; - chunk->nr_pages = pcpu_unit_pages; + chunk->nr_pages = region_size >> PAGE_SHIFT; chunk->map = map; chunk->map_alloc = init_map_size; /* manage populated page bitmap */ chunk->immutable = true; - bitmap_fill(chunk->populated, pcpu_unit_pages); - chunk->nr_populated = pcpu_unit_pages; + bitmap_fill(chunk->populated, chunk->nr_pages); + chunk->nr_populated = chunk->nr_pages; chunk->contig_hint = chunk->free_size = map_size; @@ -1212,7 +1214,7 @@ static void pcpu_balance_workfn(struct work_struct *work) list_for_each_entry_safe(chunk, next, &to_free, list) { int rs, re; - pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { + pcpu_for_each_pop_region(chunk, rs, re, 0, chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, rs, re); @@ -1269,7 +1271,7 @@ retry_pop: spin_lock_irq(&pcpu_lock); list_for_each_entry(chunk, &pcpu_slot[slot], list) { - nr_unpop = pcpu_unit_pages - chunk->nr_populated; + nr_unpop = chunk->nr_pages - chunk->nr_populated; if (nr_unpop) break; } @@ -1279,7 +1281,7 @@ retry_pop: continue; /* @chunk can't go away while pcpu_alloc_mutex is held */ - pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { + pcpu_for_each_unpop_region(chunk, rs, re, 0, chunk->nr_pages) { int nr = min(re - rs, nr_to_pop); ret = pcpu_populate_chunk(chunk, rs, rs + nr); From 0cecf50cf00fbe6858908098ae6c6a9fd1d60724 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:08 -0400 Subject: [PATCH 15/27] percpu: introduce nr_empty_pop_pages to help empty page accounting pcpu_nr_empty_pop_pages is used to ensure there are a handful of free pages around to serve atomic allocations. A new field, nr_empty_pop_pages, is added to the pcpu_chunk struct to keep track of the number of empty pages. This field is needed as the number of empty populated pages is globally tracked and deltas are used to update in the bitmap allocator. Pages that contain a hidden area are not considered to be empty. This new field is exposed in percpu_stats. 
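The rule that pages containing a hidden area are not counted as empty boils down to discounting the page shared at each end of the region. A user-space sketch with invented offsets (not the kernel code):

#include <stdio.h>

int main(void)
{
    int nr_pages = 9;           /* invented: populated pages in the region */
    int start_offset = 2048;    /* first page overlaps the previous region */
    int end_offset = 1024;      /* last page pads the region to a page boundary */
    int nr_empty_pop_pages = nr_pages;

    if (start_offset)
        nr_empty_pop_pages--;   /* page holding the hidden head is not empty */
    if (end_offset)
        nr_empty_pop_pages--;   /* page holding the hidden tail is not empty */

    printf("%d of %d populated pages are empty\n",
           nr_empty_pop_pages, nr_pages);
    return 0;
}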
Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu-internal.h | 1 + mm/percpu-stats.c | 1 + mm/percpu.c | 11 ++++++++--- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 34cb9799e324..c4c8fc49780b 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -32,6 +32,7 @@ struct pcpu_chunk { int nr_pages; /* # of pages served by this chunk */ int nr_populated; /* # of populated pages */ + int nr_empty_pop_pages; /* # of empty populated pages */ unsigned long populated[]; /* populated bitmap */ }; diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index ffbdb96cdbeb..e146b585fd18 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -100,6 +100,7 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, P("nr_alloc", chunk->nr_alloc); P("max_alloc_size", chunk->max_alloc_size); + P("empty_pop_pages", chunk->nr_empty_pop_pages); P("free_size", chunk->free_size); P("contig_hint", chunk->contig_hint); P("sum_frag", sum_frag); diff --git a/mm/percpu.c b/mm/percpu.c index 773dafea181e..657ab0821cf0 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -757,11 +757,14 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, chunk->immutable = true; bitmap_fill(chunk->populated, chunk->nr_pages); chunk->nr_populated = chunk->nr_pages; + chunk->nr_empty_pop_pages = chunk->nr_pages; chunk->contig_hint = chunk->free_size = map_size; if (chunk->start_offset) { /* hide the beginning of the bitmap */ + chunk->nr_empty_pop_pages--; + chunk->map[0] = 1; chunk->map[1] = chunk->start_offset; chunk->map_used = 1; @@ -773,6 +776,8 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, if (chunk->end_offset) { /* hide the end of the bitmap */ + chunk->nr_empty_pop_pages--; + chunk->map[++chunk->map_used] = region_size | 1; } @@ -836,6 +841,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, bitmap_set(chunk->populated, page_start, nr); chunk->nr_populated += nr; + chunk->nr_empty_pop_pages += nr; pcpu_nr_empty_pop_pages += nr; } @@ -858,6 +864,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, bitmap_clear(chunk->populated, page_start, nr); chunk->nr_populated -= nr; + chunk->nr_empty_pop_pages -= nr; pcpu_nr_empty_pop_pages -= nr; } @@ -1782,9 +1789,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* link the first chunk in */ pcpu_first_chunk = chunk; - i = (pcpu_first_chunk->start_offset) ? 1 : 0; - pcpu_nr_empty_pop_pages += - pcpu_count_occupied_pages(pcpu_first_chunk, i); + pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); pcpu_stats_chunk_alloc(); From d2f3c3849461baefdbb39123abde1054d46bf22e Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:09 -0400 Subject: [PATCH 16/27] percpu: increase minimum percpu allocation size and align first regions This patch increases the minimum allocation size of percpu memory to 4-bytes. This change will help minimize the metadata overhead associated with the bitmap allocator. The assumption is that most allocations will be of objects or structs greater than 2 bytes with integers or longs being used rather than shorts. The first chunk regions are now aligned with the minimum allocation size. The reserved region is expected to be set as a multiple of the minimum allocation size. The static region is aligned up and the delta is removed from the dynamic size. 
This works because the dynamic size is increased to be page aligned. If the static size is not minimum allocation size aligned, then there must be a gap that is added to the dynamic size. The dynamic size will never be smaller than the set value. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- include/linux/percpu.h | 4 ++++ mm/percpu.c | 27 ++++++++++++++++++++------- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 491b3f5a5f8a..90e0cb0f7802 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -21,6 +21,10 @@ /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10) +/* minimum allocation size and shift in bytes */ +#define PCPU_MIN_ALLOC_SHIFT 2 +#define PCPU_MIN_ALLOC_SIZE (1 << PCPU_MIN_ALLOC_SHIFT) + /* * Percpu allocator can serve percpu allocations before slab is * initialized which allows slab to depend on the percpu allocator. diff --git a/mm/percpu.c b/mm/percpu.c index 657ab0821cf0..dc755721c333 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -956,10 +956,10 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, * We want the lowest bit of offset available for in-use/free * indicator, so force >= 16bit alignment and make size even. */ - if (unlikely(align < 2)) - align = 2; + if (unlikely(align < PCPU_MIN_ALLOC_SIZE)) + align = PCPU_MIN_ALLOC_SIZE; - size = ALIGN(size, 2); + size = ALIGN(size, PCPU_MIN_ALLOC_SIZE); if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || !is_power_of_2(align))) { @@ -1653,6 +1653,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; + size_t static_size, dyn_size; struct pcpu_chunk *chunk; unsigned long *group_offsets; size_t *group_sizes; @@ -1686,6 +1687,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); PCPU_SETUP_BUG_ON(!ai->dyn_size); + PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE)); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ @@ -1763,6 +1765,17 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); + /* + * The end of the static region needs to be aligned with the + * minimum allocation size as this offsets the reserved and + * dynamic region. The first chunk ends page aligned by + * expanding the dynamic region, therefore the dynamic region + * can be shrunk to compensate while still staying above the + * configured sizes. + */ + static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE); + dyn_size = ai->dyn_size - (static_size - ai->static_size); + /* * Initialize first chunk. * If the reserved_size is non-zero, this initializes the reserved @@ -1771,8 +1784,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * pcpu_first_chunk, will always point to the chunk that serves * the dynamic region. 
*/ - tmp_addr = (unsigned long)base_addr + ai->static_size; - map_size = ai->reserved_size ?: ai->dyn_size; + tmp_addr = (unsigned long)base_addr + static_size; + map_size = ai->reserved_size ?: dyn_size; chunk = pcpu_alloc_first_chunk(tmp_addr, map_size, smap, ARRAY_SIZE(smap)); @@ -1780,9 +1793,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (ai->reserved_size) { pcpu_reserved_chunk = chunk; - tmp_addr = (unsigned long)base_addr + ai->static_size + + tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size; - map_size = ai->dyn_size; + map_size = dyn_size; chunk = pcpu_alloc_first_chunk(tmp_addr, map_size, dmap, ARRAY_SIZE(dmap)); } From 91e914c5a4988d00a13c14297ab02b250611e00e Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:10 -0400 Subject: [PATCH 17/27] percpu: generalize bitmap (un)populated iterators The area map allocator only used a bitmap for the backing page state. The new bitmap allocator will use bitmaps to manage the allocation region in addition to this. This patch generalizes the bitmap iterators so they can be reused with the bitmap allocator. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index dc755721c333..84cc2559d4aa 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -253,35 +253,32 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, pcpu_unit_page_offset(cpu, page_idx); } -static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk, - int *rs, int *re, int end) +static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end) { - *rs = find_next_zero_bit(chunk->populated, end, *rs); - *re = find_next_bit(chunk->populated, end, *rs + 1); + *rs = find_next_zero_bit(bitmap, end, *rs); + *re = find_next_bit(bitmap, end, *rs + 1); } -static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, - int *rs, int *re, int end) +static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end) { - *rs = find_next_bit(chunk->populated, end, *rs); - *re = find_next_zero_bit(chunk->populated, end, *rs + 1); + *rs = find_next_bit(bitmap, end, *rs); + *re = find_next_zero_bit(bitmap, end, *rs + 1); } /* - * (Un)populated page region iterators. Iterate over (un)populated - * page regions between @start and @end in @chunk. @rs and @re should - * be integer variables and will be set to start and end page index of - * the current region. + * Bitmap region iterators. Iterates over the bitmap between + * [@start, @end) in @chunk. @rs and @re should be integer variables + * and will be set to start and end index of the current free region. 
*/ -#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \ - for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end))) +#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end))) -#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \ - for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) +#define pcpu_for_each_pop_region(bitmap, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end))) /** * pcpu_mem_zalloc - allocate memory @@ -521,7 +518,8 @@ static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, page_end = PFN_UP(head + off + size); rs = page_start; - pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); + pcpu_next_unpop(chunk->populated, &rs, &re, + PFN_UP(off + this_size)); if (rs >= page_end) return head; cand_off = re * PAGE_SIZE; @@ -1071,7 +1069,8 @@ area_found: page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + pcpu_for_each_unpop_region(chunk->populated, rs, re, + page_start, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re); @@ -1221,7 +1220,8 @@ static void pcpu_balance_workfn(struct work_struct *work) list_for_each_entry_safe(chunk, next, &to_free, list) { int rs, re; - pcpu_for_each_pop_region(chunk, rs, re, 0, chunk->nr_pages) { + pcpu_for_each_pop_region(chunk->populated, rs, re, 0, + chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, rs, re); @@ -1288,7 +1288,8 @@ retry_pop: continue; /* @chunk can't go away while pcpu_alloc_mutex is held */ - pcpu_for_each_unpop_region(chunk, rs, re, 0, chunk->nr_pages) { + pcpu_for_each_unpop_region(chunk->populated, rs, re, 0, + chunk->nr_pages) { int nr = min(re - rs, nr_to_pop); ret = pcpu_populate_chunk(chunk, rs, rs + nr); From 40064aeca35c5c14789e2adcf3a1d7e5d4bd65f2 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Wed, 12 Jul 2017 11:27:32 -0700 Subject: [PATCH 18/27] percpu: replace area map allocator with bitmap The percpu memory allocator is experiencing scalability issues when allocating and freeing large numbers of counters as in BPF. Additionally, there is a corner case where iteration is triggered over all chunks if the contig_hint is the right size, but wrong alignment. This patch replaces the area map allocator with a basic bitmap allocator implementation. Each subsequent patch will introduce new features and replace full scanning functions with faster non-scanning options when possible. Implementation: This patchset removes the area map allocator in favor of a bitmap allocator backed by metadata blocks. The primary goal is to provide consistency in performance and memory footprint with a focus on small allocations (< 64 bytes). The bitmap removes the heavy memmove from the freeing critical path and provides a consistent memory footprint. The metadata blocks provide a bound on the amount of scanning required by maintaining a set of hints. 
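To illustrate the core idea before the diff, the following stand-alone C model (user-space code with made-up names and a tiny region, not the kernel implementation) shows how an allocation map plus a boundary map are enough to allocate by scanning for a free run and to free without storing the size anywhere else:

  #include <stdio.h>
  #include <string.h>

  #define REGION_BITS 64  /* toy region: 64 minimum-size allocation units */

  static unsigned char alloc_map[REGION_BITS];     /* 1 = unit in use */
  static unsigned char bound_map[REGION_BITS + 1]; /* 1 = an area starts here */

  /* find a free run of @bits units, mark it allocated, record its bounds */
  static int toy_alloc(int bits)
  {
          int off, run;

          for (off = 0; off + bits <= REGION_BITS; off++) {
                  for (run = 0; run < bits && !alloc_map[off + run]; run++)
                          ;
                  if (run == bits) {
                          memset(&alloc_map[off], 1, bits);
                          memset(&bound_map[off], 0, bits);
                          bound_map[off] = 1;        /* start of this area */
                          bound_map[off + bits] = 1; /* start of the next area */
                          return off;
                  }
          }
          return -1;
  }

  /* the size being freed is recovered from the boundary map, not a header */
  static void toy_free(int off)
  {
          int end = off + 1;

          while (end <= REGION_BITS && !bound_map[end])
                  end++;
          memset(&alloc_map[off], 0, end - off);
  }

  int main(void)
  {
          int a = toy_alloc(3);
          int b = toy_alloc(5);

          printf("a at %d, b at %d\n", a, b);     /* a at 0, b at 3 */
          toy_free(a);
          printf("reuse at %d\n", toy_alloc(2));  /* reuse at 0 */
          return 0;
  }

The kernel version below does the same with bitmap_set()/bitmap_clear() on real bitmaps and additionally maintains hints so that most scans can be skipped.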
In an effort to make freeing fast, the metadata is updated on the free path if the new free area makes a page free, a block free, or spans across blocks. This causes the chunk's contig hint to potentially be smaller than what it could allocate by up to the smaller of a page or a block. If the chunk's contig hint is contained within a block, a check occurs and the hint is kept accurate. Metadata is always kept accurate on allocation, so there will not be a situation where a chunk has a larger contig hint than available.

Evaluation:

I have primarily done testing against a simple workload of allocation of 1 million objects (2^20) of varying size. Deallocation was done in order, alternating, and in reverse. These numbers were collected after rebasing on top of a80099a152. I present the worst-case numbers here:

  Area Map Allocator:

        Object Size | Alloc Time (ms) | Free Time (ms)
        ----------------------------------------------
                 4B |             310 |           4770
                16B |             557 |           1325
                64B |             436 |            273
               256B |             776 |            131
              1024B |            3280 |            122

  Bitmap Allocator:

        Object Size | Alloc Time (ms) | Free Time (ms)
        ----------------------------------------------
                 4B |             490 |             70
                16B |             515 |             75
                64B |             610 |             80
               256B |             950 |            100
              1024B |            3520 |            200

This data demonstrates the inability of the area map allocator to handle less-than-ideal situations. In the best case of reverse deallocation, the area map allocator was able to perform within range of the bitmap allocator. In the worst case, freeing took nearly 5 seconds for 1 million 4-byte objects. The bitmap allocator dramatically improves the consistency of the free path. The small allocations performed nearly identically regardless of the freeing pattern. While it does add to the allocation latency, the allocation scenario here is optimal for the area map allocator.

The area map allocator runs into trouble when it is allocating in chunks where the latter half is full. It is difficult to replicate this, so I present a variant where the second half of the pages is filled. Freeing was done sequentially. Below are the numbers for this scenario:

  Area Map Allocator:

        Object Size | Alloc Time (ms) | Free Time (ms)
        ----------------------------------------------
                 4B |            4118 |           4892
                16B |            1651 |           1163
                64B |             598 |            285
               256B |             771 |            158
              1024B |            3034 |            160

  Bitmap Allocator:

        Object Size | Alloc Time (ms) | Free Time (ms)
        ----------------------------------------------
                 4B |             481 |             67
                16B |             506 |             69
                64B |             636 |             75
               256B |             892 |             90
              1024B |            3262 |            147

The data shows a parabolic curve of performance for the area map allocator. This is due to the memmove operation being the dominant cost at the lower object sizes, as more objects are packed into a chunk, while at higher object sizes the traversal of the chunk slots is the dominating cost. The bitmap allocator suffers this problem as well. The above data shows the inability of the area map allocator to scale on the allocation path and that the bitmap allocator demonstrates consistent performance in general.

The second problem of additional scanning can result in the area map allocator completing in 52 minutes when trying to allocate 1 million 4-byte objects with 8-byte alignment. The same workload takes approximately 16 seconds to complete for the bitmap allocator.

V2: Fixed a bug in pcpu_alloc_first_chunk where end_offset was setting the bitmap using bytes instead of bits. Added a comment to pcpu_cnt_pop_pages to explain bitmap_weight.
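For reference, a workload similar to the one described above can be reproduced with a throwaway module along the following lines (only a sketch of the setup, not the harness that produced the numbers above; the module name and parameters are made up):

  #include <linux/module.h>
  #include <linux/percpu.h>
  #include <linux/vmalloc.h>
  #include <linux/ktime.h>

  #define NR_OBJS   (1 << 20)  /* 1 million objects */
  #define OBJ_SIZE  4          /* bytes */
  #define OBJ_ALIGN 4          /* use 8 to exercise the alignment corner case */

  static void __percpu **objs;

  static int __init pcpu_bench_init(void)
  {
          ktime_t t0;
          s64 alloc_ms, free_ms;
          int i;

          objs = vmalloc(NR_OBJS * sizeof(*objs));
          if (!objs)
                  return -ENOMEM;

          t0 = ktime_get();
          for (i = 0; i < NR_OBJS; i++)
                  objs[i] = __alloc_percpu(OBJ_SIZE, OBJ_ALIGN);
          alloc_ms = ktime_ms_delta(ktime_get(), t0);

          /* in-order free; walk the array backwards for the reverse pattern */
          t0 = ktime_get();
          for (i = 0; i < NR_OBJS; i++)
                  free_percpu(objs[i]);   /* free_percpu() ignores NULL */
          free_ms = ktime_ms_delta(ktime_get(), t0);

          pr_info("pcpu_bench: alloc %lld ms, free %lld ms\n", alloc_ms, free_ms);
          vfree(objs);
          return 0;
  }

  static void __exit pcpu_bench_exit(void)
  {
  }

  module_init(pcpu_bench_init);
  module_exit(pcpu_bench_exit);
  MODULE_LICENSE("GPL");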
Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- include/linux/percpu.h | 1 - init/main.c | 1 - mm/percpu-internal.h | 34 +- mm/percpu-km.c | 2 +- mm/percpu-stats.c | 97 +++-- mm/percpu.c | 807 ++++++++++++++++------------------------- 6 files changed, 400 insertions(+), 542 deletions(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 90e0cb0f7802..b7e6c98722d1 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -120,7 +120,6 @@ extern bool is_kernel_percpu_address(unsigned long addr); #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) extern void __init setup_per_cpu_areas(void); #endif -extern void __init percpu_init_late(void); extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); extern void __percpu *__alloc_percpu(size_t size, size_t align); diff --git a/init/main.c b/init/main.c index 052481fbe363..c9a9ffff6ec6 100644 --- a/init/main.c +++ b/init/main.c @@ -500,7 +500,6 @@ static void __init mm_init(void) page_ext_init_flatmem(); mem_init(); kmem_cache_init(); - percpu_init_late(); pgtable_init(); vmalloc_init(); ioremap_huge_init(); diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index c4c8fc49780b..2e9d9bcb6fa2 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -11,14 +11,12 @@ struct pcpu_chunk { #endif struct list_head list; /* linked to pcpu_slot lists */ - int free_size; /* free bytes in the chunk */ - int contig_hint; /* max contiguous size hint */ + int free_bytes; /* free bytes in the chunk */ + int contig_bits; /* max contiguous size hint */ void *base_addr; /* base address of this chunk */ - int map_used; /* # of map entries used before the sentry */ - int map_alloc; /* # of map entries allocated */ - int *map; /* allocation map */ - struct list_head map_extend_list;/* on pcpu_map_extend_chunks */ + unsigned long *alloc_map; /* allocation map */ + unsigned long *bound_map; /* boundary map */ void *data; /* chunk data */ int first_free; /* no free below this */ @@ -45,6 +43,30 @@ extern int pcpu_nr_empty_pop_pages; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; +/** + * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap + * @pages: number of physical pages + * + * This conversion is from physical pages to the number of bits + * required in the bitmap. + */ +static inline int pcpu_nr_pages_to_map_bits(int pages) +{ + return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; +} + +/** + * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap + * @chunk: chunk of interest + * + * This conversion is from the number of physical pages that the chunk + * serves to the number of bits in the bitmap. 
+ */ +static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) +{ + return pcpu_nr_pages_to_map_bits(chunk->nr_pages); +} + #ifdef CONFIG_PERCPU_STATS #include diff --git a/mm/percpu-km.c b/mm/percpu-km.c index eb58aa4c0997..d2a76642c4ae 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; spin_lock_irq(&pcpu_lock); - pcpu_chunk_populated(chunk, 0, nr_pages); + pcpu_chunk_populated(chunk, 0, nr_pages, false); spin_unlock_irq(&pcpu_lock); pcpu_stats_chunk_alloc(); diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index e146b585fd18..ad03d73aa5fe 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -29,65 +29,85 @@ static int cmpint(const void *a, const void *b) } /* - * Iterates over all chunks to find the max # of map entries used. + * Iterates over all chunks to find the max nr_alloc entries. */ -static int find_max_map_used(void) +static int find_max_nr_alloc(void) { struct pcpu_chunk *chunk; - int slot, max_map_used; + int slot, max_nr_alloc; - max_map_used = 0; + max_nr_alloc = 0; for (slot = 0; slot < pcpu_nr_slots; slot++) list_for_each_entry(chunk, &pcpu_slot[slot], list) - max_map_used = max(max_map_used, chunk->map_used); + max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc); - return max_map_used; + return max_nr_alloc; } /* * Prints out chunk state. Fragmentation is considered between * the beginning of the chunk to the last allocation. + * + * All statistics are in bytes unless stated otherwise. */ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, int *buffer) { - int i, s_index, e_index, last_alloc, alloc_sign, as_len; + int i, last_alloc, as_len, start, end; int *alloc_sizes, *p; /* statistics */ int sum_frag = 0, max_frag = 0; int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0; alloc_sizes = buffer; - s_index = (chunk->start_offset) ? 1 : 0; - e_index = chunk->map_used - ((chunk->end_offset) ? 1 : 0); - /* find last allocation */ - last_alloc = -1; - for (i = e_index - 1; i >= s_index; i--) { - if (chunk->map[i] & 1) { - last_alloc = i; - break; + /* + * find_last_bit returns the start value if nothing found. + * Therefore, we must determine if it is a failure of find_last_bit + * and set the appropriate value. + */ + last_alloc = find_last_bit(chunk->alloc_map, + pcpu_chunk_map_bits(chunk) - + chunk->end_offset / PCPU_MIN_ALLOC_SIZE - 1); + last_alloc = test_bit(last_alloc, chunk->alloc_map) ? + last_alloc + 1 : 0; + + as_len = 0; + start = chunk->start_offset; + + /* + * If a bit is set in the allocation map, the bound_map identifies + * where the allocation ends. If the allocation is not set, the + * bound_map does not identify free areas as it is only kept accurate + * on allocation, not free. + * + * Positive values are allocations and negative values are free + * fragments. + */ + while (start < last_alloc) { + if (test_bit(start, chunk->alloc_map)) { + end = find_next_bit(chunk->bound_map, last_alloc, + start + 1); + alloc_sizes[as_len] = 1; + } else { + end = find_next_bit(chunk->alloc_map, last_alloc, + start + 1); + alloc_sizes[as_len] = -1; } + + alloc_sizes[as_len++] *= (end - start) * PCPU_MIN_ALLOC_SIZE; + + start = end; } - /* if the chunk is not empty - ignoring reserve */ - if (last_alloc >= s_index) { - as_len = last_alloc + 1 - s_index; + /* + * The negative values are free fragments and thus sorting gives the + * free fragments at the beginning in largest first order. 
+ */ + if (as_len > 0) { + sort(alloc_sizes, as_len, sizeof(int), cmpint, NULL); - /* - * Iterate through chunk map computing size info. - * The first bit is overloaded to be a used flag. - * negative = free space, positive = allocated - */ - for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) { - alloc_sign = (*p & 1) ? 1 : -1; - alloc_sizes[i] = alloc_sign * - ((p[1] & ~1) - (p[0] & ~1)); - } - - sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL); - - /* Iterate through the unallocated fragements. */ + /* iterate through the unallocated fragments */ for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) { sum_frag -= *p; max_frag = max(max_frag, -1 * (*p)); @@ -101,8 +121,8 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, P("nr_alloc", chunk->nr_alloc); P("max_alloc_size", chunk->max_alloc_size); P("empty_pop_pages", chunk->nr_empty_pop_pages); - P("free_size", chunk->free_size); - P("contig_hint", chunk->contig_hint); + P("free_bytes", chunk->free_bytes); + P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE); P("sum_frag", sum_frag); P("max_frag", max_frag); P("cur_min_alloc", cur_min_alloc); @@ -114,22 +134,23 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, static int percpu_stats_show(struct seq_file *m, void *v) { struct pcpu_chunk *chunk; - int slot, max_map_used; + int slot, max_nr_alloc; int *buffer; alloc_buffer: spin_lock_irq(&pcpu_lock); - max_map_used = find_max_map_used(); + max_nr_alloc = find_max_nr_alloc(); spin_unlock_irq(&pcpu_lock); - buffer = vmalloc(max_map_used * sizeof(pcpu_first_chunk->map[0])); + /* there can be at most this many free and allocated fragments */ + buffer = vmalloc((2 * max_nr_alloc + 1) * sizeof(int)); if (!buffer) return -ENOMEM; spin_lock_irq(&pcpu_lock); /* if the buffer allocated earlier is too small */ - if (max_map_used < find_max_map_used()) { + if (max_nr_alloc < find_max_nr_alloc()) { spin_unlock_irq(&pcpu_lock); vfree(buffer); goto alloc_buffer; diff --git a/mm/percpu.c b/mm/percpu.c index 84cc2559d4aa..986d900e6680 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -86,10 +86,9 @@ #include "percpu-internal.h" -#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ -#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ -#define PCPU_ATOMIC_MAP_MARGIN_LOW 32 -#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 +/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */ +#define PCPU_SLOT_BASE_SHIFT 5 + #define PCPU_EMPTY_POP_PAGES_LOW 2 #define PCPU_EMPTY_POP_PAGES_HIGH 4 @@ -218,10 +217,10 @@ static int pcpu_size_to_slot(int size) static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { - if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) + if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0) return 0; - return pcpu_size_to_slot(chunk->free_size); + return pcpu_size_to_slot(chunk->free_bytes); } /* set the pointer to a chunk in a page struct */ @@ -316,38 +315,6 @@ static void pcpu_mem_free(void *ptr) kvfree(ptr); } -/** - * pcpu_count_occupied_pages - count the number of pages an area occupies - * @chunk: chunk of interest - * @i: index of the area in question - * - * Count the number of pages chunk's @i'th area occupies. When the area's - * start and/or end address isn't aligned to page boundary, the straddled - * page is included in the count iff the rest of the page is free. 
- */ -static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) -{ - int off = chunk->map[i] & ~1; - int end = chunk->map[i + 1] & ~1; - - if (!PAGE_ALIGNED(off) && i > 0) { - int prev = chunk->map[i - 1]; - - if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) - off = round_down(off, PAGE_SIZE); - } - - if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { - int next = chunk->map[i + 1]; - int nend = chunk->map[i + 2] & ~1; - - if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) - end = round_up(end, PAGE_SIZE); - } - - return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); -} - /** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest @@ -374,358 +341,270 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) } /** - * pcpu_need_to_extend - determine whether chunk area map needs to be extended + * pcpu_cnt_pop_pages- counts populated backing pages in range * @chunk: chunk of interest - * @is_atomic: the allocation context + * @bit_off: start offset + * @bits: size of area to check * - * Determine whether area map of @chunk needs to be extended. If - * @is_atomic, only the amount necessary for a new allocation is - * considered; however, async extension is scheduled if the left amount is - * low. If !@is_atomic, it aims for more empty space. Combined, this - * ensures that the map is likely to have enough available space to - * accomodate atomic allocations which can't extend maps directly. - * - * CONTEXT: - * pcpu_lock. + * Calculates the number of populated pages in the region + * [page_start, page_end). This keeps track of how many empty populated + * pages are available and decide if async work should be scheduled. * * RETURNS: - * New target map allocation length if extension is necessary, 0 - * otherwise. + * The nr of populated pages. */ -static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) +static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off, + int bits) { - int margin, new_alloc; + int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE); + int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); + + if (page_start >= page_end) + return 0; + + /* + * bitmap_weight counts the number of bits set in a bitmap up to + * the specified number of bits. This is counting the populated + * pages up to page_end and then subtracting the populated pages + * up to page_start to count the populated pages in + * [page_start, page_end). + */ + return bitmap_weight(chunk->populated, page_end) - + bitmap_weight(chunk->populated, page_start); +} + +/** + * pcpu_chunk_update - updates the chunk metadata given a free area + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of free area + * + * This updates the chunk's contig hint given a free area. + */ +static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits) +{ + if (bits > chunk->contig_bits) + chunk->contig_bits = bits; +} + +/** + * pcpu_chunk_refresh_hint - updates metadata about a chunk + * @chunk: chunk of interest + * + * Iterates over the chunk to find the largest free area. 
+ * + * Updates: + * chunk->contig_bits + * nr_empty_pop_pages + */ +static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) +{ + int bits, nr_empty_pop_pages; + int rs, re; /* region start, region end */ + + /* clear metadata */ + chunk->contig_bits = 0; + + bits = nr_empty_pop_pages = 0; + pcpu_for_each_unpop_region(chunk->alloc_map, rs, re, 0, + pcpu_chunk_map_bits(chunk)) { + bits = re - rs; + + pcpu_chunk_update(chunk, rs, bits); + + nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, rs, bits); + } + + /* + * Keep track of nr_empty_pop_pages. + * + * The chunk maintains the previous number of free pages it held, + * so the delta is used to update the global counter. The reserved + * chunk is not part of the free page count as they are populated + * at init and are special to serving reserved allocations. + */ + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages += + (nr_empty_pop_pages - chunk->nr_empty_pop_pages); + + chunk->nr_empty_pop_pages = nr_empty_pop_pages; +} + +/** + * pcpu_is_populated - determines if the region is populated + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of area + * @next_off: return value for the next offset to start searching + * + * For atomic allocations, check if the backing pages are populated. + * + * RETURNS: + * Bool if the backing pages are populated. + * next_index is to skip over unpopulated blocks in pcpu_find_block_fit. + */ +static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, + int *next_off) +{ + int page_start, page_end, rs, re; + + page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); + page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); + + rs = page_start; + pcpu_next_unpop(chunk->populated, &rs, &re, page_end); + if (rs >= page_end) + return true; + + *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; + return false; +} + +/** + * pcpu_find_block_fit - finds the block index to start searching + * @chunk: chunk of interest + * @alloc_bits: size of request in allocation units + * @align: alignment of area (max PAGE_SIZE bytes) + * @pop_only: use populated regions only + * + * RETURNS: + * The offset in the bitmap to begin searching. + * -1 if no offset is found. + */ +static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, + size_t align, bool pop_only) +{ + int bit_off, bits; + int re; /* region end */ + + pcpu_for_each_unpop_region(chunk->alloc_map, bit_off, re, 0, + pcpu_chunk_map_bits(chunk)) { + bits = re - bit_off; + + /* check alignment */ + bits -= ALIGN(bit_off, align) - bit_off; + bit_off = ALIGN(bit_off, align); + if (bits < alloc_bits) + continue; + + bits = alloc_bits; + if (!pop_only || pcpu_is_populated(chunk, bit_off, bits, + &bit_off)) + break; + + bits = 0; + } + + if (bit_off == pcpu_chunk_map_bits(chunk)) + return -1; + + return bit_off; +} + +/** + * pcpu_alloc_area - allocates an area from a pcpu_chunk + * @chunk: chunk of interest + * @alloc_bits: size of request in allocation units + * @align: alignment of area (max PAGE_SIZE) + * @start: bit_off to start searching + * + * This function takes in a @start offset to begin searching to fit an + * allocation of @alloc_bits with alignment @align. If it confirms a + * valid free area, it then updates the allocation and boundary maps + * accordingly. + * + * RETURNS: + * Allocated addr offset in @chunk on success. + * -1 if no matching area is found. + */ +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, + size_t align, int start) +{ + size_t align_mask = (align) ? 
(align - 1) : 0; + int bit_off, end, oslot; lockdep_assert_held(&pcpu_lock); - if (is_atomic) { - margin = 3; - - if (chunk->map_alloc < - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) { - if (list_empty(&chunk->map_extend_list)) { - list_add_tail(&chunk->map_extend_list, - &pcpu_map_extend_chunks); - pcpu_schedule_balance_work(); - } - } - } else { - margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; - } - - if (chunk->map_alloc >= chunk->map_used + margin) - return 0; - - new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + margin) - new_alloc *= 2; - - return new_alloc; -} - -/** - * pcpu_extend_area_map - extend area map of a chunk - * @chunk: chunk of interest - * @new_alloc: new target allocation length of the area map - * - * Extend area map of @chunk to have @new_alloc entries. - * - * CONTEXT: - * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) -{ - int *old = NULL, *new = NULL; - size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); - unsigned long flags; - - lockdep_assert_held(&pcpu_alloc_mutex); - - new = pcpu_mem_zalloc(new_size); - if (!new) - return -ENOMEM; - - /* acquire pcpu_lock and switch to new area map */ - spin_lock_irqsave(&pcpu_lock, flags); - - if (new_alloc <= chunk->map_alloc) - goto out_unlock; - - old_size = chunk->map_alloc * sizeof(chunk->map[0]); - old = chunk->map; - - memcpy(new, old, old_size); - - chunk->map_alloc = new_alloc; - chunk->map = new; - new = NULL; - -out_unlock: - spin_unlock_irqrestore(&pcpu_lock, flags); + oslot = pcpu_chunk_slot(chunk); /* - * pcpu_mem_free() might end up calling vfree() which uses - * IRQ-unsafe lock and thus can't be called under pcpu_lock. + * Search to find a fit. */ - pcpu_mem_free(old); - pcpu_mem_free(new); + end = start + alloc_bits; + bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start, + alloc_bits, align_mask); + if (bit_off >= end) + return -1; - return 0; -} + /* update alloc map */ + bitmap_set(chunk->alloc_map, bit_off, alloc_bits); -/** - * pcpu_fit_in_area - try to fit the requested allocation in a candidate area - * @chunk: chunk the candidate area belongs to - * @off: the offset to the start of the candidate area - * @this_size: the size of the candidate area - * @size: the size of the target allocation - * @align: the alignment of the target allocation - * @pop_only: only allocate from already populated region - * - * We're trying to allocate @size bytes aligned at @align. @chunk's area - * at @off sized @this_size is a candidate. This function determines - * whether the target allocation fits in the candidate area and returns the - * number of bytes to pad after @off. If the target area doesn't fit, -1 - * is returned. - * - * If @pop_only is %true, this function only considers the already - * populated part of the candidate area. 
- */ -static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, - int size, int align, bool pop_only) -{ - int cand_off = off; + /* update boundary map */ + set_bit(bit_off, chunk->bound_map); + bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1); + set_bit(bit_off + alloc_bits, chunk->bound_map); - while (true) { - int head = ALIGN(cand_off, align) - off; - int page_start, page_end, rs, re; + chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE; - if (this_size < head + size) - return -1; + pcpu_chunk_refresh_hint(chunk); - if (!pop_only) - return head; - - /* - * If the first unpopulated page is beyond the end of the - * allocation, the whole allocation is populated; - * otherwise, retry from the end of the unpopulated area. - */ - page_start = PFN_DOWN(head + off); - page_end = PFN_UP(head + off + size); - - rs = page_start; - pcpu_next_unpop(chunk->populated, &rs, &re, - PFN_UP(off + this_size)); - if (rs >= page_end) - return head; - cand_off = re * PAGE_SIZE; - } -} - -/** - * pcpu_alloc_area - allocate area from a pcpu_chunk - * @chunk: chunk of interest - * @size: wanted size in bytes - * @align: wanted align - * @pop_only: allocate only from the populated area - * @occ_pages_p: out param for the number of pages the area occupies - * - * Try to allocate @size bytes area aligned at @align from @chunk. - * Note that this function only allocates the offset. It doesn't - * populate or map the area. - * - * @chunk->map must have at least two free slots. - * - * CONTEXT: - * pcpu_lock. - * - * RETURNS: - * Allocated offset in @chunk on success, -1 if no matching area is - * found. - */ -static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, - bool pop_only, int *occ_pages_p) -{ - int oslot = pcpu_chunk_slot(chunk); - int max_contig = 0; - int i, off; - bool seen_free = false; - int *p; - - for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) { - int head, tail; - int this_size; - - off = *p; - if (off & 1) - continue; - - this_size = (p[1] & ~1) - off; - - head = pcpu_fit_in_area(chunk, off, this_size, size, align, - pop_only); - if (head < 0) { - if (!seen_free) { - chunk->first_free = i; - seen_free = true; - } - max_contig = max(this_size, max_contig); - continue; - } - - /* - * If head is small or the previous block is free, - * merge'em. Note that 'small' is defined as smaller - * than sizeof(int), which is very small but isn't too - * uncommon for percpu allocations. 
- */ - if (head && (head < sizeof(int) || !(p[-1] & 1))) { - *p = off += head; - if (p[-1] & 1) - chunk->free_size -= head; - else - max_contig = max(*p - p[-1], max_contig); - this_size -= head; - head = 0; - } - - /* if tail is small, just keep it around */ - tail = this_size - head - size; - if (tail < sizeof(int)) { - tail = 0; - size = this_size - head; - } - - /* split if warranted */ - if (head || tail) { - int nr_extra = !!head + !!tail; - - /* insert new subblocks */ - memmove(p + nr_extra + 1, p + 1, - sizeof(chunk->map[0]) * (chunk->map_used - i)); - chunk->map_used += nr_extra; - - if (head) { - if (!seen_free) { - chunk->first_free = i; - seen_free = true; - } - *++p = off += head; - ++i; - max_contig = max(head, max_contig); - } - if (tail) { - p[1] = off + size; - max_contig = max(tail, max_contig); - } - } - - if (!seen_free) - chunk->first_free = i + 1; - - /* update hint and mark allocated */ - if (i + 1 == chunk->map_used) - chunk->contig_hint = max_contig; /* fully scanned */ - else - chunk->contig_hint = max(chunk->contig_hint, - max_contig); - - chunk->free_size -= size; - *p |= 1; - - *occ_pages_p = pcpu_count_occupied_pages(chunk, i); - pcpu_chunk_relocate(chunk, oslot); - return off; - } - - chunk->contig_hint = max_contig; /* fully scanned */ pcpu_chunk_relocate(chunk, oslot); - /* tell the upper layer that this chunk has no matching area */ - return -1; + return bit_off * PCPU_MIN_ALLOC_SIZE; } /** - * pcpu_free_area - free area to a pcpu_chunk + * pcpu_free_area - frees the corresponding offset * @chunk: chunk of interest - * @freeme: offset of area to free - * @occ_pages_p: out param for the number of pages the area occupies + * @off: addr offset into chunk * - * Free area starting from @freeme to @chunk. Note that this function - * only modifies the allocation map. It doesn't depopulate or unmap - * the area. - * - * CONTEXT: - * pcpu_lock. + * This function determines the size of an allocation to free using + * the boundary bitmap and clears the allocation map. */ -static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, - int *occ_pages_p) +static void pcpu_free_area(struct pcpu_chunk *chunk, int off) { - int oslot = pcpu_chunk_slot(chunk); - int off = 0; - unsigned i, j; - int to_free = 0; - int *p; + int bit_off, bits, end, oslot; lockdep_assert_held(&pcpu_lock); pcpu_stats_area_dealloc(chunk); - freeme |= 1; /* we are searching for pair */ + oslot = pcpu_chunk_slot(chunk); - i = 0; - j = chunk->map_used; - while (i != j) { - unsigned k = (i + j) / 2; - off = chunk->map[k]; - if (off < freeme) - i = k + 1; - else if (off > freeme) - j = k; - else - i = j = k; - } - BUG_ON(off != freeme); + bit_off = off / PCPU_MIN_ALLOC_SIZE; - if (i < chunk->first_free) - chunk->first_free = i; + /* find end index */ + end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), + bit_off + 1); + bits = end - bit_off; + bitmap_clear(chunk->alloc_map, bit_off, bits); - p = chunk->map + i; - *p = off &= ~1; - chunk->free_size += (p[1] & ~1) - off; + /* update metadata */ + chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE; - *occ_pages_p = pcpu_count_occupied_pages(chunk, i); + pcpu_chunk_refresh_hint(chunk); - /* merge with next? */ - if (!(p[1] & 1)) - to_free++; - /* merge with previous? 
*/ - if (i > 0 && !(p[-1] & 1)) { - to_free++; - i--; - p--; - } - if (to_free) { - chunk->map_used -= to_free; - memmove(p + 1, p + 1 + to_free, - (chunk->map_used - i) * sizeof(chunk->map[0])); - } - - chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint); pcpu_chunk_relocate(chunk, oslot); } +/** + * pcpu_alloc_first_chunk - creates chunks that serve the first chunk + * @tmp_addr: the start of the region served + * @map_size: size of the region served + * + * This is responsible for creating the chunks that serve the first chunk. The + * base_addr is page aligned down of @tmp_addr while the region end is page + * aligned up. Offsets are kept track of to determine the region served. All + * this is done to appease the bitmap allocator in avoiding partial blocks. + * + * RETURNS: + * Chunk serving the region at @tmp_addr of @map_size. + */ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, - int map_size, - int *map, - int init_map_size) + int map_size) { struct pcpu_chunk *chunk; unsigned long aligned_addr; - int start_offset, region_size; + int start_offset, offset_bits, region_size, region_bits; /* region calculations */ aligned_addr = tmp_addr & PAGE_MASK; @@ -740,83 +619,99 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, 0); INIT_LIST_HEAD(&chunk->list); - INIT_LIST_HEAD(&chunk->map_extend_list); chunk->base_addr = (void *)aligned_addr; chunk->start_offset = start_offset; chunk->end_offset = region_size - chunk->start_offset - map_size; chunk->nr_pages = region_size >> PAGE_SHIFT; + region_bits = pcpu_chunk_map_bits(chunk); - chunk->map = map; - chunk->map_alloc = init_map_size; + chunk->alloc_map = memblock_virt_alloc( + BITS_TO_LONGS(region_bits) * + sizeof(chunk->alloc_map[0]), 0); + chunk->bound_map = memblock_virt_alloc( + BITS_TO_LONGS(region_bits + 1) * + sizeof(chunk->bound_map[0]), 0); /* manage populated page bitmap */ chunk->immutable = true; bitmap_fill(chunk->populated, chunk->nr_pages); chunk->nr_populated = chunk->nr_pages; - chunk->nr_empty_pop_pages = chunk->nr_pages; + chunk->nr_empty_pop_pages = + pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE, + map_size / PCPU_MIN_ALLOC_SIZE); - chunk->contig_hint = chunk->free_size = map_size; + chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE; + chunk->free_bytes = map_size; if (chunk->start_offset) { /* hide the beginning of the bitmap */ - chunk->nr_empty_pop_pages--; - - chunk->map[0] = 1; - chunk->map[1] = chunk->start_offset; - chunk->map_used = 1; + offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE; + bitmap_set(chunk->alloc_map, 0, offset_bits); + set_bit(0, chunk->bound_map); + set_bit(offset_bits, chunk->bound_map); } - /* set chunk's free region */ - chunk->map[++chunk->map_used] = - (chunk->start_offset + chunk->free_size) | 1; - if (chunk->end_offset) { /* hide the end of the bitmap */ - chunk->nr_empty_pop_pages--; - - chunk->map[++chunk->map_used] = region_size | 1; + offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE; + bitmap_set(chunk->alloc_map, + pcpu_chunk_map_bits(chunk) - offset_bits, + offset_bits); + set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE, + chunk->bound_map); + set_bit(region_bits, chunk->bound_map); } + pcpu_chunk_refresh_hint(chunk); + return chunk; } static struct pcpu_chunk *pcpu_alloc_chunk(void) { struct pcpu_chunk *chunk; + int region_bits; chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size); if (!chunk) return NULL; - chunk->map = 
pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * - sizeof(chunk->map[0])); - if (!chunk->map) { - pcpu_mem_free(chunk); - return NULL; - } - - chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[0] = 0; - chunk->map[1] = pcpu_unit_size | 1; - chunk->map_used = 1; - INIT_LIST_HEAD(&chunk->list); - INIT_LIST_HEAD(&chunk->map_extend_list); - chunk->free_size = pcpu_unit_size; - chunk->contig_hint = pcpu_unit_size; - chunk->nr_pages = pcpu_unit_pages; + region_bits = pcpu_chunk_map_bits(chunk); + + chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) * + sizeof(chunk->alloc_map[0])); + if (!chunk->alloc_map) + goto alloc_map_fail; + + chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) * + sizeof(chunk->bound_map[0])); + if (!chunk->bound_map) + goto bound_map_fail; + + /* init metadata */ + chunk->contig_bits = region_bits; + chunk->free_bytes = chunk->nr_pages * PAGE_SIZE; return chunk; + +bound_map_fail: + pcpu_mem_free(chunk->alloc_map); +alloc_map_fail: + pcpu_mem_free(chunk); + + return NULL; } static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; - pcpu_mem_free(chunk->map); + pcpu_mem_free(chunk->bound_map); + pcpu_mem_free(chunk->alloc_map); pcpu_mem_free(chunk); } @@ -825,13 +720,17 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) * @chunk: pcpu_chunk which got populated * @page_start: the start page * @page_end: the end page + * @for_alloc: if this is to populate for allocation * * Pages in [@page_start,@page_end) have been populated to @chunk. Update * the bookkeeping information accordingly. Must be called after each * successful population. + * + * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it + * is to serve an allocation in that area. */ -static void pcpu_chunk_populated(struct pcpu_chunk *chunk, - int page_start, int page_end) +static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, + int page_end, bool for_alloc) { int nr = page_end - page_start; @@ -839,8 +738,11 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, bitmap_set(chunk->populated, page_start, nr); chunk->nr_populated += nr; - chunk->nr_empty_pop_pages += nr; - pcpu_nr_empty_pop_pages += nr; + + if (!for_alloc) { + chunk->nr_empty_pop_pages += nr; + pcpu_nr_empty_pop_pages += nr; + } } /** @@ -945,19 +847,23 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, struct pcpu_chunk *chunk; const char *err; bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; - int occ_pages = 0; - int slot, off, new_alloc, cpu, ret; + int slot, off, cpu, ret; unsigned long flags; void __percpu *ptr; + size_t bits, bit_align; /* - * We want the lowest bit of offset available for in-use/free - * indicator, so force >= 16bit alignment and make size even. + * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE, + * therefore alignment must be a minimum of that many bytes. + * An allocation may have internal fragmentation from rounding up + * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes. 
*/ if (unlikely(align < PCPU_MIN_ALLOC_SIZE)) align = PCPU_MIN_ALLOC_SIZE; size = ALIGN(size, PCPU_MIN_ALLOC_SIZE); + bits = size >> PCPU_MIN_ALLOC_SHIFT; + bit_align = align >> PCPU_MIN_ALLOC_SHIFT; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || !is_power_of_2(align))) { @@ -975,23 +881,13 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; - if (size > chunk->contig_hint) { + off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); + if (off < 0) { err = "alloc from reserved chunk failed"; goto fail_unlock; } - while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { - spin_unlock_irqrestore(&pcpu_lock, flags); - if (is_atomic || - pcpu_extend_area_map(chunk, new_alloc) < 0) { - err = "failed to extend area map of reserved chunk"; - goto fail; - } - spin_lock_irqsave(&pcpu_lock, flags); - } - - off = pcpu_alloc_area(chunk, size, align, is_atomic, - &occ_pages); + off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found; @@ -1003,31 +899,15 @@ restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { list_for_each_entry(chunk, &pcpu_slot[slot], list) { - if (size > chunk->contig_hint) + off = pcpu_find_block_fit(chunk, bits, bit_align, + is_atomic); + if (off < 0) continue; - new_alloc = pcpu_need_to_extend(chunk, is_atomic); - if (new_alloc) { - if (is_atomic) - continue; - spin_unlock_irqrestore(&pcpu_lock, flags); - if (pcpu_extend_area_map(chunk, - new_alloc) < 0) { - err = "failed to extend area map"; - goto fail; - } - spin_lock_irqsave(&pcpu_lock, flags); - /* - * pcpu_lock has been dropped, need to - * restart cpu_slot list walking. - */ - goto restart; - } - - off = pcpu_alloc_area(chunk, size, align, is_atomic, - &occ_pages); + off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found; + } } @@ -1077,23 +957,17 @@ area_found: spin_lock_irqsave(&pcpu_lock, flags); if (ret) { - pcpu_free_area(chunk, off, &occ_pages); + pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } - pcpu_chunk_populated(chunk, rs, re); + pcpu_chunk_populated(chunk, rs, re, true); spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); } - if (chunk != pcpu_reserved_chunk) { - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_nr_empty_pop_pages -= occ_pages; - spin_unlock_irqrestore(&pcpu_lock, flags); - } - if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); @@ -1211,7 +1085,6 @@ static void pcpu_balance_workfn(struct work_struct *work) if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; - list_del_init(&chunk->map_extend_list); list_move(&chunk->list, &to_free); } @@ -1230,25 +1103,6 @@ static void pcpu_balance_workfn(struct work_struct *work) pcpu_destroy_chunk(chunk); } - /* service chunks which requested async area map extension */ - do { - int new_alloc = 0; - - spin_lock_irq(&pcpu_lock); - - chunk = list_first_entry_or_null(&pcpu_map_extend_chunks, - struct pcpu_chunk, map_extend_list); - if (chunk) { - list_del_init(&chunk->map_extend_list); - new_alloc = pcpu_need_to_extend(chunk, false); - } - - spin_unlock_irq(&pcpu_lock); - - if (new_alloc) - pcpu_extend_area_map(chunk, new_alloc); - } while (chunk); - /* * Ensure there are certain number of free populated pages for * atomic allocs. 
Fill up from the most packed so that atomic @@ -1296,7 +1150,7 @@ retry_pop: if (!ret) { nr_to_pop -= nr; spin_lock_irq(&pcpu_lock); - pcpu_chunk_populated(chunk, rs, rs + nr); + pcpu_chunk_populated(chunk, rs, rs + nr, false); spin_unlock_irq(&pcpu_lock); } else { nr_to_pop = 0; @@ -1335,7 +1189,7 @@ void free_percpu(void __percpu *ptr) void *addr; struct pcpu_chunk *chunk; unsigned long flags; - int off, occ_pages; + int off; if (!ptr) return; @@ -1349,13 +1203,10 @@ void free_percpu(void __percpu *ptr) chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; - pcpu_free_area(chunk, off, &occ_pages); - - if (chunk != pcpu_reserved_chunk) - pcpu_nr_empty_pop_pages += occ_pages; + pcpu_free_area(chunk, off); /* if there are more than one fully free chunks, wake up grim reaper */ - if (chunk->free_size == pcpu_unit_size) { + if (chunk->free_bytes == pcpu_unit_size) { struct pcpu_chunk *pos; list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) @@ -1651,8 +1502,6 @@ static void pcpu_dump_alloc_info(const char *lvl, int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { - static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; - static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; size_t static_size, dyn_size; struct pcpu_chunk *chunk; @@ -1787,8 +1636,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ tmp_addr = (unsigned long)base_addr + static_size; map_size = ai->reserved_size ?: dyn_size; - chunk = pcpu_alloc_first_chunk(tmp_addr, map_size, smap, - ARRAY_SIZE(smap)); + chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); /* init dynamic chunk if necessary */ if (ai->reserved_size) { @@ -1797,8 +1645,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size; map_size = dyn_size; - chunk = pcpu_alloc_first_chunk(tmp_addr, map_size, dmap, - ARRAY_SIZE(dmap)); + chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); } /* link the first chunk in */ @@ -2374,36 +2221,6 @@ void __init setup_per_cpu_areas(void) #endif /* CONFIG_SMP */ -/* - * First and reserved chunks are initialized with temporary allocation - * map in initdata so that they can be used before slab is online. - * This function is called after slab is brought up and replaces those - * with properly allocated maps. - */ -void __init percpu_init_late(void) -{ - struct pcpu_chunk *target_chunks[] = - { pcpu_first_chunk, pcpu_reserved_chunk, NULL }; - struct pcpu_chunk *chunk; - unsigned long flags; - int i; - - for (i = 0; (chunk = target_chunks[i]); i++) { - int *map; - const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]); - - BUILD_BUG_ON(size > PAGE_SIZE); - - map = pcpu_mem_zalloc(size); - BUG_ON(!map); - - spin_lock_irqsave(&pcpu_lock, flags); - memcpy(map, chunk->map, size); - chunk->map = map; - spin_unlock_irqrestore(&pcpu_lock, flags); - } -} - /* * Percpu allocator is initialized early during boot when neither slab or * workqueue is available. Plug async management until everything is up From ca460b3c96274d79f84b31a3fea23a6eed479917 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:12 -0400 Subject: [PATCH 19/27] percpu: introduce bitmap metadata blocks This patch introduces the bitmap metadata blocks and adds the skeleton of the code that will be used to maintain these blocks. Each chunk's bitmap is made up of full metadata blocks. 
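A stand-alone model of what one block tracks may help before reading the diff (user-space C with a toy block size; the field names mirror the pcpu_block_md introduced below, but this is not the kernel code):

  #include <stdio.h>

  #define BLOCK_BITS 16   /* stand-in for PCPU_BITMAP_BLOCK_BITS */

  struct block_md {
          int contig_hint;        /* largest free run in the block */
          int contig_hint_start;  /* where that run starts */
          int left_free;          /* free run touching the left edge */
          int right_free;         /* free run touching the right edge */
          int first_free;         /* first free unit in the block */
  };

  /* full scan of one block's slice of the allocation map */
  static void refresh_block(const unsigned char *alloc_map, struct block_md *b)
  {
          int i = 0;

          b->contig_hint = b->left_free = b->right_free = 0;
          b->contig_hint_start = 0;
          b->first_free = BLOCK_BITS;

          while (i < BLOCK_BITS) {
                  int start = i;

                  if (alloc_map[i]) {     /* skip an allocated run */
                          while (i < BLOCK_BITS && alloc_map[i])
                                  i++;
                          continue;
                  }
                  while (i < BLOCK_BITS && !alloc_map[i])  /* free run [start, i) */
                          i++;

                  if (b->first_free == BLOCK_BITS)
                          b->first_free = start;
                  if (start == 0)
                          b->left_free = i - start;
                  if (i == BLOCK_BITS)
                          b->right_free = i - start;
                  if (i - start > b->contig_hint) {
                          b->contig_hint = i - start;
                          b->contig_hint_start = start;
                  }
          }
  }

  int main(void)
  {
          /* 16 units: 4-6 and 12-13 allocated, the rest free */
          unsigned char map[BLOCK_BITS] = { [4] = 1, [5] = 1, [6] = 1,
                                            [12] = 1, [13] = 1 };
          struct block_md b;

          refresh_block(map, &b);
          /* prints: contig=5@7 left=4 right=2 first=0 */
          printf("contig=%d@%d left=%d right=%d first=%d\n",
                 b.contig_hint, b.contig_hint_start,
                 b.left_free, b.right_free, b.first_free);
          return 0;
  }

Keeping one such record per block bounds how much of the allocation map has to be rescanned when hints change; the skeleton below still recomputes it by full scans, which later patches replace with incremental updates.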
These blocks maintain basic metadata to help prevent scanning unnecssarily to update hints. Full scanning methods are used for the skeleton and will be replaced in the coming patches. A number of helper functions are added as well to do conversion of pages to blocks and manage offsets. Comments will be updated as the final version of each function is added. There exists a relationship between PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE, the region size, and unit_size. Every chunk's region (including offsets) is page aligned at the beginning to preserve alignment. The end is aligned to LCM(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE) to ensure that the end can fit with the populated page map which is by page and every metadata block is fully accounted for. The unit_size is already page aligned, but must also be aligned with PCPU_BITMAP_BLOCK_SIZE to ensure full metadata blocks. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- include/linux/percpu.h | 12 +++ mm/percpu-internal.h | 29 ++++++ mm/percpu.c | 228 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 257 insertions(+), 12 deletions(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index b7e6c98722d1..31795e619273 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -25,6 +25,18 @@ #define PCPU_MIN_ALLOC_SHIFT 2 #define PCPU_MIN_ALLOC_SIZE (1 << PCPU_MIN_ALLOC_SHIFT) +/* + * This determines the size of each metadata block. There are several subtle + * constraints around this constant. The reserved region must be a multiple of + * PCPU_BITMAP_BLOCK_SIZE. Additionally, PCPU_BITMAP_BLOCK_SIZE must be a + * multiple of PAGE_SIZE or PAGE_SIZE must be a multiple of + * PCPU_BITMAP_BLOCK_SIZE to align with the populated page map. The unit_size + * also has to be a multiple of PCPU_BITMAP_BLOCK_SIZE to ensure full blocks. + */ +#define PCPU_BITMAP_BLOCK_SIZE PAGE_SIZE +#define PCPU_BITMAP_BLOCK_BITS (PCPU_BITMAP_BLOCK_SIZE >> \ + PCPU_MIN_ALLOC_SHIFT) + /* * Percpu allocator can serve percpu allocations before slab is * initialized which allows slab to depend on the percpu allocator. diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 2e9d9bcb6fa2..252ae9e960e0 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -4,6 +4,22 @@ #include #include +/* + * pcpu_block_md is the metadata block struct. + * Each chunk's bitmap is split into a number of full blocks. + * All units are in terms of bits. + */ +struct pcpu_block_md { + int contig_hint; /* contig hint for block */ + int contig_hint_start; /* block relative starting + position of the contig hint */ + int left_free; /* size of free space along + the left side of the block */ + int right_free; /* size of free space along + the right side of the block */ + int first_free; /* block position of first free */ +}; + struct pcpu_chunk { #ifdef CONFIG_PERCPU_STATS int nr_alloc; /* # of allocations */ @@ -17,6 +33,7 @@ struct pcpu_chunk { unsigned long *alloc_map; /* allocation map */ unsigned long *bound_map; /* boundary map */ + struct pcpu_block_md *md_blocks; /* metadata blocks */ void *data; /* chunk data */ int first_free; /* no free below this */ @@ -43,6 +60,18 @@ extern int pcpu_nr_empty_pop_pages; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; +/** + * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks + * @chunk: chunk of interest + * + * This conversion is from the number of physical pages that the chunk + * serves to the number of bitmap blocks used. 
+ */ +static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk) +{ + return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE; +} + /** * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap * @pages: number of physical pages diff --git a/mm/percpu.c b/mm/percpu.c index 986d900e6680..708c6de237b9 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -279,6 +280,26 @@ static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end) (rs) < (re); \ (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end))) +/* + * The following are helper functions to help access bitmaps and convert + * between bitmap offsets to address offsets. + */ +static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index) +{ + return chunk->alloc_map + + (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG); +} + +static unsigned long pcpu_off_to_block_index(int off) +{ + return off / PCPU_BITMAP_BLOCK_BITS; +} + +static unsigned long pcpu_off_to_block_off(int off) +{ + return off & (PCPU_BITMAP_BLOCK_BITS - 1); +} + /** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate @@ -430,6 +451,154 @@ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) chunk->nr_empty_pop_pages = nr_empty_pop_pages; } +/** + * pcpu_block_update - updates a block given a free area + * @block: block of interest + * @start: start offset in block + * @end: end offset in block + * + * Updates a block given a known free area. The region [start, end) is + * expected to be the entirety of the free area within a block. + */ +static void pcpu_block_update(struct pcpu_block_md *block, int start, int end) +{ + int contig = end - start; + + block->first_free = min(block->first_free, start); + if (start == 0) + block->left_free = contig; + + if (end == PCPU_BITMAP_BLOCK_BITS) + block->right_free = contig; + + if (contig > block->contig_hint) { + block->contig_hint_start = start; + block->contig_hint = contig; + } +} + +/** + * pcpu_block_refresh_hint + * @chunk: chunk of interest + * @index: index of the metadata block + * + * Scans over the block beginning at first_free and updates the block + * metadata accordingly. + */ +static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) +{ + struct pcpu_block_md *block = chunk->md_blocks + index; + unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); + int rs, re; /* region start, region end */ + + /* clear hints */ + block->contig_hint = 0; + block->left_free = block->right_free = 0; + + /* iterate over free areas and update the contig hints */ + pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free, + PCPU_BITMAP_BLOCK_BITS) { + pcpu_block_update(block, rs, re); + } +} + +/** + * pcpu_block_update_hint_alloc - update hint on allocation path + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of request + */ +static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, + int bits) +{ + struct pcpu_block_md *s_block, *e_block, *block; + int s_index, e_index; /* block indexes of the freed allocation */ + int s_off, e_off; /* block offsets of the freed allocation */ + + /* + * Calculate per block offsets. + * The calculation uses an inclusive range, but the resulting offsets + * are [start, end). e_index always points to the last block in the + * range. 
+ */ + s_index = pcpu_off_to_block_index(bit_off); + e_index = pcpu_off_to_block_index(bit_off + bits - 1); + s_off = pcpu_off_to_block_off(bit_off); + e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; + + s_block = chunk->md_blocks + s_index; + e_block = chunk->md_blocks + e_index; + + /* + * Update s_block. + */ + pcpu_block_refresh_hint(chunk, s_index); + + /* + * Update e_block. + */ + if (s_index != e_index) { + pcpu_block_refresh_hint(chunk, e_index); + + /* update in-between md_blocks */ + for (block = s_block + 1; block < e_block; block++) { + block->contig_hint = 0; + block->left_free = 0; + block->right_free = 0; + } + } + + pcpu_chunk_refresh_hint(chunk); +} + +/** + * pcpu_block_update_hint_free - updates the block hints on the free path + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of request + */ +static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, + int bits) +{ + struct pcpu_block_md *s_block, *e_block, *block; + int s_index, e_index; /* block indexes of the freed allocation */ + int s_off, e_off; /* block offsets of the freed allocation */ + + /* + * Calculate per block offsets. + * The calculation uses an inclusive range, but the resulting offsets + * are [start, end). e_index always points to the last block in the + * range. + */ + s_index = pcpu_off_to_block_index(bit_off); + e_index = pcpu_off_to_block_index(bit_off + bits - 1); + s_off = pcpu_off_to_block_off(bit_off); + e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; + + s_block = chunk->md_blocks + s_index; + e_block = chunk->md_blocks + e_index; + + /* update s_block */ + pcpu_block_refresh_hint(chunk, s_index); + + /* freeing in the same block */ + if (s_index != e_index) { + /* update e_block */ + pcpu_block_refresh_hint(chunk, e_index); + + /* reset md_blocks in the middle */ + for (block = s_block + 1; block < e_block; block++) { + block->first_free = 0; + block->contig_hint_start = 0; + block->contig_hint = PCPU_BITMAP_BLOCK_BITS; + block->left_free = PCPU_BITMAP_BLOCK_BITS; + block->right_free = PCPU_BITMAP_BLOCK_BITS; + } + } + + pcpu_chunk_refresh_hint(chunk); +} + /** * pcpu_is_populated - determines if the region is populated * @chunk: chunk of interest @@ -546,7 +715,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE; - pcpu_chunk_refresh_hint(chunk); + pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits); pcpu_chunk_relocate(chunk, oslot); @@ -581,11 +750,24 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off) /* update metadata */ chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE; - pcpu_chunk_refresh_hint(chunk); + pcpu_block_update_hint_free(chunk, bit_off, bits); pcpu_chunk_relocate(chunk, oslot); } +static void pcpu_init_md_blocks(struct pcpu_chunk *chunk) +{ + struct pcpu_block_md *md_block; + + for (md_block = chunk->md_blocks; + md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk); + md_block++) { + md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS; + md_block->left_free = PCPU_BITMAP_BLOCK_BITS; + md_block->right_free = PCPU_BITMAP_BLOCK_BITS; + } +} + /** * pcpu_alloc_first_chunk - creates chunks that serve the first chunk * @tmp_addr: the start of the region served @@ -603,7 +785,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, int map_size) { struct pcpu_chunk *chunk; - unsigned long aligned_addr; + unsigned long aligned_addr, lcm_align; int start_offset, offset_bits, region_size, region_bits; /* 
region calculations */ @@ -611,7 +793,13 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, start_offset = tmp_addr - aligned_addr; - region_size = PFN_ALIGN(start_offset + map_size); + /* + * Align the end of the region with the LCM of PAGE_SIZE and + * PCPU_BITMAP_BLOCK_SIZE. One of these constants is a multiple of + * the other. + */ + lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE); + region_size = ALIGN(start_offset + map_size, lcm_align); /* allocate chunk */ chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) + @@ -627,12 +815,13 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, chunk->nr_pages = region_size >> PAGE_SHIFT; region_bits = pcpu_chunk_map_bits(chunk); - chunk->alloc_map = memblock_virt_alloc( - BITS_TO_LONGS(region_bits) * - sizeof(chunk->alloc_map[0]), 0); - chunk->bound_map = memblock_virt_alloc( - BITS_TO_LONGS(region_bits + 1) * - sizeof(chunk->bound_map[0]), 0); + chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) * + sizeof(chunk->alloc_map[0]), 0); + chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) * + sizeof(chunk->bound_map[0]), 0); + chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) * + sizeof(chunk->md_blocks[0]), 0); + pcpu_init_md_blocks(chunk); /* manage populated page bitmap */ chunk->immutable = true; @@ -651,6 +840,8 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, bitmap_set(chunk->alloc_map, 0, offset_bits); set_bit(0, chunk->bound_map); set_bit(offset_bits, chunk->bound_map); + + pcpu_block_update_hint_alloc(chunk, 0, offset_bits); } if (chunk->end_offset) { @@ -662,9 +853,10 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE, chunk->bound_map); set_bit(region_bits, chunk->bound_map); - } - pcpu_chunk_refresh_hint(chunk); + pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk) + - offset_bits, offset_bits); + } return chunk; } @@ -692,12 +884,21 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) if (!chunk->bound_map) goto bound_map_fail; + chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) * + sizeof(chunk->md_blocks[0])); + if (!chunk->md_blocks) + goto md_blocks_fail; + + pcpu_init_md_blocks(chunk); + /* init metadata */ chunk->contig_bits = region_bits; chunk->free_bytes = chunk->nr_pages * PAGE_SIZE; return chunk; +md_blocks_fail: + pcpu_mem_free(chunk->bound_map); bound_map_fail: pcpu_mem_free(chunk->alloc_map); alloc_map_fail: @@ -1535,9 +1736,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE)); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); PCPU_SETUP_BUG_ON(!ai->dyn_size); PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE)); + PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) || + IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE))); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ From 86b442fbce74d6cd0805410ef228776cbd0338d7 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:13 -0400 Subject: [PATCH 20/27] percpu: add first_bit to keep track of the first free in the bitmap This patch adds 
first_bit to keep track of the first free bit in the bitmap. This hint helps prevent scanning of fully allocated blocks. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu-internal.h | 2 +- mm/percpu-stats.c | 1 + mm/percpu.c | 17 +++++++++++++++-- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 252ae9e960e0..e60e04966d96 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -36,7 +36,7 @@ struct pcpu_chunk { struct pcpu_block_md *md_blocks; /* metadata blocks */ void *data; /* chunk data */ - int first_free; /* no free below this */ + int first_bit; /* no free below this */ bool immutable; /* no [de]population allowed */ int start_offset; /* the overlap with the previous region to have a page aligned diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index ad03d73aa5fe..6142484e88f7 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -121,6 +121,7 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, P("nr_alloc", chunk->nr_alloc); P("max_alloc_size", chunk->max_alloc_size); P("empty_pop_pages", chunk->nr_empty_pop_pages); + P("first_bit", chunk->first_bit); P("free_bytes", chunk->free_bytes); P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE); P("sum_frag", sum_frag); diff --git a/mm/percpu.c b/mm/percpu.c index 708c6de237b9..83abb190ca5a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -427,7 +427,7 @@ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) chunk->contig_bits = 0; bits = nr_empty_pop_pages = 0; - pcpu_for_each_unpop_region(chunk->alloc_map, rs, re, 0, + pcpu_for_each_unpop_region(chunk->alloc_map, rs, re, chunk->first_bit, pcpu_chunk_map_bits(chunk)) { bits = re - rs; @@ -646,7 +646,8 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, int bit_off, bits; int re; /* region end */ - pcpu_for_each_unpop_region(chunk->alloc_map, bit_off, re, 0, + pcpu_for_each_unpop_region(chunk->alloc_map, bit_off, re, + chunk->first_bit, pcpu_chunk_map_bits(chunk)) { bits = re - bit_off; @@ -715,6 +716,13 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE; + /* update first free bit */ + if (bit_off == chunk->first_bit) + chunk->first_bit = find_next_zero_bit( + chunk->alloc_map, + pcpu_chunk_map_bits(chunk), + bit_off + alloc_bits); + pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits); pcpu_chunk_relocate(chunk, oslot); @@ -750,6 +758,9 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off) /* update metadata */ chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE; + /* update first free bit */ + chunk->first_bit = min(chunk->first_bit, bit_off); + pcpu_block_update_hint_free(chunk, bit_off, bits); pcpu_chunk_relocate(chunk, oslot); @@ -841,6 +852,8 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, set_bit(0, chunk->bound_map); set_bit(offset_bits, chunk->bound_map); + chunk->first_bit = offset_bits; + pcpu_block_update_hint_alloc(chunk, 0, offset_bits); } From 13f966373f9296c0da2fb2764654cce520b3a6b4 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:14 -0400 Subject: [PATCH 21/27] percpu: skip chunks if the alloc does not fit in the contig hint This patch adds chunk->contig_bits_start to keep track of the contig hint's offset and the check to skip the chunk if it does not fit. 
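A rough sketch of that skip test in plain C may help (the struct, function, and numbers below are made up for illustration and are not part of the patch): round the hint's starting offset up to the requested alignment, and give up on the chunk if what remains of the hint cannot hold the request.

#include <stdio.h>

/* illustrative stand-in for the chunk-level hint; not the kernel's type */
struct hint {
	int contig_bits;	/* size of the largest known free area, in bits */
	int contig_bits_start;	/* offset where that area starts */
};

/* 1 if the chunk is worth scanning for an alloc_bits request at this align */
static int may_fit(const struct hint *h, int alloc_bits, int align)
{
	/* bits lost to rounding the hint's start up to the alignment */
	int skew = ((h->contig_bits_start + align - 1) / align) * align -
		   h->contig_bits_start;

	return skew + alloc_bits <= h->contig_bits;
}

int main(void)
{
	struct hint h = { .contig_bits = 16, .contig_bits_start = 6 };

	printf("%d\n", may_fit(&h, 16, 1));	/* 1: fits exactly */
	printf("%d\n", may_fit(&h, 16, 8));	/* 0: alignment wastes 2 bits */
	return 0;
}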
If the chunk's contig hint starting offset cannot satisfy an allocation, the allocator assumes there is enough memory pressure in this chunk to either use a different chunk or create a new one. This accepts a less tight packing for a smoother latency curve. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu-internal.h | 2 ++ mm/percpu.c | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index e60e04966d96..7065faf74b46 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -29,6 +29,8 @@ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ int free_bytes; /* free bytes in the chunk */ int contig_bits; /* max contiguous size hint */ + int contig_bits_start; /* contig_bits starting + offset */ void *base_addr; /* base address of this chunk */ unsigned long *alloc_map; /* allocation map */ diff --git a/mm/percpu.c b/mm/percpu.c index 83abb190ca5a..734745a0c9b6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -400,12 +400,14 @@ static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off, * @bit_off: chunk offset * @bits: size of free area * - * This updates the chunk's contig hint given a free area. + * This updates the chunk's contig hint and starting offset given a free area. */ static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits) { - if (bits > chunk->contig_bits) + if (bits > chunk->contig_bits) { + chunk->contig_bits_start = bit_off; chunk->contig_bits = bits; + } } /** @@ -416,6 +418,7 @@ static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits) * * Updates: * chunk->contig_bits + * chunk->contig_bits_start * nr_empty_pop_pages */ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) @@ -646,6 +649,17 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, int bit_off, bits; int re; /* region end */ + /* + * Check to see if the allocation can fit in the chunk's contig hint. + * This is an optimization to prevent scanning by assuming if it + * cannot fit in the global hint, there is memory pressure and creating + * a new chunk would happen soon. + */ + bit_off = ALIGN(chunk->contig_bits_start, align) - + chunk->contig_bits_start; + if (bit_off + alloc_bits > chunk->contig_bits) + return -1; + pcpu_for_each_unpop_region(chunk->alloc_map, bit_off, re, chunk->first_bit, pcpu_chunk_map_bits(chunk)) { From 268625a6f9df6a7c9b0ae7707a8a1cd5a9993bd2 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:15 -0400 Subject: [PATCH 22/27] percpu: keep track of the best offset for contig hints This patch makes the contig hint starting offset optimization from the previous patch as honest as it can be. For both chunk and block starting offsets, make sure it keeps the starting offset with the best alignment. The block skip optimization is added in a later patch when the pcpu_find_block_fit iterator is swapped in. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/mm/percpu.c b/mm/percpu.c index 734745a0c9b6..d0d3fa872a8c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -401,12 +401,18 @@ static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off, * @bits: size of free area * * This updates the chunk's contig hint and starting offset given a free area. 
+ * Choose the best starting offset if the contig hint is equal. */ static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits) { if (bits > chunk->contig_bits) { chunk->contig_bits_start = bit_off; chunk->contig_bits = bits; + } else if (bits == chunk->contig_bits && chunk->contig_bits_start && + (!bit_off || + __ffs(bit_off) > __ffs(chunk->contig_bits_start))) { + /* use the start with the best alignment */ + chunk->contig_bits_start = bit_off; } } @@ -461,7 +467,8 @@ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) * @end: end offset in block * * Updates a block given a known free area. The region [start, end) is - * expected to be the entirety of the free area within a block. + * expected to be the entirety of the free area within a block. Chooses + * the best starting offset if the contig hints are equal. */ static void pcpu_block_update(struct pcpu_block_md *block, int start, int end) { @@ -477,6 +484,10 @@ static void pcpu_block_update(struct pcpu_block_md *block, int start, int end) if (contig > block->contig_hint) { block->contig_hint_start = start; block->contig_hint = contig; + } else if (block->contig_hint_start && contig == block->contig_hint && + (!start || __ffs(start) > __ffs(block->contig_hint_start))) { + /* use the start with the best alignment */ + block->contig_hint_start = start; } } From fc3043345a648a49978c6fb0bf8c188b7cfe0ab3 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:16 -0400 Subject: [PATCH 23/27] percpu: update alloc path to only scan if contig hints are broken Metadata is kept per block to keep track of where the contig hints are. Scanning can be avoided when the contig hints are not broken. In that case, left and right contigs have to be managed manually. This patch changes the allocation path hint updating to only scan when contig hints are broken. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index d0d3fa872a8c..f38f47a65642 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -521,6 +521,10 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of request + * + * Updates metadata for the allocation path. The metadata only has to be + * refreshed by a full scan iff the chunk's contig hint is broken. Block level + * scans are required if the block's contig hint is broken. */ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, int bits) @@ -545,14 +549,56 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, /* * Update s_block. + * block->first_free must be updated if the allocation takes its place. + * If the allocation breaks the contig_hint, a scan is required to + * restore this hint. 
*/ - pcpu_block_refresh_hint(chunk, s_index); + if (s_off == s_block->first_free) + s_block->first_free = find_next_zero_bit( + pcpu_index_alloc_map(chunk, s_index), + PCPU_BITMAP_BLOCK_BITS, + s_off + bits); + + if (s_off >= s_block->contig_hint_start && + s_off < s_block->contig_hint_start + s_block->contig_hint) { + /* block contig hint is broken - scan to fix it */ + pcpu_block_refresh_hint(chunk, s_index); + } else { + /* update left and right contig manually */ + s_block->left_free = min(s_block->left_free, s_off); + if (s_index == e_index) + s_block->right_free = min_t(int, s_block->right_free, + PCPU_BITMAP_BLOCK_BITS - e_off); + else + s_block->right_free = 0; + } /* * Update e_block. */ if (s_index != e_index) { - pcpu_block_refresh_hint(chunk, e_index); + /* + * When the allocation is across blocks, the end is along + * the left part of the e_block. + */ + e_block->first_free = find_next_zero_bit( + pcpu_index_alloc_map(chunk, e_index), + PCPU_BITMAP_BLOCK_BITS, e_off); + + if (e_off == PCPU_BITMAP_BLOCK_BITS) { + /* reset the block */ + e_block++; + } else { + if (e_off > e_block->contig_hint_start) { + /* contig hint is broken - scan to fix it */ + pcpu_block_refresh_hint(chunk, e_index); + } else { + e_block->left_free = 0; + e_block->right_free = + min_t(int, e_block->right_free, + PCPU_BITMAP_BLOCK_BITS - e_off); + } + } /* update in-between md_blocks */ for (block = s_block + 1; block < e_block; block++) { @@ -562,7 +608,14 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, } } - pcpu_chunk_refresh_hint(chunk); + /* + * The only time a full chunk scan is required is if the chunk + * contig hint is broken. Otherwise, it means a smaller space + * was used and therefore the chunk contig hint is still correct. + */ + if (bit_off >= chunk->contig_bits_start && + bit_off < chunk->contig_bits_start + chunk->contig_bits) + pcpu_chunk_refresh_hint(chunk); } /** From b185cd0dc61c14875155e7bcc3f2c139b6feefd2 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:17 -0400 Subject: [PATCH 24/27] percpu: update free path to take advantage of contig hints The bitmap allocator must keep metadata consistent. The easiest way is to scan after every allocation for each affected block and the entire chunk. This is rather expensive. The free path can take advantage of current contig hints to prevent scanning within the start and end block. If a scan is needed, it can be done by scanning backwards from the start and forwards from the end to identify the entire free area this can be combined with. The blocks can then be updated by some basic checks rather than complete block scans. A chunk scan happens when the freed area makes a page free, a block free, or spans across blocks. This is necessary as the contig hint at this point could span across blocks. The check uses the minimum of page size and the block size to allow for variable sized blocks. There is a tradeoff here with not updating after every free. It is possible a contig hint in one block can be merged with the contig hint in the next block. This means the contig hint can be off by up to a page. However, if the chunk's contig hint is contained in one block, the contig hint will be accurate. 
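As a rough model of that rule (the constants and the function below are illustrative stand-ins, not the kernel's code), the decision of when the free path pays for a full chunk rescan can be sketched in C:

#include <stdio.h>

#define MODEL_BLOCK_BITS 256	/* assumed bits per metadata block */
#define MODEL_PAGE_BITS	1024	/* assumed bits per page */

/* 1 if the merged free area [start, end) forces a full chunk rescan */
static int needs_chunk_rescan(int start, int end, int s_index, int e_index)
{
	int step = MODEL_PAGE_BITS < MODEL_BLOCK_BITS ?
		   MODEL_PAGE_BITS : MODEL_BLOCK_BITS;

	/* the free spans blocks */
	if (s_index != e_index)
		return 1;
	/* or it now covers at least one fully aligned step-sized region */
	return (end / step) * step > ((start + step - 1) / step) * step;
}

int main(void)
{
	/* small free inside one block: only the cheap hint update is needed */
	printf("%d\n", needs_chunk_rescan(10, 40, 0, 0));
	/* free that spans two blocks: the chunk hint must be rebuilt */
	printf("%d\n", needs_chunk_rescan(200, 300, 0, 1));
	return 0;
}

When the rescan is not needed, the free path simply offers the start block's contig hint to pcpu_chunk_update(), which is a plain comparison rather than a scan.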
Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- include/linux/percpu.h | 3 ++ mm/percpu.c | 68 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 31795e619273..6a5fb939d3e5 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -25,6 +25,9 @@ #define PCPU_MIN_ALLOC_SHIFT 2 #define PCPU_MIN_ALLOC_SIZE (1 << PCPU_MIN_ALLOC_SHIFT) +/* number of bits per page, used to trigger a scan if blocks are > PAGE_SIZE */ +#define PCPU_BITS_PER_PAGE (PAGE_SIZE >> PCPU_MIN_ALLOC_SHIFT) + /* * This determines the size of each metadata block. There are several subtle * constraints around this constant. The reserved region must be a multiple of diff --git a/mm/percpu.c b/mm/percpu.c index f38f47a65642..57b3168eae08 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -300,6 +300,11 @@ static unsigned long pcpu_off_to_block_off(int off) return off & (PCPU_BITMAP_BLOCK_BITS - 1); } +static unsigned long pcpu_block_off_to_off(int index, int off) +{ + return index * PCPU_BITMAP_BLOCK_BITS + off; +} + /** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate @@ -623,6 +628,17 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of request + * + * Updates metadata for the allocation path. This avoids a blind block + * refresh by making use of the block contig hints. If this fails, it scans + * forward and backward to determine the extent of the free area. This is + * capped at the boundary of blocks. + * + * A chunk update is triggered if a page becomes free, a block becomes free, + * or the free spans across blocks. This tradeoff is to minimize iterating + * over the block metadata to update chunk->contig_bits. chunk->contig_bits + * may be off by up to a page, but it will never be more than the available + * space. If the contig hint is contained in one block, it will be accurate. */ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, int bits) @@ -630,6 +646,7 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ + int start, end; /* start and end of the whole free area */ /* * Calculate per block offsets. @@ -645,13 +662,46 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, s_block = chunk->md_blocks + s_index; e_block = chunk->md_blocks + e_index; + /* + * Check if the freed area aligns with the block->contig_hint. + * If it does, then the scan to find the beginning/end of the + * larger free area can be avoided. + * + * start and end refer to beginning and end of the free area + * within each their respective blocks. This is not necessarily + * the entire free area as it may span blocks past the beginning + * or end of the block. + */ + start = s_off; + if (s_off == s_block->contig_hint + s_block->contig_hint_start) { + start = s_block->contig_hint_start; + } else { + /* + * Scan backwards to find the extent of the free area. + * find_last_bit returns the starting bit, so if the start bit + * is returned, that means there was no last bit and the + * remainder of the chunk is free. + */ + int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), + start); + start = (start == l_bit) ? 
0 : l_bit + 1; + } + + end = e_off; + if (e_off == e_block->contig_hint_start) + end = e_block->contig_hint_start + e_block->contig_hint; + else + end = find_next_bit(pcpu_index_alloc_map(chunk, e_index), + PCPU_BITMAP_BLOCK_BITS, end); + /* update s_block */ - pcpu_block_refresh_hint(chunk, s_index); + e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS; + pcpu_block_update(s_block, start, e_off); /* freeing in the same block */ if (s_index != e_index) { /* update e_block */ - pcpu_block_refresh_hint(chunk, e_index); + pcpu_block_update(e_block, 0, end); /* reset md_blocks in the middle */ for (block = s_block + 1; block < e_block; block++) { @@ -663,7 +713,19 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, } } - pcpu_chunk_refresh_hint(chunk); + /* + * Refresh chunk metadata when the free makes a page free, a block + * free, or spans across blocks. The contig hint may be off by up to + * a page, but if the hint is contained in a block, it will be accurate + * with the else condition below. + */ + if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) > + ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) || + s_index != e_index) + pcpu_chunk_refresh_hint(chunk); + else + pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start), + s_block->contig_hint); } /** From 525ca84daec01825b0d037f5fcf60adb7f510118 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:18 -0400 Subject: [PATCH 25/27] percpu: use metadata blocks to update the chunk contig hint The largest free region will either be a block level contig hint or an aggregate over the left_free and right_free areas of blocks. This is a much smaller set of free areas that need to be checked than a full traverse. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 70 insertions(+), 10 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 57b3168eae08..0f05647d9547 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -305,6 +305,67 @@ static unsigned long pcpu_block_off_to_off(int index, int off) return index * PCPU_BITMAP_BLOCK_BITS + off; } +/** + * pcpu_next_md_free_region - finds the next hint free area + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of free area + * + * Helper function for pcpu_for_each_md_free_region. It checks + * block->contig_hint and performs aggregation across blocks to find the + * next hint. It modifies bit_off and bits in-place to be consumed in the + * loop. + */ +static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off, + int *bits) +{ + int i = pcpu_off_to_block_index(*bit_off); + int block_off = pcpu_off_to_block_off(*bit_off); + struct pcpu_block_md *block; + + *bits = 0; + for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); + block++, i++) { + /* handles contig area across blocks */ + if (*bits) { + *bits += block->left_free; + if (block->left_free == PCPU_BITMAP_BLOCK_BITS) + continue; + return; + } + + /* + * This checks three things. First is there a contig_hint to + * check. Second, have we checked this hint before by + * comparing the block_off. Third, is this the same as the + * right contig hint. In the last case, it spills over into + * the next block and should be handled by the contig area + * across blocks code. 
+ */ + *bits = block->contig_hint; + if (*bits && block->contig_hint_start >= block_off && + *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) { + *bit_off = pcpu_block_off_to_off(i, + block->contig_hint_start); + return; + } + + *bits = block->right_free; + *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free; + } +} + +/* + * Metadata free area iterators. These perform aggregation of free areas + * based on the metadata blocks and return the offset @bit_off and size in + * bits of the free area @bits. + */ +#define pcpu_for_each_md_free_region(chunk, bit_off, bits) \ + for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \ + (bit_off) < pcpu_chunk_map_bits((chunk)); \ + (bit_off) += (bits) + 1, \ + pcpu_next_md_free_region((chunk), &(bit_off), &(bits))) + /** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate @@ -425,29 +486,28 @@ static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits) * pcpu_chunk_refresh_hint - updates metadata about a chunk * @chunk: chunk of interest * - * Iterates over the chunk to find the largest free area. + * Iterates over the metadata blocks to find the largest contig area. + * It also counts the populated pages and uses the delta to update the + * global count. * * Updates: * chunk->contig_bits * chunk->contig_bits_start - * nr_empty_pop_pages + * nr_empty_pop_pages (chunk and global) */ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) { - int bits, nr_empty_pop_pages; - int rs, re; /* region start, region end */ + int bit_off, bits, nr_empty_pop_pages; /* clear metadata */ chunk->contig_bits = 0; + bit_off = chunk->first_bit; bits = nr_empty_pop_pages = 0; - pcpu_for_each_unpop_region(chunk->alloc_map, rs, re, chunk->first_bit, - pcpu_chunk_map_bits(chunk)) { - bits = re - rs; + pcpu_for_each_md_free_region(chunk, bit_off, bits) { + pcpu_chunk_update(chunk, bit_off, bits); - pcpu_chunk_update(chunk, rs, bits); - - nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, rs, bits); + nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits); } /* From b4c2116cfae65b09761b7ba34453733e745a6f77 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:19 -0400 Subject: [PATCH 26/27] percpu: update pcpu_find_block_fit to use an iterator The simple, and expensive, way to find a free area is to iterate over the entire bitmap until an area is found that fits the allocation size and alignment. This patch makes use of an iterate that find an area to check by using the block level contig hints. It will only return an area that can fit the size and alignment request. If the request can fit inside a block, it returns the first_free bit to start checking from to see if it can be fulfilled prior to the contig hint. The pcpu_alloc_area check has a bound of a block size added in case it is wrong. 
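To make the walk concrete, here is a heavily simplified sketch in C of the aggregation the iterator performs (all names and constants below are invented for illustration; it ignores alignment and the first_free refinement described above): a candidate offset is either a block's own contig hint, or a run built from one block's right_free joined with the next block's left_free.

#include <stdio.h>

#define SKETCH_BLOCK_BITS 64	/* assumed bits per metadata block */

/* illustrative stand-in for the per-block metadata; not the kernel's type */
struct sketch_block {
	int contig_hint;	/* largest free run wholly inside the block */
	int contig_hint_start;	/* offset of that run within the block */
	int left_free;		/* free bits touching the block's left edge */
	int right_free;		/* free bits touching the block's right edge */
};

/* returns a chunk offset that can hold alloc_bits, or -1 (alignment ignored) */
static int find_fit(const struct sketch_block *blk, int nr, int alloc_bits)
{
	int run = 0;		/* free run carried over a block boundary */
	int run_start = 0;	/* chunk offset where that run begins */

	for (int i = 0; i < nr; i++) {
		/* first try to extend a run spilling in from the left */
		if (run) {
			run += blk[i].left_free;
			if (run >= alloc_bits)
				return run_start;
			if (blk[i].left_free == SKETCH_BLOCK_BITS)
				continue;	/* block is fully free */
		}

		/* then a fit wholly inside this block, per its contig hint */
		if (blk[i].contig_hint >= alloc_bits)
			return i * SKETCH_BLOCK_BITS + blk[i].contig_hint_start;

		/* otherwise start a new run from this block's right edge */
		run = blk[i].right_free;
		run_start = (i + 1) * SKETCH_BLOCK_BITS - blk[i].right_free;
	}
	return -1;
}

int main(void)
{
	/* block 0 ends with 10 free bits, block 1 starts with 20 free bits */
	struct sketch_block blk[2] = {
		{ .contig_hint = 10, .contig_hint_start = 54,
		  .left_free = 0, .right_free = 10 },
		{ .contig_hint = 20, .contig_hint_start = 0,
		  .left_free = 20, .right_free = 5 },
	};

	printf("%d\n", find_fit(blk, 2, 24));	/* 54: spans the boundary */
	printf("%d\n", find_fit(blk, 2, 40));	/* -1: nothing big enough */
	return 0;
}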
Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu.c | 112 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 92 insertions(+), 20 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 0f05647d9547..b4754f3bf38f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -355,10 +355,72 @@ static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off, } } +/** + * pcpu_next_fit_region - finds fit areas for a given allocation request + * @chunk: chunk of interest + * @alloc_bits: size of allocation + * @align: alignment of area (max PAGE_SIZE) + * @bit_off: chunk offset + * @bits: size of free area + * + * Finds the next free region that is viable for use with a given size and + * alignment. This only returns if there is a valid area to be used for this + * allocation. block->first_free is returned if the allocation request fits + * within the block to see if the request can be fulfilled prior to the contig + * hint. + */ +static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, + int align, int *bit_off, int *bits) +{ + int i = pcpu_off_to_block_index(*bit_off); + int block_off = pcpu_off_to_block_off(*bit_off); + struct pcpu_block_md *block; + + *bits = 0; + for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); + block++, i++) { + /* handles contig area across blocks */ + if (*bits) { + *bits += block->left_free; + if (*bits >= alloc_bits) + return; + if (block->left_free == PCPU_BITMAP_BLOCK_BITS) + continue; + } + + /* check block->contig_hint */ + *bits = ALIGN(block->contig_hint_start, align) - + block->contig_hint_start; + /* + * This uses the block offset to determine if this has been + * checked in the prior iteration. + */ + if (block->contig_hint && + block->contig_hint_start >= block_off && + block->contig_hint >= *bits + alloc_bits) { + *bits += alloc_bits + block->contig_hint_start - + block->first_free; + *bit_off = pcpu_block_off_to_off(i, block->first_free); + return; + } + + *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free, + align); + *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off; + *bit_off = pcpu_block_off_to_off(i, *bit_off); + if (*bits >= alloc_bits) + return; + } + + /* no valid offsets were found - fail condition */ + *bit_off = pcpu_chunk_map_bits(chunk); +} + /* * Metadata free area iterators. These perform aggregation of free areas * based on the metadata blocks and return the offset @bit_off and size in - * bits of the free area @bits. + * bits of the free area @bits. pcpu_for_each_fit_region only returns when + * a fit is found for the allocation request. 
*/ #define pcpu_for_each_md_free_region(chunk, bit_off, bits) \ for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \ @@ -366,6 +428,14 @@ static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off, (bit_off) += (bits) + 1, \ pcpu_next_md_free_region((chunk), &(bit_off), &(bits))) +#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \ + for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ + &(bits)); \ + (bit_off) < pcpu_chunk_map_bits((chunk)); \ + (bit_off) += (bits), \ + pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ + &(bits))) + /** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate @@ -825,6 +895,14 @@ static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, * @align: alignment of area (max PAGE_SIZE bytes) * @pop_only: use populated regions only * + * Given a chunk and an allocation spec, find the offset to begin searching + * for a free region. This iterates over the bitmap metadata blocks to + * find an offset that will be guaranteed to fit the requirements. It is + * not quite first fit as if the allocation does not fit in the contig hint + * of a block or chunk, it is skipped. This errs on the side of caution + * to prevent excess iteration. Poor alignment can cause the allocator to + * skip over blocks and chunks that have valid free areas. + * * RETURNS: * The offset in the bitmap to begin searching. * -1 if no offset is found. @@ -832,8 +910,7 @@ static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, size_t align, bool pop_only) { - int bit_off, bits; - int re; /* region end */ + int bit_off, bits, next_off; /* * Check to see if the allocation can fit in the chunk's contig hint. @@ -846,22 +923,14 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, if (bit_off + alloc_bits > chunk->contig_bits) return -1; - pcpu_for_each_unpop_region(chunk->alloc_map, bit_off, re, - chunk->first_bit, - pcpu_chunk_map_bits(chunk)) { - bits = re - bit_off; - - /* check alignment */ - bits -= ALIGN(bit_off, align) - bit_off; - bit_off = ALIGN(bit_off, align); - if (bits < alloc_bits) - continue; - - bits = alloc_bits; + bit_off = chunk->first_bit; + bits = 0; + pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) { if (!pop_only || pcpu_is_populated(chunk, bit_off, bits, - &bit_off)) + &next_off)) break; + bit_off = next_off; bits = 0; } @@ -879,9 +948,12 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, * @start: bit_off to start searching * * This function takes in a @start offset to begin searching to fit an - * allocation of @alloc_bits with alignment @align. If it confirms a - * valid free area, it then updates the allocation and boundary maps - * accordingly. + * allocation of @alloc_bits with alignment @align. It needs to scan + * the allocation map because if it fits within the block's contig hint, + * @start will be block->first_free. This is an attempt to fill the + * allocation prior to breaking the contig hint. The allocation and + * boundary maps are updated accordingly if it confirms a valid + * free area. * * RETURNS: * Allocated addr offset in @chunk on success. @@ -900,7 +972,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, /* * Search to find a fit. 
*/ - end = start + alloc_bits; + end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS; bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start, alloc_bits, align_mask); if (bit_off >= end) From 5e81ee3e6a79cc9fa85af5c3db0f1f269709bbf1 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:20 -0400 Subject: [PATCH 27/27] percpu: update header to contain bitmap allocator explanation. The other patches contain a lot of information, so adding this information in a separate patch. It adds my copyright and a brief explanation of how the bitmap allocator works. There is a minor typo as well in the prior explanation so that is fixed. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- mm/percpu.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index b4754f3bf38f..59d44d61f5f1 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -4,6 +4,9 @@ * Copyright (C) 2009 SUSE Linux Products GmbH * Copyright (C) 2009 Tejun Heo * + * Copyright (C) 2017 Facebook Inc. + * Copyright (C) 2017 Dennis Zhou + * * This file is released under the GPLv2 license. * * The percpu allocator handles both static and dynamic areas. Percpu @@ -25,7 +28,7 @@ * * There is special consideration for the first chunk which must handle * the static percpu variables in the kernel image as allocation services - * are not online yet. In short, the first chunk is structure like so: + * are not online yet. In short, the first chunk is structured like so: * * * @@ -34,19 +37,20 @@ * percpu variables from kernel modules. Finally, the dynamic section * takes care of normal allocations. * - * Allocation state in each chunk is kept using an array of integers - * on chunk->map. A positive value in the map represents a free - * region and negative allocated. Allocation inside a chunk is done - * by scanning this map sequentially and serving the first matching - * entry. This is mostly copied from the percpu_modalloc() allocator. - * Chunks can be determined from the address using the index field - * in the page struct. The index field contains a pointer to the chunk. + * The allocator organizes chunks into lists according to free size and + * tries to allocate from the fullest chunk first. Each chunk is managed + * by a bitmap with metadata blocks. The allocation map is updated on + * every allocation and free to reflect the current state while the boundary + * map is only updated on allocation. Each metadata block contains + * information to help mitigate the need to iterate over large portions + * of the bitmap. The reverse mapping from page to chunk is stored in + * the page's index. Lastly, units are lazily backed and grow in unison. * - * These chunks are organized into lists according to free_size and - * tries to allocate from the fullest chunk first. Each chunk maintains - * a maximum contiguous area size hint which is guaranteed to be equal - * to or larger than the maximum contiguous area in the chunk. This - * helps prevent the allocator from iterating over chunks unnecessarily. + * There is a unique conversion that goes on here between bytes and bits. + * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk + * tracks the number of pages it is responsible for in nr_pages. Helper + * functions are used to convert from between the bytes, bits, and blocks. + * All hints are managed in bits unless explicitly stated. * * To use this allocator, arch code should do the following: *