2009-02-20 08:29:08 +01:00
|
|
|
/*
|
2010-04-09 11:57:01 +02:00
|
|
|
* mm/percpu.c - percpu memory allocator
|
2009-02-20 08:29:08 +01:00
|
|
|
*
|
|
|
|
* Copyright (C) 2009 SUSE Linux Products GmbH
|
|
|
|
* Copyright (C) 2009 Tejun Heo <tj@kernel.org>
|
|
|
|
*
|
|
|
|
* This file is released under the GPLv2.
|
|
|
|
*
|
|
|
|
* This is percpu allocator which can handle both static and dynamic
|
2010-04-09 11:57:01 +02:00
|
|
|
* areas. Percpu areas are allocated in chunks. Each chunk
|
|
|
|
* consists of a boot-time determined number of units and the first
|
|
|
|
* chunk is used for static percpu variables in the kernel image
|
2009-07-04 01:11:00 +02:00
|
|
|
* (special boot time alloc/init handling necessary as these areas
|
|
|
|
* need to be brought up before allocation services are running).
|
|
|
|
* Unit grows as necessary and all units grow or shrink in unison.
|
2010-04-09 11:57:01 +02:00
|
|
|
* When a chunk is filled up, another chunk is allocated.
|
2009-02-20 08:29:08 +01:00
|
|
|
*
|
|
|
|
* c0 c1 c2
|
|
|
|
* ------------------- ------------------- ------------
|
|
|
|
* | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u
|
|
|
|
* ------------------- ...... ------------------- .... ------------
|
|
|
|
*
|
|
|
|
* Allocation is done in offset-size areas of single unit space. Ie,
|
|
|
|
* an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
|
2009-07-04 01:11:00 +02:00
|
|
|
* c1:u1, c1:u2 and c1:u3. On UMA, units correspond directly to
|
|
|
|
* cpus. On NUMA, the mapping can be non-linear and even sparse.
|
|
|
|
* Percpu access can be done by configuring percpu base registers
|
|
|
|
* according to cpu to unit mapping and pcpu_unit_size.
|
2009-02-20 08:29:08 +01:00
|
|
|
*
|
2009-07-04 01:11:00 +02:00
|
|
|
* There are usually many small percpu allocations many of them being
|
|
|
|
* as small as 4 bytes. The allocator organizes chunks into lists
|
2009-02-20 08:29:08 +01:00
|
|
|
* according to free size and tries to allocate from the fullest one.
|
|
|
|
* Each chunk keeps the maximum contiguous area size hint which is
|
2010-08-11 04:24:10 +02:00
|
|
|
* guaranteed to be equal to or larger than the maximum contiguous
|
2009-02-20 08:29:08 +01:00
|
|
|
* area in the chunk. This helps the allocator not to iterate the
|
|
|
|
* chunk maps unnecessarily.
|
|
|
|
*
|
|
|
|
* Allocation state in each chunk is kept using an array of integers
|
|
|
|
* on chunk->map. A positive value in the map represents a free
|
|
|
|
* region and negative allocated. Allocation inside a chunk is done
|
|
|
|
* by scanning this map sequentially and serving the first matching
|
|
|
|
* entry. This is mostly copied from the percpu_modalloc() allocator.
|
2009-04-02 06:21:44 +02:00
|
|
|
* Chunks can be determined from the address using the index field
|
|
|
|
* in the page struct. The index field contains a pointer to the chunk.
|
2009-02-20 08:29:08 +01:00
|
|
|
*
|
|
|
|
* To use this allocator, arch code should do the following.
|
|
|
|
*
|
|
|
|
* - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
|
2009-03-10 08:27:48 +01:00
|
|
|
* regular address to percpu pointer and back if they need to be
|
|
|
|
* different from the default
|
2009-02-20 08:29:08 +01:00
|
|
|
*
|
2009-02-24 03:57:21 +01:00
|
|
|
* - use pcpu_setup_first_chunk() during percpu area initialization to
|
|
|
|
* setup the first chunk containing the kernel static percpu area
|
2009-02-20 08:29:08 +01:00
|
|
|
*/
|
|
|
|
|
2016-03-17 22:19:53 +01:00
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
|
2009-02-20 08:29:08 +01:00
|
|
|
#include <linux/bitmap.h>
|
|
|
|
#include <linux/bootmem.h>
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguos area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
#include <linux/err.h>
|
2009-02-20 08:29:08 +01:00
|
|
|
#include <linux/list.h>
|
2009-07-04 01:11:00 +02:00
|
|
|
#include <linux/log2.h>
|
2009-02-20 08:29:08 +01:00
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/mutex.h>
|
|
|
|
#include <linux/percpu.h>
|
|
|
|
#include <linux/pfn.h>
|
|
|
|
#include <linux/slab.h>
|
2009-03-06 16:44:13 +01:00
|
|
|
#include <linux/spinlock.h>
|
2009-02-20 08:29:08 +01:00
|
|
|
#include <linux/vmalloc.h>
|
2009-03-06 16:44:11 +01:00
|
|
|
#include <linux/workqueue.h>
|
2011-09-26 18:12:53 +02:00
|
|
|
#include <linux/kmemleak.h>
|
2009-02-20 08:29:08 +01:00
|
|
|
|
|
|
|
#include <asm/cacheflush.h>
|
2009-03-10 08:27:48 +01:00
|
|
|
#include <asm/sections.h>
|
2009-02-20 08:29:08 +01:00
|
|
|
#include <asm/tlbflush.h>
|
2009-11-24 07:50:03 +01:00
|
|
|
#include <asm/io.h>
|
2009-02-20 08:29:08 +01:00
|
|
|
|
|
|
|
#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
|
|
|
|
#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
|
2014-09-02 20:46:05 +02:00
|
|
|
#define PCPU_ATOMIC_MAP_MARGIN_LOW 32
|
|
|
|
#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64
|
2014-09-02 20:46:05 +02:00
|
|
|
#define PCPU_EMPTY_POP_PAGES_LOW 2
|
|
|
|
#define PCPU_EMPTY_POP_PAGES_HIGH 4
|
2009-02-20 08:29:08 +01:00
|
|
|
|
2010-09-03 18:22:48 +02:00
|
|
|
#ifdef CONFIG_SMP
|
2009-03-10 08:27:48 +01:00
|
|
|
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
|
|
|
|
#ifndef __addr_to_pcpu_ptr
|
|
|
|
#define __addr_to_pcpu_ptr(addr) \
|
2010-02-02 06:38:57 +01:00
|
|
|
(void __percpu *)((unsigned long)(addr) - \
|
|
|
|
(unsigned long)pcpu_base_addr + \
|
|
|
|
(unsigned long)__per_cpu_start)
|
2009-03-10 08:27:48 +01:00
|
|
|
#endif
|
|
|
|
#ifndef __pcpu_ptr_to_addr
|
|
|
|
#define __pcpu_ptr_to_addr(ptr) \
|
2010-02-02 06:38:57 +01:00
|
|
|
(void __force *)((unsigned long)(ptr) + \
|
|
|
|
(unsigned long)pcpu_base_addr - \
|
|
|
|
(unsigned long)__per_cpu_start)
|
2009-03-10 08:27:48 +01:00
|
|
|
#endif
|
2010-09-03 18:22:48 +02:00
|
|
|
#else /* CONFIG_SMP */
|
|
|
|
/* on UP, it's always identity mapped */
|
|
|
|
#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr)
|
|
|
|
#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
|
|
|
|
#endif /* CONFIG_SMP */
|
2009-03-10 08:27:48 +01:00
|
|
|
|
2009-02-20 08:29:08 +01:00
|
|
|
struct pcpu_chunk {
|
|
|
|
struct list_head list; /* linked to pcpu_slot lists */
|
|
|
|
int free_size; /* free bytes in the chunk */
|
|
|
|
int contig_hint; /* max contiguous size hint */
|
2009-08-14 08:00:51 +02:00
|
|
|
void *base_addr; /* base address of this chunk */
|
2014-09-02 20:46:05 +02:00
|
|
|
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
int map_used; /* # of map entries used before the sentry */
|
2009-02-20 08:29:08 +01:00
|
|
|
int map_alloc; /* # of map entries allocated */
|
|
|
|
int *map; /* allocation map */
|
2016-05-25 17:48:25 +02:00
|
|
|
struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
|
2014-09-02 20:46:05 +02:00
|
|
|
|
2010-04-09 11:57:01 +02:00
|
|
|
void *data; /* chunk data */
|
2014-03-07 02:52:32 +01:00
|
|
|
int first_free; /* no free below this */
|
2009-02-24 03:57:21 +01:00
|
|
|
bool immutable; /* no [de]population allowed */
|
2014-09-02 20:46:05 +02:00
|
|
|
int nr_populated; /* # of populated pages */
|
2009-07-04 01:11:00 +02:00
|
|
|
unsigned long populated[]; /* populated bitmap */
|
2009-02-20 08:29:08 +01:00
|
|
|
};
|
|
|
|
|
2009-02-24 04:32:28 +01:00
|
|
|
static int pcpu_unit_pages __read_mostly;
|
|
|
|
static int pcpu_unit_size __read_mostly;
|
2009-07-04 01:11:00 +02:00
|
|
|
static int pcpu_nr_units __read_mostly;
|
2009-08-14 08:00:52 +02:00
|
|
|
static int pcpu_atom_size __read_mostly;
|
2009-02-24 04:32:28 +01:00
|
|
|
static int pcpu_nr_slots __read_mostly;
|
|
|
|
static size_t pcpu_chunk_struct_size __read_mostly;
|
2009-02-20 08:29:08 +01:00
|
|
|
|
2011-11-18 19:55:35 +01:00
|
|
|
/* cpus with the lowest and highest unit addresses */
|
|
|
|
static unsigned int pcpu_low_unit_cpu __read_mostly;
|
|
|
|
static unsigned int pcpu_high_unit_cpu __read_mostly;
|
2009-07-04 01:11:00 +02:00
|
|
|
|
2009-02-20 08:29:08 +01:00
|
|
|
/* the address of the first chunk which starts with the kernel static area */
|
2009-02-24 04:32:28 +01:00
|
|
|
void *pcpu_base_addr __read_mostly;
|
2009-02-20 08:29:08 +01:00
|
|
|
EXPORT_SYMBOL_GPL(pcpu_base_addr);
|
|
|
|
|
2009-08-14 08:00:51 +02:00
|
|
|
static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */
|
|
|
|
const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */
|
2009-07-04 01:11:00 +02:00
|
|
|
|
2009-08-14 08:00:52 +02:00
|
|
|
/* group information, used for vm allocation */
|
|
|
|
static int pcpu_nr_groups __read_mostly;
|
|
|
|
static const unsigned long *pcpu_group_offsets __read_mostly;
|
|
|
|
static const size_t *pcpu_group_sizes __read_mostly;
|
|
|
|
|
2009-04-02 06:19:54 +02:00
|
|
|
/*
|
|
|
|
* The first chunk which always exists. Note that unlike other
|
|
|
|
* chunks, this one can be allocated and mapped in several different
|
|
|
|
* ways and thus often doesn't live in the vmalloc area.
|
|
|
|
*/
|
|
|
|
static struct pcpu_chunk *pcpu_first_chunk;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Optional reserved chunk. This chunk reserves part of the first
|
|
|
|
* chunk and serves it for reserved allocations. The amount of
|
|
|
|
* reserved offset is in pcpu_reserved_chunk_limit. When reserved
|
|
|
|
* area doesn't exist, the following variables contain NULL and 0
|
|
|
|
* respectively.
|
|
|
|
*/
|
2009-03-06 06:33:59 +01:00
|
|
|
static struct pcpu_chunk *pcpu_reserved_chunk;
|
|
|
|
static int pcpu_reserved_chunk_limit;
|
|
|
|
|
2014-09-02 20:46:02 +02:00
|
|
|
static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
|
2016-05-25 17:48:25 +02:00
|
|
|
static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
|
2009-02-20 08:29:08 +01:00
|
|
|
|
2009-02-24 04:32:28 +01:00
|
|
|
static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
|
2009-02-20 08:29:08 +01:00
|
|
|
|
2016-05-25 17:48:25 +02:00
|
|
|
/* chunks which need their map areas extended, protected by pcpu_lock */
|
|
|
|
static LIST_HEAD(pcpu_map_extend_chunks);
|
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
/*
|
|
|
|
* The number of empty populated pages, protected by pcpu_lock. The
|
|
|
|
* reserved chunk doesn't contribute to the count.
|
|
|
|
*/
|
|
|
|
static int pcpu_nr_empty_pop_pages;
|
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
/*
|
|
|
|
* Balance work is used to populate or destroy chunks asynchronously. We
|
|
|
|
* try to keep the number of populated free pages between
|
|
|
|
* PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
|
|
|
|
* empty chunk.
|
|
|
|
*/
|
2014-09-02 20:46:05 +02:00
|
|
|
static void pcpu_balance_workfn(struct work_struct *work);
|
|
|
|
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
|
2014-09-02 20:46:05 +02:00
|
|
|
static bool pcpu_async_enabled __read_mostly;
|
|
|
|
static bool pcpu_atomic_alloc_failed;
|
|
|
|
|
|
|
|
static void pcpu_schedule_balance_work(void)
|
|
|
|
{
|
|
|
|
if (pcpu_async_enabled)
|
|
|
|
schedule_work(&pcpu_balance_work);
|
|
|
|
}
|
2009-03-06 16:44:11 +01:00
|
|
|
|
2010-04-09 11:57:00 +02:00
|
|
|
static bool pcpu_addr_in_first_chunk(void *addr)
|
|
|
|
{
|
|
|
|
void *first_start = pcpu_first_chunk->base_addr;
|
|
|
|
|
|
|
|
return addr >= first_start && addr < first_start + pcpu_unit_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool pcpu_addr_in_reserved_chunk(void *addr)
|
|
|
|
{
|
|
|
|
void *first_start = pcpu_first_chunk->base_addr;
|
|
|
|
|
|
|
|
return addr >= first_start &&
|
|
|
|
addr < first_start + pcpu_reserved_chunk_limit;
|
|
|
|
}
|
|
|
|
|
2009-02-24 03:57:21 +01:00
|
|
|
static int __pcpu_size_to_slot(int size)
|
2009-02-20 08:29:08 +01:00
|
|
|
{
|
2009-02-21 08:56:23 +01:00
|
|
|
int highbit = fls(size); /* size is in bytes */
|
2009-02-20 08:29:08 +01:00
|
|
|
return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
|
|
|
|
}
|
|
|
|
|
2009-02-24 03:57:21 +01:00
|
|
|
static int pcpu_size_to_slot(int size)
|
|
|
|
{
|
|
|
|
if (size == pcpu_unit_size)
|
|
|
|
return pcpu_nr_slots - 1;
|
|
|
|
return __pcpu_size_to_slot(size);
|
|
|
|
}
|
|
|
|
|
2009-02-20 08:29:08 +01:00
|
|
|
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
|
|
|
|
{
|
|
|
|
if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return pcpu_size_to_slot(chunk->free_size);
|
|
|
|
}
|
|
|
|
|
2010-04-09 11:57:01 +02:00
|
|
|
/* set the pointer to a chunk in a page struct */
|
|
|
|
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
|
|
|
|
{
|
|
|
|
page->index = (unsigned long)pcpu;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* obtain pointer to a chunk from a page struct */
|
|
|
|
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
|
|
|
|
{
|
|
|
|
return (struct pcpu_chunk *)page->index;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Translate (@cpu, @page_idx) into an index counted across all units:
 * the cpu's unit number times pages-per-unit, plus the in-unit index.
 */
static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
	int unit = pcpu_unit_map[cpu];

	return unit * pcpu_unit_pages + page_idx;
}
|
|
|
|
|
2010-06-18 11:44:31 +02:00
|
|
|
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
|
|
|
|
unsigned int cpu, int page_idx)
|
2009-02-20 08:29:08 +01:00
|
|
|
{
|
2009-08-14 08:00:51 +02:00
|
|
|
return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
|
2009-08-14 08:00:51 +02:00
|
|
|
(page_idx << PAGE_SHIFT);
|
2009-02-20 08:29:08 +01:00
|
|
|
}
|
|
|
|
|
2010-04-09 11:57:01 +02:00
|
|
|
/*
 * Starting the search at *@rs, set *@rs to the next clear bit in
 * @chunk->populated and *@re to the first set bit after it.  Both
 * searches are bounded by @end.
 */
static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
					   int *rs, int *re, int end)
{
	int first = find_next_zero_bit(chunk->populated, end, *rs);

	*rs = first;
	*re = find_next_bit(chunk->populated, end, first + 1);
}
|
|
|
|
|
2010-04-09 11:57:01 +02:00
|
|
|
/*
 * Starting the search at *@rs, set *@rs to the next set bit in
 * @chunk->populated and *@re to the first clear bit after it.  Both
 * searches are bounded by @end.
 */
static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
					 int *rs, int *re, int end)
{
	int first = find_next_bit(chunk->populated, end, *rs);

	*rs = first;
	*re = find_next_zero_bit(chunk->populated, end, first + 1);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* (Un)populated page region iterators. Iterate over (un)populated
|
tree-wide: fix comment/printk typos
"gadget", "through", "command", "maintain", "maintain", "controller", "address",
"between", "initiali[zs]e", "instead", "function", "select", "already",
"equal", "access", "management", "hierarchy", "registration", "interest",
"relative", "memory", "offset", "already",
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
2010-11-01 20:38:34 +01:00
|
|
|
* page regions between @start and @end in @chunk. @rs and @re should
|
2009-07-04 01:11:00 +02:00
|
|
|
* be integer variables and will be set to start and end page index of
|
|
|
|
* the current region.
|
|
|
|
*/
|
|
|
|
/* walk [@rs, @re) over each region found by pcpu_next_unpop() */
#define pcpu_for_each_unpop_region(chunk, rs, re, start, end)		    \
	for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
	     (re) > (rs);						    \
	     (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
|
|
|
|
|
|
|
|
/* walk [@rs, @re) over each region found by pcpu_next_pop() */
#define pcpu_for_each_pop_region(chunk, rs, re, start, end)		    \
	for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end));   \
	     (re) > (rs);						    \
	     (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
|
|
|
|
|
2009-02-20 08:29:08 +01:00
|
|
|
/**
|
2011-08-04 11:02:33 +02:00
|
|
|
* pcpu_mem_zalloc - allocate memory
|
2009-03-06 16:44:09 +01:00
|
|
|
* @size: bytes to allocate
|
2009-02-20 08:29:08 +01:00
|
|
|
*
|
2009-03-06 16:44:09 +01:00
|
|
|
* Allocate @size bytes. If @size is no larger than PAGE_SIZE,
|
2011-08-04 11:02:33 +02:00
|
|
|
* kzalloc() is used; otherwise, vzalloc() is used. The returned
|
2009-03-06 16:44:09 +01:00
|
|
|
* memory is always zeroed.
|
2009-02-20 08:29:08 +01:00
|
|
|
*
|
2009-03-06 16:44:13 +01:00
|
|
|
* CONTEXT:
|
|
|
|
* Does GFP_KERNEL allocation.
|
|
|
|
*
|
2009-02-20 08:29:08 +01:00
|
|
|
* RETURNS:
|
2009-03-06 16:44:09 +01:00
|
|
|
* Pointer to the allocated area on success, NULL on failure.
|
2009-02-20 08:29:08 +01:00
|
|
|
*/
|
2011-08-04 11:02:33 +02:00
|
|
|
static void *pcpu_mem_zalloc(size_t size)
{
	/* warn once and bail out if called before slab is available */
	if (WARN_ON_ONCE(!slab_is_available()))
		return NULL;

	/* anything above a page goes to vzalloc(), the rest to kzalloc() */
	if (size > PAGE_SIZE)
		return vzalloc(size);

	return kzalloc(size, GFP_KERNEL);
}
|
2009-02-20 08:29:08 +01:00
|
|
|
|
2009-03-06 16:44:09 +01:00
|
|
|
/**
|
|
|
|
* pcpu_mem_free - free memory
|
|
|
|
* @ptr: memory to free
|
|
|
|
*
|
2011-08-04 11:02:33 +02:00
|
|
|
* Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc().
|
2009-03-06 16:44:09 +01:00
|
|
|
*/
|
2016-01-23 00:11:02 +01:00
|
|
|
static void pcpu_mem_free(void *ptr)
{
	/*
	 * pcpu_mem_zalloc() hands out either kzalloc() or vzalloc()
	 * memory, hence kvfree() rather than a specific free routine.
	 */
	kvfree(ptr);
}
|
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
/**
|
|
|
|
* pcpu_count_occupied_pages - count the number of pages an area occupies
|
|
|
|
* @chunk: chunk of interest
|
|
|
|
* @i: index of the area in question
|
|
|
|
*
|
|
|
|
* Count the number of pages chunk's @i'th area occupies. When the area's
|
|
|
|
* start and/or end address isn't aligned to page boundary, the straddled
|
|
|
|
* page is included in the count iff the rest of the page is free.
|
|
|
|
*/
|
|
|
|
static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
|
|
|
|
{
|
|
|
|
int off = chunk->map[i] & ~1;
|
|
|
|
int end = chunk->map[i + 1] & ~1;
|
|
|
|
|
|
|
|
if (!PAGE_ALIGNED(off) && i > 0) {
|
|
|
|
int prev = chunk->map[i - 1];
|
|
|
|
|
|
|
|
if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
|
|
|
|
off = round_down(off, PAGE_SIZE);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
|
|
|
|
int next = chunk->map[i + 1];
|
|
|
|
int nend = chunk->map[i + 2] & ~1;
|
|
|
|
|
|
|
|
if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
|
|
|
|
end = round_up(end, PAGE_SIZE);
|
|
|
|
}
|
|
|
|
|
|
|
|
return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
|
|
|
|
}
|
|
|
|
|
2009-02-20 08:29:08 +01:00
|
|
|
/**
|
|
|
|
* pcpu_chunk_relocate - put chunk in the appropriate chunk slot
|
|
|
|
* @chunk: chunk of interest
|
|
|
|
* @oslot: the previous slot it was on
|
|
|
|
*
|
|
|
|
* This function is called after an allocation or free changed @chunk.
|
|
|
|
* New slot according to the changed state is determined and @chunk is
|
2009-03-06 06:33:59 +01:00
|
|
|
* moved to the slot. Note that the reserved chunk is never put on
|
|
|
|
* chunk slots.
|
2009-03-06 16:44:13 +01:00
|
|
|
*
|
|
|
|
* CONTEXT:
|
|
|
|
* pcpu_lock.
|
2009-02-20 08:29:08 +01:00
|
|
|
*/
|
|
|
|
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
|
|
|
|
{
|
|
|
|
int nslot = pcpu_chunk_slot(chunk);
|
|
|
|
|
2009-03-06 06:33:59 +01:00
|
|
|
if (chunk != pcpu_reserved_chunk && oslot != nslot) {
|
2009-02-20 08:29:08 +01:00
|
|
|
if (oslot < nslot)
|
|
|
|
list_move(&chunk->list, &pcpu_slot[nslot]);
|
|
|
|
else
|
|
|
|
list_move_tail(&chunk->list, &pcpu_slot[nslot]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-03-06 16:44:09 +01:00
|
|
|
/**
|
2009-11-11 07:35:18 +01:00
|
|
|
* pcpu_need_to_extend - determine whether chunk area map needs to be extended
|
|
|
|
* @chunk: chunk of interest
|
2014-09-02 20:46:05 +02:00
|
|
|
* @is_atomic: the allocation context
|
2009-03-06 16:44:09 +01:00
|
|
|
*
|
2014-09-02 20:46:05 +02:00
|
|
|
* Determine whether area map of @chunk needs to be extended. If
|
|
|
|
* @is_atomic, only the amount necessary for a new allocation is
|
|
|
|
* considered; however, async extension is scheduled if the left amount is
|
|
|
|
* low. If !@is_atomic, it aims for more empty space. Combined, this
|
|
|
|
* ensures that the map is likely to have enough available space to
|
|
|
|
* accommodate atomic allocations which can't extend maps directly.
|
2009-03-06 16:44:09 +01:00
|
|
|
*
|
2009-03-06 16:44:13 +01:00
|
|
|
* CONTEXT:
|
2009-11-11 07:35:18 +01:00
|
|
|
* pcpu_lock.
|
2009-03-06 16:44:13 +01:00
|
|
|
*
|
2009-03-06 16:44:09 +01:00
|
|
|
* RETURNS:
|
2009-11-11 07:35:18 +01:00
|
|
|
* New target map allocation length if extension is necessary, 0
|
|
|
|
* otherwise.
|
2009-03-06 16:44:09 +01:00
|
|
|
*/
|
2014-09-02 20:46:05 +02:00
|
|
|
static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
{
	int margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
	int target;

	lockdep_assert_held(&pcpu_lock);

	if (is_atomic) {
		/* atomic context: only insist on a minimal margin now */
		margin = 3;

		/*
		 * Running low: put the chunk on the extension list (unless
		 * it is already queued) and schedule the balance work.
		 */
		if (chunk->map_alloc <
		    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
		    list_empty(&chunk->map_extend_list)) {
			list_add_tail(&chunk->map_extend_list,
				      &pcpu_map_extend_chunks);
			pcpu_schedule_balance_work();
		}
	}

	if (chunk->map_alloc >= chunk->map_used + margin)
		return 0;

	/* double from the default size until used entries + margin fit */
	for (target = PCPU_DFL_MAP_ALLOC;
	     target < chunk->map_used + margin;
	     target *= 2)
		;

	return target;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* pcpu_extend_area_map - extend area map of a chunk
|
|
|
|
* @chunk: chunk of interest
|
|
|
|
* @new_alloc: new target allocation length of the area map
|
|
|
|
*
|
|
|
|
* Extend area map of @chunk to have @new_alloc entries.
|
|
|
|
*
|
|
|
|
* CONTEXT:
|
|
|
|
* Does GFP_KERNEL allocation. Grabs and releases pcpu_lock.
|
|
|
|
*
|
|
|
|
* RETURNS:
|
|
|
|
* 0 on success, -errno on failure.
|
|
|
|
*/
|
|
|
|
static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
|
|
|
|
{
|
|
|
|
int *old = NULL, *new = NULL;
|
|
|
|
size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
|
|
|
|
unsigned long flags;
|
|
|
|
|
2016-05-25 17:48:25 +02:00
|
|
|
lockdep_assert_held(&pcpu_alloc_mutex);
|
|
|
|
|
2011-08-04 11:02:33 +02:00
|
|
|
new = pcpu_mem_zalloc(new_size);
|
2009-11-11 07:35:18 +01:00
|
|
|
if (!new)
|
2009-03-06 16:44:09 +01:00
|
|
|
return -ENOMEM;
|
2009-03-06 16:44:13 +01:00
|
|
|
|
2009-11-11 07:35:18 +01:00
|
|
|
/* acquire pcpu_lock and switch to new area map */
|
|
|
|
spin_lock_irqsave(&pcpu_lock, flags);
|
|
|
|
|
|
|
|
if (new_alloc <= chunk->map_alloc)
|
|
|
|
goto out_unlock;
|
2009-03-06 16:44:09 +01:00
|
|
|
|
2009-11-11 07:35:18 +01:00
|
|
|
old_size = chunk->map_alloc * sizeof(chunk->map[0]);
|
2010-08-08 14:39:07 +02:00
|
|
|
old = chunk->map;
|
|
|
|
|
|
|
|
memcpy(new, old, old_size);
|
2009-03-06 16:44:09 +01:00
|
|
|
|
|
|
|
chunk->map_alloc = new_alloc;
|
|
|
|
chunk->map = new;
|
2009-11-11 07:35:18 +01:00
|
|
|
new = NULL;
|
|
|
|
|
|
|
|
out_unlock:
|
|
|
|
spin_unlock_irqrestore(&pcpu_lock, flags);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pcpu_mem_free() might end up calling vfree() which uses
|
|
|
|
* IRQ-unsafe lock and thus can't be called under pcpu_lock.
|
|
|
|
*/
|
2016-01-23 00:11:02 +01:00
|
|
|
pcpu_mem_free(old);
|
|
|
|
pcpu_mem_free(new);
|
2009-11-11 07:35:18 +01:00
|
|
|
|
2009-03-06 16:44:09 +01:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-09-02 20:46:02 +02:00
|
|
|
/**
|
|
|
|
* pcpu_fit_in_area - try to fit the requested allocation in a candidate area
|
|
|
|
* @chunk: chunk the candidate area belongs to
|
|
|
|
* @off: the offset to the start of the candidate area
|
|
|
|
* @this_size: the size of the candidate area
|
|
|
|
* @size: the size of the target allocation
|
|
|
|
* @align: the alignment of the target allocation
|
|
|
|
* @pop_only: only allocate from already populated region
|
|
|
|
*
|
|
|
|
* We're trying to allocate @size bytes aligned at @align. @chunk's area
|
|
|
|
* at @off sized @this_size is a candidate. This function determines
|
|
|
|
* whether the target allocation fits in the candidate area and returns the
|
|
|
|
* number of bytes to pad after @off. If the target area doesn't fit, -1
|
|
|
|
* is returned.
|
|
|
|
*
|
|
|
|
* If @pop_only is %true, this function only considers the already
|
|
|
|
* populated part of the candidate area.
|
|
|
|
*/
|
|
|
|
static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
			    int size, int align, bool pop_only)
{
	int cand_off;

	for (cand_off = off; ; ) {
		/* padding needed after @off to align the candidate offset */
		int head = ALIGN(cand_off, align) - off;
		int page_end, unpop_start, unpop_end;

		if (head + size > this_size)
			return -1;

		/* population state is irrelevant: the first fit wins */
		if (!pop_only)
			return head;

		/*
		 * Look for the first unpopulated page at or after the
		 * allocation start.  If it lies at or beyond the end of
		 * the allocation, every page of it is populated.
		 */
		unpop_start = PFN_DOWN(head + off);
		page_end = PFN_UP(head + off + size);

		pcpu_next_unpop(chunk, &unpop_start, &unpop_end,
				PFN_UP(off + this_size));
		if (unpop_start >= page_end)
			return head;

		/* retry from the end of the unpopulated region */
		cand_off = unpop_end * PAGE_SIZE;
	}
}
|
|
|
|
|
2009-02-20 08:29:08 +01:00
|
|
|
/**
|
|
|
|
* pcpu_alloc_area - allocate area from a pcpu_chunk
|
|
|
|
* @chunk: chunk of interest
|
2009-02-21 08:56:23 +01:00
|
|
|
* @size: wanted size in bytes
|
2009-02-20 08:29:08 +01:00
|
|
|
* @align: wanted align
|
2014-09-02 20:46:02 +02:00
|
|
|
* @pop_only: allocate only from the populated area
|
2014-09-02 20:46:05 +02:00
|
|
|
* @occ_pages_p: out param for the number of pages the area occupies
|
2009-02-20 08:29:08 +01:00
|
|
|
*
|
|
|
|
* Try to allocate @size bytes area aligned at @align from @chunk.
|
|
|
|
* Note that this function only allocates the offset. It doesn't
|
|
|
|
* populate or map the area.
|
|
|
|
*
|
2009-03-06 16:44:09 +01:00
|
|
|
* @chunk->map must have at least two free slots.
|
|
|
|
*
|
2009-03-06 16:44:13 +01:00
|
|
|
* CONTEXT:
|
|
|
|
* pcpu_lock.
|
|
|
|
*
|
2009-02-20 08:29:08 +01:00
|
|
|
* RETURNS:
|
2009-03-06 16:44:09 +01:00
|
|
|
* Allocated offset in @chunk on success, -1 if no matching area is
|
|
|
|
* found.
|
2009-02-20 08:29:08 +01:00
|
|
|
*/
|
2014-09-02 20:46:02 +02:00
|
|
|
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
|
2014-09-02 20:46:05 +02:00
|
|
|
bool pop_only, int *occ_pages_p)
|
2009-02-20 08:29:08 +01:00
|
|
|
{
|
|
|
|
int oslot = pcpu_chunk_slot(chunk);
|
|
|
|
int max_contig = 0;
|
|
|
|
int i, off;
|
2014-03-07 02:52:32 +01:00
|
|
|
bool seen_free = false;
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
int *p;
|
2009-02-20 08:29:08 +01:00
|
|
|
|
2014-03-07 02:52:32 +01:00
|
|
|
for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) {
|
2009-02-20 08:29:08 +01:00
|
|
|
int head, tail;
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
int this_size;
|
|
|
|
|
|
|
|
off = *p;
|
|
|
|
if (off & 1)
|
|
|
|
continue;
|
2009-02-20 08:29:08 +01:00
|
|
|
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
this_size = (p[1] & ~1) - off;
|
2014-09-02 20:46:02 +02:00
|
|
|
|
|
|
|
head = pcpu_fit_in_area(chunk, off, this_size, size, align,
|
|
|
|
pop_only);
|
|
|
|
if (head < 0) {
|
2014-03-07 02:52:32 +01:00
|
|
|
if (!seen_free) {
|
|
|
|
chunk->first_free = i;
|
|
|
|
seen_free = true;
|
|
|
|
}
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
max_contig = max(this_size, max_contig);
|
2009-02-20 08:29:08 +01:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If head is small or the previous block is free,
|
|
|
|
* merge'em. Note that 'small' is defined as smaller
|
|
|
|
* than sizeof(int), which is very small but isn't too
|
|
|
|
* uncommon for percpu allocations.
|
|
|
|
*/
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
if (head && (head < sizeof(int) || !(p[-1] & 1))) {
|
2014-03-28 13:55:21 +01:00
|
|
|
*p = off += head;
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
if (p[-1] & 1)
|
2009-02-20 08:29:08 +01:00
|
|
|
chunk->free_size -= head;
|
2014-03-28 13:55:21 +01:00
|
|
|
else
|
|
|
|
max_contig = max(*p - p[-1], max_contig);
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
this_size -= head;
|
2009-02-20 08:29:08 +01:00
|
|
|
head = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* if tail is small, just keep it around */
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
tail = this_size - head - size;
|
|
|
|
if (tail < sizeof(int)) {
|
2009-02-20 08:29:08 +01:00
|
|
|
tail = 0;
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
size = this_size - head;
|
|
|
|
}
|
2009-02-20 08:29:08 +01:00
|
|
|
|
|
|
|
/* split if warranted */
|
|
|
|
if (head || tail) {
|
2014-03-07 03:08:24 +01:00
|
|
|
int nr_extra = !!head + !!tail;
|
|
|
|
|
|
|
|
/* insert new subblocks */
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
memmove(p + nr_extra + 1, p + 1,
|
2014-03-07 03:08:24 +01:00
|
|
|
sizeof(chunk->map[0]) * (chunk->map_used - i));
|
|
|
|
chunk->map_used += nr_extra;
|
|
|
|
|
2009-02-20 08:29:08 +01:00
|
|
|
if (head) {
|
2014-03-07 02:52:32 +01:00
|
|
|
if (!seen_free) {
|
|
|
|
chunk->first_free = i;
|
|
|
|
seen_free = true;
|
|
|
|
}
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
*++p = off += head;
|
|
|
|
++i;
|
2014-03-07 03:08:24 +01:00
|
|
|
max_contig = max(head, max_contig);
|
|
|
|
}
|
|
|
|
if (tail) {
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
p[1] = off + size;
|
2014-03-07 03:08:24 +01:00
|
|
|
max_contig = max(tail, max_contig);
|
2009-02-20 08:29:08 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-03-07 02:52:32 +01:00
|
|
|
if (!seen_free)
|
|
|
|
chunk->first_free = i + 1;
|
|
|
|
|
2009-02-20 08:29:08 +01:00
|
|
|
/* update hint and mark allocated */
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
if (i + 1 == chunk->map_used)
|
2009-02-20 08:29:08 +01:00
|
|
|
chunk->contig_hint = max_contig; /* fully scanned */
|
|
|
|
else
|
|
|
|
chunk->contig_hint = max(chunk->contig_hint,
|
|
|
|
max_contig);
|
|
|
|
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
chunk->free_size -= size;
|
|
|
|
*p |= 1;
|
2009-02-20 08:29:08 +01:00
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
*occ_pages_p = pcpu_count_occupied_pages(chunk, i);
|
2009-02-20 08:29:08 +01:00
|
|
|
pcpu_chunk_relocate(chunk, oslot);
|
|
|
|
return off;
|
|
|
|
}
|
|
|
|
|
|
|
|
chunk->contig_hint = max_contig; /* fully scanned */
|
|
|
|
pcpu_chunk_relocate(chunk, oslot);
|
|
|
|
|
2009-03-06 16:44:09 +01:00
|
|
|
/* tell the upper layer that this chunk has no matching area */
|
|
|
|
return -1;
|
2009-02-20 08:29:08 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* pcpu_free_area - free area to a pcpu_chunk
|
|
|
|
* @chunk: chunk of interest
|
|
|
|
* @freeme: offset of area to free
|
2014-09-02 20:46:05 +02:00
|
|
|
* @occ_pages_p: out param for the number of pages the area occupies
|
2009-02-20 08:29:08 +01:00
|
|
|
*
|
|
|
|
* Free area starting from @freeme to @chunk. Note that this function
|
|
|
|
* only modifies the allocation map. It doesn't depopulate or unmap
|
|
|
|
* the area.
|
2009-03-06 16:44:13 +01:00
|
|
|
*
|
|
|
|
* CONTEXT:
|
|
|
|
* pcpu_lock.
|
2009-02-20 08:29:08 +01:00
|
|
|
*/
|
2014-09-02 20:46:05 +02:00
|
|
|
static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
|
|
|
|
int *occ_pages_p)
|
2009-02-20 08:29:08 +01:00
|
|
|
{
|
|
|
|
int oslot = pcpu_chunk_slot(chunk);
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry referring to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonic, so a simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
int off = 0;
|
|
|
|
unsigned i, j;
|
|
|
|
int to_free = 0;
|
|
|
|
int *p;
|
|
|
|
|
|
|
|
freeme |= 1; /* we are searching for <given offset, in use> pair */
|
|
|
|
|
|
|
|
i = 0;
|
|
|
|
j = chunk->map_used;
|
|
|
|
while (i != j) {
|
|
|
|
unsigned k = (i + j) / 2;
|
|
|
|
off = chunk->map[k];
|
|
|
|
if (off < freeme)
|
|
|
|
i = k + 1;
|
|
|
|
else if (off > freeme)
|
|
|
|
j = k;
|
|
|
|
else
|
|
|
|
i = j = k;
|
|
|
|
}
|
2009-02-20 08:29:08 +01:00
|
|
|
BUG_ON(off != freeme);
|
|
|
|
|
2014-03-07 02:52:32 +01:00
|
|
|
if (i < chunk->first_free)
|
|
|
|
chunk->first_free = i;
|
|
|
|
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
p = chunk->map + i;
|
|
|
|
*p = off &= ~1;
|
|
|
|
chunk->free_size += (p[1] & ~1) - off;
|
2009-02-20 08:29:08 +01:00
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
*occ_pages_p = pcpu_count_occupied_pages(chunk, i);
|
|
|
|
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
/* merge with next? */
|
|
|
|
if (!(p[1] & 1))
|
|
|
|
to_free++;
|
2009-02-20 08:29:08 +01:00
|
|
|
/* merge with previous? */
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
if (i > 0 && !(p[-1] & 1)) {
|
|
|
|
to_free++;
|
2009-02-20 08:29:08 +01:00
|
|
|
i--;
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
p--;
|
2009-02-20 08:29:08 +01:00
|
|
|
}
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
if (to_free) {
|
|
|
|
chunk->map_used -= to_free;
|
|
|
|
memmove(p + 1, p + 1 + to_free,
|
|
|
|
(chunk->map_used - i) * sizeof(chunk->map[0]));
|
2009-02-20 08:29:08 +01:00
|
|
|
}
|
|
|
|
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint);
|
2009-02-20 08:29:08 +01:00
|
|
|
pcpu_chunk_relocate(chunk, oslot);
|
|
|
|
}
|
|
|
|
|
2010-04-09 11:57:01 +02:00
|
|
|
static struct pcpu_chunk *pcpu_alloc_chunk(void)
|
|
|
|
{
|
|
|
|
struct pcpu_chunk *chunk;
|
|
|
|
|
2011-08-04 11:02:33 +02:00
|
|
|
chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
|
2010-04-09 11:57:01 +02:00
|
|
|
if (!chunk)
|
|
|
|
return NULL;
|
|
|
|
|
2011-08-04 11:02:33 +02:00
|
|
|
chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
|
|
|
|
sizeof(chunk->map[0]));
|
2010-04-09 11:57:01 +02:00
|
|
|
if (!chunk->map) {
|
2016-01-23 00:11:02 +01:00
|
|
|
pcpu_mem_free(chunk);
|
2010-04-09 11:57:01 +02:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
chunk->map[0] = 0;
|
|
|
|
chunk->map[1] = pcpu_unit_size | 1;
|
|
|
|
chunk->map_used = 1;
|
2010-04-09 11:57:01 +02:00
|
|
|
|
|
|
|
INIT_LIST_HEAD(&chunk->list);
|
2016-05-25 17:48:25 +02:00
|
|
|
INIT_LIST_HEAD(&chunk->map_extend_list);
|
2010-04-09 11:57:01 +02:00
|
|
|
chunk->free_size = pcpu_unit_size;
|
|
|
|
chunk->contig_hint = pcpu_unit_size;
|
|
|
|
|
|
|
|
return chunk;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void pcpu_free_chunk(struct pcpu_chunk *chunk)
|
|
|
|
{
|
|
|
|
if (!chunk)
|
|
|
|
return;
|
2016-01-23 00:11:02 +01:00
|
|
|
pcpu_mem_free(chunk->map);
|
|
|
|
pcpu_mem_free(chunk);
|
2010-04-09 11:57:01 +02:00
|
|
|
}
|
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
/**
|
|
|
|
* pcpu_chunk_populated - post-population bookkeeping
|
|
|
|
* @chunk: pcpu_chunk which got populated
|
|
|
|
* @page_start: the start page
|
|
|
|
* @page_end: the end page
|
|
|
|
*
|
|
|
|
* Pages in [@page_start,@page_end) have been populated to @chunk. Update
|
|
|
|
* the bookkeeping information accordingly. Must be called after each
|
|
|
|
* successful population.
|
|
|
|
*/
|
|
|
|
static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
|
|
|
|
int page_start, int page_end)
|
|
|
|
{
|
|
|
|
int nr = page_end - page_start;
|
|
|
|
|
|
|
|
lockdep_assert_held(&pcpu_lock);
|
|
|
|
|
|
|
|
bitmap_set(chunk->populated, page_start, nr);
|
|
|
|
chunk->nr_populated += nr;
|
|
|
|
pcpu_nr_empty_pop_pages += nr;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* pcpu_chunk_depopulated - post-depopulation bookkeeping
|
|
|
|
* @chunk: pcpu_chunk which got depopulated
|
|
|
|
* @page_start: the start page
|
|
|
|
* @page_end: the end page
|
|
|
|
*
|
|
|
|
* Pages in [@page_start,@page_end) have been depopulated from @chunk.
|
|
|
|
* Update the bookkeeping information accordingly. Must be called after
|
|
|
|
* each successful depopulation.
|
|
|
|
*/
|
|
|
|
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
|
|
|
|
int page_start, int page_end)
|
|
|
|
{
|
|
|
|
int nr = page_end - page_start;
|
|
|
|
|
|
|
|
lockdep_assert_held(&pcpu_lock);
|
|
|
|
|
|
|
|
bitmap_clear(chunk->populated, page_start, nr);
|
|
|
|
chunk->nr_populated -= nr;
|
|
|
|
pcpu_nr_empty_pop_pages -= nr;
|
|
|
|
}
|
|
|
|
|
2010-04-09 11:57:01 +02:00
|
|
|
/*
|
|
|
|
* Chunk management implementation.
|
|
|
|
*
|
|
|
|
* To allow different implementations, chunk alloc/free and
|
|
|
|
* [de]population are implemented in a separate file which is pulled
|
|
|
|
* into this file and compiled together. The following functions
|
|
|
|
* should be implemented.
|
|
|
|
*
|
|
|
|
* pcpu_populate_chunk - populate the specified range of a chunk
|
|
|
|
* pcpu_depopulate_chunk - depopulate the specified range of a chunk
|
|
|
|
* pcpu_create_chunk - create a new chunk
|
|
|
|
* pcpu_destroy_chunk - destroy a chunk, always preceded by full depop
|
|
|
|
 * pcpu_addr_to_page - translate address to its struct page
|
|
|
|
* pcpu_verify_alloc_info - check alloc_info is acceptable during init
|
2009-02-20 08:29:08 +01:00
|
|
|
*/
|
2010-04-09 11:57:01 +02:00
|
|
|
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
|
|
|
|
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
|
|
|
|
static struct pcpu_chunk *pcpu_create_chunk(void);
|
|
|
|
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
|
|
|
|
static struct page *pcpu_addr_to_page(void *addr);
|
|
|
|
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
|
2009-02-20 08:29:08 +01:00
|
|
|
|
2010-04-09 11:57:01 +02:00
|
|
|
#ifdef CONFIG_NEED_PER_CPU_KM
|
|
|
|
#include "percpu-km.c"
|
|
|
|
#else
|
2010-04-09 11:57:01 +02:00
|
|
|
#include "percpu-vm.c"
|
2010-04-09 11:57:01 +02:00
|
|
|
#endif
|
2009-02-20 08:29:08 +01:00
|
|
|
|
2010-04-09 11:57:01 +02:00
|
|
|
/**
|
|
|
|
* pcpu_chunk_addr_search - determine chunk containing specified address
|
|
|
|
* @addr: address for which the chunk needs to be determined.
|
|
|
|
*
|
|
|
|
* RETURNS:
|
|
|
|
* The address of the found chunk.
|
|
|
|
*/
|
|
|
|
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
|
|
|
|
{
|
|
|
|
/* is it in the first chunk? */
|
|
|
|
if (pcpu_addr_in_first_chunk(addr)) {
|
|
|
|
/* is it in the reserved area? */
|
|
|
|
if (pcpu_addr_in_reserved_chunk(addr))
|
|
|
|
return pcpu_reserved_chunk;
|
|
|
|
return pcpu_first_chunk;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The address is relative to unit0 which might be unused and
|
|
|
|
* thus unmapped. Offset the address to the unit space of the
|
|
|
|
* current processor before looking it up in the vmalloc
|
|
|
|
* space. Note that any possible cpu id can be used here, so
|
|
|
|
* there's no need to worry about preemption or cpu hotplug.
|
|
|
|
*/
|
|
|
|
addr += pcpu_unit_offsets[raw_smp_processor_id()];
|
2010-04-09 11:57:01 +02:00
|
|
|
return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
|
2010-04-09 11:57:01 +02:00
|
|
|
}
|
|
|
|
|
2009-02-20 08:29:08 +01:00
|
|
|
/**
|
2009-03-06 06:33:59 +01:00
|
|
|
* pcpu_alloc - the percpu allocator
|
2009-02-21 08:56:23 +01:00
|
|
|
* @size: size of area to allocate in bytes
|
2009-02-20 08:29:08 +01:00
|
|
|
* @align: alignment of area (max PAGE_SIZE)
|
2009-03-06 06:33:59 +01:00
|
|
|
* @reserved: allocate from the reserved chunk if available
|
2014-09-02 20:46:04 +02:00
|
|
|
* @gfp: allocation flags
|
2009-02-20 08:29:08 +01:00
|
|
|
*
|
2014-09-02 20:46:04 +02:00
|
|
|
* Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't
|
|
|
|
* contain %GFP_KERNEL, the allocation is atomic.
|
2009-02-20 08:29:08 +01:00
|
|
|
*
|
|
|
|
* RETURNS:
|
|
|
|
* Percpu pointer to the allocated area on success, NULL on failure.
|
|
|
|
*/
|
2014-09-02 20:46:04 +02:00
|
|
|
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
|
|
|
|
gfp_t gfp)
|
2009-02-20 08:29:08 +01:00
|
|
|
{
|
2009-09-29 02:17:58 +02:00
|
|
|
static int warn_limit = 10;
|
2009-02-20 08:29:08 +01:00
|
|
|
struct pcpu_chunk *chunk;
|
2009-09-29 02:17:58 +02:00
|
|
|
const char *err;
|
2014-10-08 18:01:52 +02:00
|
|
|
bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
|
2014-09-02 20:46:05 +02:00
|
|
|
int occ_pages = 0;
|
2014-09-02 20:46:02 +02:00
|
|
|
int slot, off, new_alloc, cpu, ret;
|
2009-10-28 16:25:59 +01:00
|
|
|
unsigned long flags;
|
2011-09-26 18:12:53 +02:00
|
|
|
void __percpu *ptr;
|
2009-02-20 08:29:08 +01:00
|
|
|
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
/*
|
|
|
|
* We want the lowest bit of offset available for in-use/free
|
2014-03-17 21:01:27 +01:00
|
|
|
* indicator, so force >= 16bit alignment and make size even.
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry refering to the area
in question; again, the need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonous, so simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
*/
|
|
|
|
if (unlikely(align < 2))
|
|
|
|
align = 2;
|
|
|
|
|
2014-06-19 16:59:18 +02:00
|
|
|
size = ALIGN(size, 2);
|
2014-03-17 21:01:27 +01:00
|
|
|
|
percpu: ensure the requested alignment is power of two
The percpu allocator expectedly assumes that the requested alignment
is power of two but hasn't been veryfing the input. If the specified
alignment isn't power of two, the allocator can malfunction. Add the
sanity check.
The following is detailed analysis of the effects of alignments which
aren't power of two.
The alignment must be a even at least since the LSB of a chunk->map
element is used as free/in-use flag of a area; besides, the alignment
must be a power of 2 too since ALIGN() doesn't work well for other
alignment always but is adopted by pcpu_fit_in_area(). IOW, the
current allocator only works well for a power of 2 aligned area
allocation.
See below opposite example for why an odd alignment doesn't work.
Let's assume area [16, 36) is free but its previous one is in-use, we
want to allocate a @size == 8 and @align == 7 area. The larger area
[16, 36) is split to three areas [16, 21), [21, 29), [29, 36)
eventually. However, due to the usage for a chunk->map element, the
actual offset of the aim area [21, 29) is 21 but is recorded in
relevant element as 20; moreover, the residual tail free area [29,
36) is mistook as in-use and is lost silently
Unlike macro roundup(), ALIGN(x, a) doesn't work if @a isn't a power
of 2 for example, roundup(10, 6) == 12 but ALIGN(10, 6) == 10, and
the latter result isn't desired obviously.
tj: Code style and patch description updates.
Signed-off-by: zijun_hu <zijun_hu@htc.com>
Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-10-14 09:12:54 +02:00
|
|
|
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
|
|
|
|
!is_power_of_2(align))) {
|
2016-03-17 22:19:47 +01:00
|
|
|
WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n",
|
|
|
|
size, align);
|
2009-02-20 08:29:08 +01:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2016-05-25 17:48:25 +02:00
|
|
|
if (!is_atomic)
|
|
|
|
mutex_lock(&pcpu_alloc_mutex);
|
|
|
|
|
2009-10-28 16:25:59 +01:00
|
|
|
spin_lock_irqsave(&pcpu_lock, flags);
|
2009-02-20 08:29:08 +01:00
|
|
|
|
2009-03-06 06:33:59 +01:00
|
|
|
/* serve reserved allocations from the reserved chunk if available */
|
|
|
|
if (reserved && pcpu_reserved_chunk) {
|
|
|
|
chunk = pcpu_reserved_chunk;
|
2009-11-11 07:35:18 +01:00
|
|
|
|
|
|
|
if (size > chunk->contig_hint) {
|
|
|
|
err = "alloc from reserved chunk failed";
|
2009-03-06 16:44:13 +01:00
|
|
|
goto fail_unlock;
|
2009-09-29 02:17:58 +02:00
|
|
|
}
|
2009-11-11 07:35:18 +01:00
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
|
2009-11-11 07:35:18 +01:00
|
|
|
spin_unlock_irqrestore(&pcpu_lock, flags);
|
2014-09-02 20:46:04 +02:00
|
|
|
if (is_atomic ||
|
|
|
|
pcpu_extend_area_map(chunk, new_alloc) < 0) {
|
2009-11-11 07:35:18 +01:00
|
|
|
err = "failed to extend area map of reserved chunk";
|
2014-09-02 20:46:02 +02:00
|
|
|
goto fail;
|
2009-11-11 07:35:18 +01:00
|
|
|
}
|
|
|
|
spin_lock_irqsave(&pcpu_lock, flags);
|
|
|
|
}
|
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
off = pcpu_alloc_area(chunk, size, align, is_atomic,
|
|
|
|
&occ_pages);
|
2009-03-06 06:33:59 +01:00
|
|
|
if (off >= 0)
|
|
|
|
goto area_found;
|
2009-11-11 07:35:18 +01:00
|
|
|
|
2009-09-29 02:17:58 +02:00
|
|
|
err = "alloc from reserved chunk failed";
|
2009-03-06 16:44:13 +01:00
|
|
|
goto fail_unlock;
|
2009-03-06 06:33:59 +01:00
|
|
|
}
|
|
|
|
|
2009-03-06 16:44:13 +01:00
|
|
|
restart:
|
2009-03-06 06:33:59 +01:00
|
|
|
/* search through normal chunks */
|
2009-02-20 08:29:08 +01:00
|
|
|
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
|
|
|
|
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
|
|
|
|
if (size > chunk->contig_hint)
|
|
|
|
continue;
|
2009-03-06 16:44:13 +01:00
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
new_alloc = pcpu_need_to_extend(chunk, is_atomic);
|
2009-11-11 07:35:18 +01:00
|
|
|
if (new_alloc) {
|
2014-09-02 20:46:04 +02:00
|
|
|
if (is_atomic)
|
|
|
|
continue;
|
2009-11-11 07:35:18 +01:00
|
|
|
spin_unlock_irqrestore(&pcpu_lock, flags);
|
|
|
|
if (pcpu_extend_area_map(chunk,
|
|
|
|
new_alloc) < 0) {
|
|
|
|
err = "failed to extend area map";
|
2014-09-02 20:46:02 +02:00
|
|
|
goto fail;
|
2009-11-11 07:35:18 +01:00
|
|
|
}
|
|
|
|
spin_lock_irqsave(&pcpu_lock, flags);
|
|
|
|
/*
|
|
|
|
* pcpu_lock has been dropped, need to
|
|
|
|
* restart cpu_slot list walking.
|
|
|
|
*/
|
|
|
|
goto restart;
|
2009-03-06 16:44:13 +01:00
|
|
|
}
|
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
off = pcpu_alloc_area(chunk, size, align, is_atomic,
|
|
|
|
&occ_pages);
|
2009-02-20 08:29:08 +01:00
|
|
|
if (off >= 0)
|
|
|
|
goto area_found;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-10-28 16:25:59 +01:00
|
|
|
spin_unlock_irqrestore(&pcpu_lock, flags);
|
2009-03-06 16:44:13 +01:00
|
|
|
|
2014-09-02 20:46:02 +02:00
|
|
|
/*
|
|
|
|
* No space left. Create a new chunk. We don't want multiple
|
|
|
|
* tasks to create chunks simultaneously. Serialize and create iff
|
|
|
|
* there's still no empty chunk after grabbing the mutex.
|
|
|
|
*/
|
2014-09-02 20:46:04 +02:00
|
|
|
if (is_atomic)
|
|
|
|
goto fail;
|
|
|
|
|
2014-09-02 20:46:02 +02:00
|
|
|
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
|
|
|
|
chunk = pcpu_create_chunk();
|
|
|
|
if (!chunk) {
|
|
|
|
err = "failed to allocate new chunk";
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
|
|
|
spin_lock_irqsave(&pcpu_lock, flags);
|
|
|
|
pcpu_chunk_relocate(chunk, -1);
|
|
|
|
} else {
|
|
|
|
spin_lock_irqsave(&pcpu_lock, flags);
|
2009-09-29 02:17:58 +02:00
|
|
|
}
|
2009-03-06 16:44:13 +01:00
|
|
|
|
|
|
|
goto restart;
|
2009-02-20 08:29:08 +01:00
|
|
|
|
|
|
|
area_found:
|
2009-10-28 16:25:59 +01:00
|
|
|
spin_unlock_irqrestore(&pcpu_lock, flags);
|
2009-03-06 16:44:13 +01:00
|
|
|
|
2014-09-02 20:46:01 +02:00
|
|
|
/* populate if not all pages are already there */
|
2014-09-02 20:46:04 +02:00
|
|
|
if (!is_atomic) {
|
2014-09-02 20:46:04 +02:00
|
|
|
int page_start, page_end, rs, re;
|
2014-09-02 20:46:01 +02:00
|
|
|
|
2014-09-02 20:46:04 +02:00
|
|
|
page_start = PFN_DOWN(off);
|
|
|
|
page_end = PFN_UP(off + size);
|
2014-09-02 20:46:02 +02:00
|
|
|
|
2014-09-02 20:46:04 +02:00
|
|
|
pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
|
|
|
|
WARN_ON(chunk->immutable);
|
|
|
|
|
|
|
|
ret = pcpu_populate_chunk(chunk, rs, re);
|
|
|
|
|
|
|
|
spin_lock_irqsave(&pcpu_lock, flags);
|
|
|
|
if (ret) {
|
2014-09-02 20:46:05 +02:00
|
|
|
pcpu_free_area(chunk, off, &occ_pages);
|
2014-09-02 20:46:04 +02:00
|
|
|
err = "failed to populate";
|
|
|
|
goto fail_unlock;
|
|
|
|
}
|
2014-09-02 20:46:05 +02:00
|
|
|
pcpu_chunk_populated(chunk, rs, re);
|
2014-09-02 20:46:04 +02:00
|
|
|
spin_unlock_irqrestore(&pcpu_lock, flags);
|
2014-09-02 20:46:01 +02:00
|
|
|
}
|
2009-02-20 08:29:08 +01:00
|
|
|
|
2014-09-02 20:46:04 +02:00
|
|
|
mutex_unlock(&pcpu_alloc_mutex);
|
|
|
|
}
|
2009-03-06 16:44:13 +01:00
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
if (chunk != pcpu_reserved_chunk)
|
|
|
|
pcpu_nr_empty_pop_pages -= occ_pages;
|
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
|
|
|
|
pcpu_schedule_balance_work();
|
|
|
|
|
2014-09-02 20:46:01 +02:00
|
|
|
/* clear the areas and return address relative to base address */
|
|
|
|
for_each_possible_cpu(cpu)
|
|
|
|
memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
|
|
|
|
|
2011-09-26 18:12:53 +02:00
|
|
|
ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
|
2015-06-25 01:58:51 +02:00
|
|
|
kmemleak_alloc_percpu(ptr, size, gfp);
|
2011-09-26 18:12:53 +02:00
|
|
|
return ptr;
|
2009-03-06 16:44:13 +01:00
|
|
|
|
|
|
|
fail_unlock:
|
2009-10-28 16:25:59 +01:00
|
|
|
spin_unlock_irqrestore(&pcpu_lock, flags);
|
2014-09-02 20:46:02 +02:00
|
|
|
fail:
|
2014-09-02 20:46:04 +02:00
|
|
|
if (!is_atomic && warn_limit) {
|
2016-03-17 22:19:53 +01:00
|
|
|
pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
|
2016-03-17 22:19:44 +01:00
|
|
|
size, align, is_atomic, err);
|
2009-09-29 02:17:58 +02:00
|
|
|
dump_stack();
|
|
|
|
if (!--warn_limit)
|
2016-03-17 22:19:53 +01:00
|
|
|
pr_info("limit reached, disable warning\n");
|
2009-09-29 02:17:58 +02:00
|
|
|
}
|
2014-09-02 20:46:05 +02:00
|
|
|
if (is_atomic) {
|
|
|
|
/* see the flag handling in pcpu_blance_workfn() */
|
|
|
|
pcpu_atomic_alloc_failed = true;
|
|
|
|
pcpu_schedule_balance_work();
|
2016-05-25 17:48:25 +02:00
|
|
|
} else {
|
|
|
|
mutex_unlock(&pcpu_alloc_mutex);
|
2014-09-02 20:46:05 +02:00
|
|
|
}
|
2009-03-06 16:44:13 +01:00
|
|
|
return NULL;
|
2009-02-20 08:29:08 +01:00
|
|
|
}
|
2009-03-06 06:33:59 +01:00
|
|
|
|
|
|
|
/**
|
2014-09-02 20:46:04 +02:00
|
|
|
* __alloc_percpu_gfp - allocate dynamic percpu area
|
2009-03-06 06:33:59 +01:00
|
|
|
* @size: size of area to allocate in bytes
|
|
|
|
* @align: alignment of area (max PAGE_SIZE)
|
2014-09-02 20:46:04 +02:00
|
|
|
* @gfp: allocation flags
|
2009-03-06 06:33:59 +01:00
|
|
|
*
|
2014-09-02 20:46:04 +02:00
|
|
|
* Allocate zero-filled percpu area of @size bytes aligned at @align. If
|
|
|
|
* @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
|
|
|
|
* be called from any context but is a lot more likely to fail.
|
2009-03-06 16:44:13 +01:00
|
|
|
*
|
2009-03-06 06:33:59 +01:00
|
|
|
* RETURNS:
|
|
|
|
* Percpu pointer to the allocated area on success, NULL on failure.
|
|
|
|
*/
|
2014-09-02 20:46:04 +02:00
|
|
|
void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
|
|
|
|
{
|
|
|
|
return pcpu_alloc(size, align, false, gfp);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* __alloc_percpu - allocate dynamic percpu area
|
|
|
|
* @size: size of area to allocate in bytes
|
|
|
|
* @align: alignment of area (max PAGE_SIZE)
|
|
|
|
*
|
|
|
|
* Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
|
|
|
|
*/
|
2010-02-02 06:38:57 +01:00
|
|
|
void __percpu *__alloc_percpu(size_t size, size_t align)
|
2009-03-06 06:33:59 +01:00
|
|
|
{
|
2014-09-02 20:46:04 +02:00
|
|
|
return pcpu_alloc(size, align, false, GFP_KERNEL);
|
2009-03-06 06:33:59 +01:00
|
|
|
}
|
2009-02-20 08:29:08 +01:00
|
|
|
EXPORT_SYMBOL_GPL(__alloc_percpu);
|
|
|
|
|
2009-03-06 06:33:59 +01:00
|
|
|
/**
|
|
|
|
* __alloc_reserved_percpu - allocate reserved percpu area
|
|
|
|
* @size: size of area to allocate in bytes
|
|
|
|
* @align: alignment of area (max PAGE_SIZE)
|
|
|
|
*
|
2010-09-10 11:01:56 +02:00
|
|
|
* Allocate zero-filled percpu area of @size bytes aligned at @align
|
|
|
|
* from reserved percpu area if arch has set it up; otherwise,
|
|
|
|
* allocation is served from the same dynamic area. Might sleep.
|
|
|
|
* Might trigger writeouts.
|
2009-03-06 06:33:59 +01:00
|
|
|
*
|
2009-03-06 16:44:13 +01:00
|
|
|
* CONTEXT:
|
|
|
|
* Does GFP_KERNEL allocation.
|
|
|
|
*
|
2009-03-06 06:33:59 +01:00
|
|
|
* RETURNS:
|
|
|
|
* Percpu pointer to the allocated area on success, NULL on failure.
|
|
|
|
*/
|
2010-02-02 06:38:57 +01:00
|
|
|
void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
|
2009-03-06 06:33:59 +01:00
|
|
|
{
|
2014-09-02 20:46:04 +02:00
|
|
|
return pcpu_alloc(size, align, true, GFP_KERNEL);
|
2009-03-06 06:33:59 +01:00
|
|
|
}
|
|
|
|
|
2009-03-06 16:44:11 +01:00
|
|
|
/**
|
2014-09-02 20:46:05 +02:00
|
|
|
* pcpu_balance_workfn - manage the amount of free chunks and populated pages
|
2009-03-06 16:44:11 +01:00
|
|
|
* @work: unused
|
|
|
|
*
|
|
|
|
* Reclaim all fully free chunks except for the first one.
|
|
|
|
*/
|
2014-09-02 20:46:05 +02:00
|
|
|
static void pcpu_balance_workfn(struct work_struct *work)
|
2009-02-20 08:29:08 +01:00
|
|
|
{
|
2014-09-02 20:46:05 +02:00
|
|
|
LIST_HEAD(to_free);
|
|
|
|
struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
|
2009-03-06 16:44:11 +01:00
|
|
|
struct pcpu_chunk *chunk, *next;
|
2014-09-02 20:46:05 +02:00
|
|
|
int slot, nr_to_pop, ret;
|
2009-03-06 16:44:11 +01:00
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
/*
|
|
|
|
* There's no reason to keep around multiple unused chunks and VM
|
|
|
|
* areas can be scarce. Destroy all free chunks except for one.
|
|
|
|
*/
|
2009-03-06 16:44:13 +01:00
|
|
|
mutex_lock(&pcpu_alloc_mutex);
|
|
|
|
spin_lock_irq(&pcpu_lock);
|
2009-03-06 16:44:11 +01:00
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
list_for_each_entry_safe(chunk, next, free_head, list) {
|
2009-03-06 16:44:11 +01:00
|
|
|
WARN_ON(chunk->immutable);
|
|
|
|
|
|
|
|
/* spare the first one */
|
2014-09-02 20:46:05 +02:00
|
|
|
if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
|
2009-03-06 16:44:11 +01:00
|
|
|
continue;
|
|
|
|
|
2016-05-25 17:48:25 +02:00
|
|
|
list_del_init(&chunk->map_extend_list);
|
2014-09-02 20:46:05 +02:00
|
|
|
list_move(&chunk->list, &to_free);
|
2009-03-06 16:44:11 +01:00
|
|
|
}
|
|
|
|
|
2009-03-06 16:44:13 +01:00
|
|
|
spin_unlock_irq(&pcpu_lock);
|
2009-03-06 16:44:11 +01:00
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
list_for_each_entry_safe(chunk, next, &to_free, list) {
|
2014-09-02 20:46:02 +02:00
|
|
|
int rs, re;
|
2014-09-02 20:46:01 +02:00
|
|
|
|
2014-09-02 20:46:02 +02:00
|
|
|
pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
|
|
|
|
pcpu_depopulate_chunk(chunk, rs, re);
|
2014-09-02 20:46:05 +02:00
|
|
|
spin_lock_irq(&pcpu_lock);
|
|
|
|
pcpu_chunk_depopulated(chunk, rs, re);
|
|
|
|
spin_unlock_irq(&pcpu_lock);
|
2014-09-02 20:46:02 +02:00
|
|
|
}
|
2010-04-09 11:57:01 +02:00
|
|
|
pcpu_destroy_chunk(chunk);
|
2009-03-06 16:44:11 +01:00
|
|
|
}
|
2009-08-14 08:00:49 +02:00
|
|
|
|
2016-05-25 17:48:25 +02:00
|
|
|
/* service chunks which requested async area map extension */
|
|
|
|
do {
|
|
|
|
int new_alloc = 0;
|
|
|
|
|
|
|
|
spin_lock_irq(&pcpu_lock);
|
|
|
|
|
|
|
|
chunk = list_first_entry_or_null(&pcpu_map_extend_chunks,
|
|
|
|
struct pcpu_chunk, map_extend_list);
|
|
|
|
if (chunk) {
|
|
|
|
list_del_init(&chunk->map_extend_list);
|
|
|
|
new_alloc = pcpu_need_to_extend(chunk, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
spin_unlock_irq(&pcpu_lock);
|
|
|
|
|
|
|
|
if (new_alloc)
|
|
|
|
pcpu_extend_area_map(chunk, new_alloc);
|
|
|
|
} while (chunk);
|
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
/*
|
|
|
|
* Ensure there are certain number of free populated pages for
|
|
|
|
* atomic allocs. Fill up from the most packed so that atomic
|
|
|
|
* allocs don't increase fragmentation. If atomic allocation
|
|
|
|
* failed previously, always populate the maximum amount. This
|
|
|
|
* should prevent atomic allocs larger than PAGE_SIZE from keeping
|
|
|
|
* failing indefinitely; however, large atomic allocs are not
|
|
|
|
* something we support properly and can be highly unreliable and
|
|
|
|
* inefficient.
|
|
|
|
*/
|
|
|
|
retry_pop:
|
|
|
|
if (pcpu_atomic_alloc_failed) {
|
|
|
|
nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
|
|
|
|
/* best effort anyway, don't worry about synchronization */
|
|
|
|
pcpu_atomic_alloc_failed = false;
|
|
|
|
} else {
|
|
|
|
nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
|
|
|
|
pcpu_nr_empty_pop_pages,
|
|
|
|
0, PCPU_EMPTY_POP_PAGES_HIGH);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
|
|
|
|
int nr_unpop = 0, rs, re;
|
|
|
|
|
|
|
|
if (!nr_to_pop)
|
|
|
|
break;
|
|
|
|
|
|
|
|
spin_lock_irq(&pcpu_lock);
|
|
|
|
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
|
|
|
|
nr_unpop = pcpu_unit_pages - chunk->nr_populated;
|
|
|
|
if (nr_unpop)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
spin_unlock_irq(&pcpu_lock);
|
|
|
|
|
|
|
|
if (!nr_unpop)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* @chunk can't go away while pcpu_alloc_mutex is held */
|
|
|
|
pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
|
|
|
|
int nr = min(re - rs, nr_to_pop);
|
|
|
|
|
|
|
|
ret = pcpu_populate_chunk(chunk, rs, rs + nr);
|
|
|
|
if (!ret) {
|
|
|
|
nr_to_pop -= nr;
|
|
|
|
spin_lock_irq(&pcpu_lock);
|
|
|
|
pcpu_chunk_populated(chunk, rs, rs + nr);
|
|
|
|
spin_unlock_irq(&pcpu_lock);
|
|
|
|
} else {
|
|
|
|
nr_to_pop = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!nr_to_pop)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nr_to_pop) {
|
|
|
|
/* ran out of chunks to populate, create a new one and retry */
|
|
|
|
chunk = pcpu_create_chunk();
|
|
|
|
if (chunk) {
|
|
|
|
spin_lock_irq(&pcpu_lock);
|
|
|
|
pcpu_chunk_relocate(chunk, -1);
|
|
|
|
spin_unlock_irq(&pcpu_lock);
|
|
|
|
goto retry_pop;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-08-14 08:00:49 +02:00
|
|
|
mutex_unlock(&pcpu_alloc_mutex);
|
2009-02-20 08:29:08 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* free_percpu - free percpu area
|
|
|
|
* @ptr: pointer to area to free
|
|
|
|
*
|
2009-03-06 16:44:13 +01:00
|
|
|
* Free percpu area @ptr.
|
|
|
|
*
|
|
|
|
* CONTEXT:
|
|
|
|
* Can be called from atomic context.
|
2009-02-20 08:29:08 +01:00
|
|
|
*/
|
2010-02-02 06:38:57 +01:00
|
|
|
void free_percpu(void __percpu *ptr)
|
2009-02-20 08:29:08 +01:00
|
|
|
{
|
2010-01-08 23:42:39 +01:00
|
|
|
void *addr;
|
2009-02-20 08:29:08 +01:00
|
|
|
struct pcpu_chunk *chunk;
|
2009-03-06 16:44:13 +01:00
|
|
|
unsigned long flags;
|
2014-09-02 20:46:05 +02:00
|
|
|
int off, occ_pages;
|
2009-02-20 08:29:08 +01:00
|
|
|
|
|
|
|
if (!ptr)
|
|
|
|
return;
|
|
|
|
|
2011-09-26 18:12:53 +02:00
|
|
|
kmemleak_free_percpu(ptr);
|
|
|
|
|
2010-01-08 23:42:39 +01:00
|
|
|
addr = __pcpu_ptr_to_addr(ptr);
|
|
|
|
|
2009-03-06 16:44:13 +01:00
|
|
|
spin_lock_irqsave(&pcpu_lock, flags);
|
2009-02-20 08:29:08 +01:00
|
|
|
|
|
|
|
chunk = pcpu_chunk_addr_search(addr);
|
2009-08-14 08:00:51 +02:00
|
|
|
off = addr - chunk->base_addr;
|
2009-02-20 08:29:08 +01:00
|
|
|
|
2014-09-02 20:46:05 +02:00
|
|
|
pcpu_free_area(chunk, off, &occ_pages);
|
|
|
|
|
|
|
|
if (chunk != pcpu_reserved_chunk)
|
|
|
|
pcpu_nr_empty_pop_pages += occ_pages;
|
2009-02-20 08:29:08 +01:00
|
|
|
|
2009-03-06 16:44:11 +01:00
|
|
|
/* if there are more than one fully free chunks, wake up grim reaper */
|
2009-02-20 08:29:08 +01:00
|
|
|
if (chunk->free_size == pcpu_unit_size) {
|
|
|
|
struct pcpu_chunk *pos;
|
|
|
|
|
2009-03-06 16:44:11 +01:00
|
|
|
list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
|
2009-02-20 08:29:08 +01:00
|
|
|
if (pos != chunk) {
|
2014-09-02 20:46:05 +02:00
|
|
|
pcpu_schedule_balance_work();
|
2009-02-20 08:29:08 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-03-06 16:44:13 +01:00
|
|
|
spin_unlock_irqrestore(&pcpu_lock, flags);
|
2009-02-20 08:29:08 +01:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(free_percpu);
|
|
|
|
|
2010-03-10 10:57:54 +01:00
|
|
|
/**
 * is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 *
 * Check whether @addr lies inside the in-kernel static percpu area of
 * any possible cpu.  Module static percpu areas are not considered; use
 * is_module_percpu_address() for those.
 *
 * RETURNS:
 * %true if @addr is from in-kernel static percpu area, %false otherwise.
 */
bool is_kernel_percpu_address(unsigned long addr)
{
#ifdef CONFIG_SMP
	const size_t static_size = __per_cpu_end - __per_cpu_start;
	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
	void *target = (void *)addr;
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		void *lo = per_cpu_ptr(base, cpu);
		void *hi = lo + static_size;

		/* outside this cpu's [lo, hi) static window, try the next */
		if (target < lo || target >= hi)
			continue;

		return true;
	}
#endif
	/* on UP, can't distinguish from other static vars, always false */
	return false;
}
|
|
|
|
|
2009-11-24 07:50:03 +01:00
|
|
|
/**
|
|
|
|
* per_cpu_ptr_to_phys - convert translated percpu address to physical address
|
|
|
|
* @addr: the address to be converted to physical address
|
|
|
|
*
|
|
|
|
* Given @addr which is dereferenceable address obtained via one of
|
|
|
|
* percpu access macros, this function translates it into its physical
|
|
|
|
* address. The caller is responsible for ensuring @addr stays valid
|
|
|
|
* until this function finishes.
|
|
|
|
*
|
2011-11-23 17:20:53 +01:00
|
|
|
* percpu allocator has special setup for the first chunk, which currently
|
|
|
|
* supports either embedding in linear address space or vmalloc mapping,
|
|
|
|
* and, from the second one, the backing allocator (currently either vm or
|
|
|
|
* km) provides translation.
|
|
|
|
*
|
2015-03-06 23:30:42 +01:00
|
|
|
* The addr can be translated simply without checking if it falls into the
|
2011-11-23 17:20:53 +01:00
|
|
|
* first chunk. But the current code reflects better how percpu allocator
|
|
|
|
* actually works, and the verification can discover both bugs in percpu
|
|
|
|
* allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
|
|
|
|
* code.
|
|
|
|
*
|
2009-11-24 07:50:03 +01:00
|
|
|
* RETURNS:
|
|
|
|
* The physical address for @addr.
|
|
|
|
*/
|
|
|
|
phys_addr_t per_cpu_ptr_to_phys(void *addr)
|
|
|
|
{
|
2010-06-18 11:44:31 +02:00
|
|
|
void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
|
|
|
|
bool in_first_chunk = false;
|
2011-11-18 19:55:35 +01:00
|
|
|
unsigned long first_low, first_high;
|
2010-06-18 11:44:31 +02:00
|
|
|
unsigned int cpu;
|
|
|
|
|
|
|
|
/*
|
2011-11-18 19:55:35 +01:00
|
|
|
* The following test on unit_low/high isn't strictly
|
2010-06-18 11:44:31 +02:00
|
|
|
* necessary but will speed up lookups of addresses which
|
|
|
|
* aren't in the first chunk.
|
|
|
|
*/
|
2011-11-18 19:55:35 +01:00
|
|
|
first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
|
|
|
|
first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
|
|
|
|
pcpu_unit_pages);
|
|
|
|
if ((unsigned long)addr >= first_low &&
|
|
|
|
(unsigned long)addr < first_high) {
|
2010-06-18 11:44:31 +02:00
|
|
|
for_each_possible_cpu(cpu) {
|
|
|
|
void *start = per_cpu_ptr(base, cpu);
|
|
|
|
|
|
|
|
if (addr >= start && addr < start + pcpu_unit_size) {
|
|
|
|
in_first_chunk = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (in_first_chunk) {
|
2011-03-28 13:53:29 +02:00
|
|
|
if (!is_vmalloc_addr(addr))
|
2010-04-09 11:57:00 +02:00
|
|
|
return __pa(addr);
|
|
|
|
else
|
2011-12-15 20:25:59 +01:00
|
|
|
return page_to_phys(vmalloc_to_page(addr)) +
|
|
|
|
offset_in_page(addr);
|
2010-04-09 11:57:00 +02:00
|
|
|
} else
|
2011-12-15 20:25:59 +01:00
|
|
|
return page_to_phys(pcpu_addr_to_page(addr)) +
|
|
|
|
offset_in_page(addr);
|
2009-11-24 07:50:03 +01:00
|
|
|
}
|
|
|
|
|
2009-02-20 08:29:08 +01:00
|
|
|
/**
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguos area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
* pcpu_alloc_alloc_info - allocate percpu allocation info
|
|
|
|
* @nr_groups: the number of groups
|
|
|
|
* @nr_units: the number of units
|
|
|
|
*
|
|
|
|
* Allocate ai which is large enough for @nr_groups groups containing
|
|
|
|
* @nr_units units. The returned ai's groups[0].cpu_map points to the
|
|
|
|
* cpu_map array which is long enough for @nr_units and filled with
|
|
|
|
* NR_CPUS. It's the caller's responsibility to initialize cpu_map
|
|
|
|
* pointer of other groups.
|
|
|
|
*
|
|
|
|
* RETURNS:
|
|
|
|
* Pointer to the allocated pcpu_alloc_info on success, NULL on
|
|
|
|
* failure.
|
|
|
|
*/
|
|
|
|
struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
|
|
|
|
int nr_units)
|
|
|
|
{
|
|
|
|
struct pcpu_alloc_info *ai;
|
|
|
|
size_t base_size, ai_size;
|
|
|
|
void *ptr;
|
|
|
|
int unit;
|
|
|
|
|
|
|
|
base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
|
|
|
|
__alignof__(ai->groups[0].cpu_map[0]));
|
|
|
|
ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
|
|
|
|
|
2014-01-22 00:50:40 +01:00
|
|
|
ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguos area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
if (!ptr)
|
|
|
|
return NULL;
|
|
|
|
ai = ptr;
|
|
|
|
ptr += base_size;
|
|
|
|
|
|
|
|
ai->groups[0].cpu_map = ptr;
|
|
|
|
|
|
|
|
for (unit = 0; unit < nr_units; unit++)
|
|
|
|
ai->groups[0].cpu_map[unit] = NR_CPUS;
|
|
|
|
|
|
|
|
ai->nr_groups = nr_groups;
|
|
|
|
ai->__ai_size = PFN_ALIGN(ai_size);
|
|
|
|
|
|
|
|
return ai;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* pcpu_free_alloc_info - free percpu allocation info
|
|
|
|
* @ai: pcpu_alloc_info to free
|
|
|
|
*
|
|
|
|
* Free @ai which was allocated by pcpu_alloc_alloc_info().
|
|
|
|
*/
|
|
|
|
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
|
|
|
|
{
|
2014-01-22 00:50:40 +01:00
|
|
|
memblock_free_early(__pa(ai), ai->__ai_size);
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguos area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* pcpu_dump_alloc_info - print out information about pcpu_alloc_info
|
|
|
|
* @lvl: loglevel
|
|
|
|
* @ai: allocation info to dump
|
|
|
|
*
|
|
|
|
* Print out information about @ai using loglevel @lvl.
|
|
|
|
*/
|
|
|
|
static void pcpu_dump_alloc_info(const char *lvl,
|
|
|
|
const struct pcpu_alloc_info *ai)
|
2009-08-14 08:00:51 +02:00
|
|
|
{
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguos area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
int group_width = 1, cpu_width = 1, width;
|
2009-08-14 08:00:51 +02:00
|
|
|
char empty_str[] = "--------";
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguos area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
int alloc = 0, alloc_end = 0;
|
|
|
|
int group, v;
|
|
|
|
int upa, apl; /* units per alloc, allocs per line */
|
|
|
|
|
|
|
|
v = ai->nr_groups;
|
|
|
|
while (v /= 10)
|
|
|
|
group_width++;
|
2009-08-14 08:00:51 +02:00
|
|
|
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguos area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
v = num_possible_cpus();
|
2009-08-14 08:00:51 +02:00
|
|
|
while (v /= 10)
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguos area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
cpu_width++;
|
|
|
|
empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
|
2009-08-14 08:00:51 +02:00
|
|
|
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
upa = ai->alloc_size / ai->unit_size;
|
|
|
|
width = upa * (cpu_width + 1) + group_width + 3;
|
|
|
|
apl = rounddown_pow_of_two(max(60 / width, 1));
|
2009-08-14 08:00:51 +02:00
|
|
|
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
|
|
|
|
lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
|
|
|
|
ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
|
2009-08-14 08:00:51 +02:00
|
|
|
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
for (group = 0; group < ai->nr_groups; group++) {
|
|
|
|
const struct pcpu_group_info *gi = &ai->groups[group];
|
|
|
|
int unit = 0, unit_end = 0;
|
|
|
|
|
|
|
|
BUG_ON(gi->nr_units % upa);
|
|
|
|
for (alloc_end += gi->nr_units / upa;
|
|
|
|
alloc < alloc_end; alloc++) {
|
|
|
|
if (!(alloc % apl)) {
|
2016-03-17 22:19:50 +01:00
|
|
|
pr_cont("\n");
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
printk("%spcpu-alloc: ", lvl);
|
|
|
|
}
|
2016-03-17 22:19:50 +01:00
|
|
|
pr_cont("[%0*d] ", group_width, group);
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
|
|
|
|
for (unit_end += upa; unit < unit_end; unit++)
|
|
|
|
if (gi->cpu_map[unit] != NR_CPUS)
|
2016-03-17 22:19:50 +01:00
|
|
|
pr_cont("%0*d ",
|
|
|
|
cpu_width, gi->cpu_map[unit]);
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
else
|
2016-03-17 22:19:50 +01:00
|
|
|
pr_cont("%s ", empty_str);
|
2009-08-14 08:00:51 +02:00
|
|
|
}
|
|
|
|
}
|
2016-03-17 22:19:50 +01:00
|
|
|
pr_cont("\n");
|
2009-08-14 08:00:51 +02:00
|
|
|
}
|
|
|
|
|
2009-02-20 08:29:08 +01:00
|
|
|
/**
|
2009-02-24 03:57:21 +01:00
|
|
|
* pcpu_setup_first_chunk - initialize the first percpu chunk
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
* @ai: pcpu_alloc_info describing how the percpu area is shaped
|
2009-07-04 01:10:59 +02:00
|
|
|
* @base_addr: mapped address
|
2009-02-24 03:57:21 +01:00
|
|
|
*
|
|
|
|
* Initialize the first percpu chunk which contains the kernel static
|
|
|
|
* percpu area. This function is to be called from arch percpu area
|
2009-07-04 01:10:59 +02:00
|
|
|
* setup path.
|
2009-02-24 03:57:21 +01:00
|
|
|
*
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
* @ai contains all information necessary to initialize the first
|
|
|
|
* chunk and prime the dynamic percpu allocator.
|
|
|
|
*
|
|
|
|
* @ai->static_size is the size of static percpu area.
|
|
|
|
*
|
|
|
|
* @ai->reserved_size, if non-zero, specifies the amount of bytes to
|
2009-03-06 06:33:59 +01:00
|
|
|
* reserve after the static area in the first chunk. This reserves
|
|
|
|
* the first chunk such that it's available only through reserved
|
|
|
|
* percpu allocation. This is primarily used to serve module percpu
|
|
|
|
* static areas on architectures where the addressing model has
|
|
|
|
* limited offset range for symbol relocations to guarantee module
|
|
|
|
* percpu symbols fall inside the relocatable range.
|
|
|
|
*
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
* @ai->dyn_size determines the number of bytes available for dynamic
|
|
|
|
* allocation in the first chunk. The area between @ai->static_size +
|
|
|
|
* @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
|
2009-03-10 08:27:48 +01:00
|
|
|
*
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
* @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
|
|
|
|
* and equal to or larger than @ai->static_size + @ai->reserved_size +
|
|
|
|
* @ai->dyn_size.
|
2009-02-24 03:57:21 +01:00
|
|
|
*
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
* @ai->atom_size is the allocation atom size and used as alignment
|
|
|
|
* for vm areas.
|
2009-02-24 03:57:21 +01:00
|
|
|
*
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
* @ai->alloc_size is the allocation size and always multiple of
|
|
|
|
* @ai->atom_size. This is larger than @ai->atom_size if
|
|
|
|
* @ai->unit_size is larger than @ai->atom_size.
|
|
|
|
*
|
|
|
|
* @ai->nr_groups and @ai->groups describe virtual memory layout of
|
|
|
|
* percpu areas. Units which should be colocated are put into the
|
|
|
|
* same group. Dynamic VM areas will be allocated according to these
|
|
|
|
* groupings. If @ai->nr_groups is zero, a single group containing
|
|
|
|
* all units is assumed.
|
2009-02-24 03:57:21 +01:00
|
|
|
*
|
2009-07-04 01:10:59 +02:00
|
|
|
* The caller should have mapped the first chunk at @base_addr and
|
|
|
|
* copied static data to each unit.
|
2009-02-20 08:29:08 +01:00
|
|
|
*
|
2009-03-06 06:33:59 +01:00
|
|
|
* If the first chunk ends up with both reserved and dynamic areas, it
|
|
|
|
* is served by two chunks - one to serve the core static and reserved
|
|
|
|
* areas and the other for the dynamic area. They share the same vm
|
|
|
|
* and page map but use different area allocation maps to stay away
|
|
|
|
* from each other. The latter chunk is circulated in the chunk slots
|
|
|
|
* and available for dynamic allocation like any other chunks.
|
|
|
|
*
|
2009-02-20 08:29:08 +01:00
|
|
|
* RETURNS:
|
2009-08-14 08:00:51 +02:00
|
|
|
* 0 on success, -errno on failure.
|
2009-02-20 08:29:08 +01:00
|
|
|
*/
|
2009-08-14 08:00:51 +02:00
|
|
|
int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
|
|
|
|
void *base_addr)
|
2009-02-20 08:29:08 +01:00
|
|
|
{
|
2010-06-27 18:50:00 +02:00
|
|
|
static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
|
|
|
|
static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
size_t dyn_size = ai->dyn_size;
|
|
|
|
size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
|
2009-03-06 06:33:59 +01:00
|
|
|
struct pcpu_chunk *schunk, *dchunk = NULL;
|
2009-08-14 08:00:52 +02:00
|
|
|
unsigned long *group_offsets;
|
|
|
|
size_t *group_sizes;
|
2009-08-14 08:00:51 +02:00
|
|
|
unsigned long *unit_off;
|
2009-02-20 08:29:08 +01:00
|
|
|
unsigned int cpu;
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
int *unit_map;
|
|
|
|
int group, unit, i;
|
2009-02-20 08:29:08 +01:00
|
|
|
|
2009-09-24 02:43:11 +02:00
|
|
|
#define PCPU_SETUP_BUG_ON(cond) do { \
|
|
|
|
if (unlikely(cond)) { \
|
2016-03-17 22:19:53 +01:00
|
|
|
pr_emerg("failed to initialize, %s\n", #cond); \
|
|
|
|
pr_emerg("cpu_possible_mask=%*pb\n", \
|
2015-02-13 23:37:34 +01:00
|
|
|
cpumask_pr_args(cpu_possible_mask)); \
|
2009-09-24 02:43:11 +02:00
|
|
|
pcpu_dump_alloc_info(KERN_EMERG, ai); \
|
|
|
|
BUG(); \
|
|
|
|
} \
|
|
|
|
} while (0)
|
|
|
|
|
2009-07-04 01:11:00 +02:00
|
|
|
/* sanity checks */
|
2009-09-24 02:43:11 +02:00
|
|
|
PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
|
2010-09-03 18:22:48 +02:00
|
|
|
#ifdef CONFIG_SMP
|
2009-09-24 02:43:11 +02:00
|
|
|
PCPU_SETUP_BUG_ON(!ai->static_size);
|
2015-11-06 03:46:43 +01:00
|
|
|
PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
|
2010-09-03 18:22:48 +02:00
|
|
|
#endif
|
2009-09-24 02:43:11 +02:00
|
|
|
PCPU_SETUP_BUG_ON(!base_addr);
|
2015-11-06 03:46:43 +01:00
|
|
|
PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
|
2009-09-24 02:43:11 +02:00
|
|
|
PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
|
2015-11-06 03:46:43 +01:00
|
|
|
PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
|
2009-09-24 02:43:11 +02:00
|
|
|
PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
|
2010-06-27 18:50:00 +02:00
|
|
|
PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
|
2010-04-09 11:57:01 +02:00
|
|
|
PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
|
2009-02-24 03:57:21 +01:00
|
|
|
|
2009-08-14 08:00:52 +02:00
|
|
|
/* process group information and build config tables accordingly */
|
2014-01-22 00:50:40 +01:00
|
|
|
group_offsets = memblock_virt_alloc(ai->nr_groups *
|
|
|
|
sizeof(group_offsets[0]), 0);
|
|
|
|
group_sizes = memblock_virt_alloc(ai->nr_groups *
|
|
|
|
sizeof(group_sizes[0]), 0);
|
|
|
|
unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
|
|
|
|
unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
|
2009-07-04 01:11:00 +02:00
|
|
|
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
|
2009-09-29 02:17:56 +02:00
|
|
|
unit_map[cpu] = UINT_MAX;
|
2011-11-18 19:55:35 +01:00
|
|
|
|
|
|
|
pcpu_low_unit_cpu = NR_CPUS;
|
|
|
|
pcpu_high_unit_cpu = NR_CPUS;
|
2009-07-04 01:11:00 +02:00
|
|
|
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
|
|
|
|
const struct pcpu_group_info *gi = &ai->groups[group];
|
2009-07-04 01:11:00 +02:00
|
|
|
|
2009-08-14 08:00:52 +02:00
|
|
|
group_offsets[group] = gi->base_offset;
|
|
|
|
group_sizes[group] = gi->nr_units * ai->unit_size;
|
|
|
|
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
for (i = 0; i < gi->nr_units; i++) {
|
|
|
|
cpu = gi->cpu_map[i];
|
|
|
|
if (cpu == NR_CPUS)
|
|
|
|
continue;
|
2009-02-24 03:57:21 +01:00
|
|
|
|
2014-10-29 09:45:04 +01:00
|
|
|
PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
|
2009-09-24 02:43:11 +02:00
|
|
|
PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
|
|
|
|
PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
|
2009-02-20 08:29:08 +01:00
|
|
|
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
unit_map[cpu] = unit + i;
|
2009-08-14 08:00:51 +02:00
|
|
|
unit_off[cpu] = gi->base_offset + i * ai->unit_size;
|
|
|
|
|
2011-11-18 19:55:35 +01:00
|
|
|
/* determine low/high unit_cpu */
|
|
|
|
if (pcpu_low_unit_cpu == NR_CPUS ||
|
|
|
|
unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
|
|
|
|
pcpu_low_unit_cpu = cpu;
|
|
|
|
if (pcpu_high_unit_cpu == NR_CPUS ||
|
|
|
|
unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
|
|
|
|
pcpu_high_unit_cpu = cpu;
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
}
|
2009-07-04 01:11:00 +02:00
|
|
|
}
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
pcpu_nr_units = unit;
|
|
|
|
|
|
|
|
for_each_possible_cpu(cpu)
|
2009-09-24 02:43:11 +02:00
|
|
|
PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
|
|
|
|
|
|
|
|
/* we're done parsing the input, undefine BUG macro and dump config */
|
|
|
|
#undef PCPU_SETUP_BUG_ON
|
2010-12-22 14:19:14 +01:00
|
|
|
pcpu_dump_alloc_info(KERN_DEBUG, ai);
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
|
2009-08-14 08:00:52 +02:00
|
|
|
pcpu_nr_groups = ai->nr_groups;
|
|
|
|
pcpu_group_offsets = group_offsets;
|
|
|
|
pcpu_group_sizes = group_sizes;
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
pcpu_unit_map = unit_map;
|
2009-08-14 08:00:51 +02:00
|
|
|
pcpu_unit_offsets = unit_off;
|
2009-07-04 01:11:00 +02:00
|
|
|
|
|
|
|
/* determine basic parameters */
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
|
2009-02-24 03:57:21 +01:00
|
|
|
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
|
2009-08-14 08:00:52 +02:00
|
|
|
pcpu_atom_size = ai->atom_size;
|
2009-07-04 01:11:00 +02:00
|
|
|
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
|
|
|
|
BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
|
2009-03-06 06:33:59 +01:00
|
|
|
|
2009-02-24 03:57:21 +01:00
|
|
|
/*
|
|
|
|
* Allocate chunk slots. The additional last slot is for
|
|
|
|
* empty chunks.
|
|
|
|
*/
|
|
|
|
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
|
2014-01-22 00:50:40 +01:00
|
|
|
pcpu_slot = memblock_virt_alloc(
|
|
|
|
pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
|
2009-02-20 08:29:08 +01:00
|
|
|
for (i = 0; i < pcpu_nr_slots; i++)
|
|
|
|
INIT_LIST_HEAD(&pcpu_slot[i]);
|
|
|
|
|
2009-03-06 06:33:59 +01:00
|
|
|
/*
|
|
|
|
* Initialize static chunk. If reserved_size is zero, the
|
|
|
|
* static chunk covers static area + dynamic allocation area
|
|
|
|
* in the first chunk. If reserved_size is not zero, it
|
|
|
|
* covers static area + reserved area (mostly used for module
|
|
|
|
* static percpu allocation).
|
|
|
|
*/
|
2014-01-22 00:50:40 +01:00
|
|
|
schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
|
2009-03-06 06:33:59 +01:00
|
|
|
INIT_LIST_HEAD(&schunk->list);
|
2016-05-25 17:48:25 +02:00
|
|
|
INIT_LIST_HEAD(&schunk->map_extend_list);
|
2009-08-14 08:00:51 +02:00
|
|
|
schunk->base_addr = base_addr;
|
2009-03-06 06:33:59 +01:00
|
|
|
schunk->map = smap;
|
|
|
|
schunk->map_alloc = ARRAY_SIZE(smap);
|
2009-07-04 01:10:59 +02:00
|
|
|
schunk->immutable = true;
|
2009-07-04 01:11:00 +02:00
|
|
|
bitmap_fill(schunk->populated, pcpu_unit_pages);
|
2014-09-02 20:46:05 +02:00
|
|
|
schunk->nr_populated = pcpu_unit_pages;
|
2009-03-06 06:33:59 +01:00
|
|
|
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
if (ai->reserved_size) {
|
|
|
|
schunk->free_size = ai->reserved_size;
|
2009-04-02 06:19:54 +02:00
|
|
|
pcpu_reserved_chunk = schunk;
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
|
2009-03-06 06:33:59 +01:00
|
|
|
} else {
|
|
|
|
schunk->free_size = dyn_size;
|
|
|
|
dyn_size = 0; /* dynamic area covered */
|
|
|
|
}
|
2009-03-06 06:33:59 +01:00
|
|
|
schunk->contig_hint = schunk->free_size;
|
2009-02-20 08:29:08 +01:00
|
|
|
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry referring to the area
in question; again, we need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonic, so a simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
schunk->map[0] = 1;
|
|
|
|
schunk->map[1] = ai->static_size;
|
|
|
|
schunk->map_used = 1;
|
2009-03-06 06:33:59 +01:00
|
|
|
if (schunk->free_size)
|
2015-07-20 16:55:28 +02:00
|
|
|
schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size;
|
|
|
|
schunk->map[schunk->map_used] |= 1;
|
2009-03-06 06:33:59 +01:00
|
|
|
|
2009-03-06 06:33:59 +01:00
|
|
|
/* init dynamic chunk if necessary */
|
|
|
|
if (dyn_size) {
|
2014-01-22 00:50:40 +01:00
|
|
|
dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
|
2009-03-06 06:33:59 +01:00
|
|
|
INIT_LIST_HEAD(&dchunk->list);
|
2016-05-25 17:48:25 +02:00
|
|
|
INIT_LIST_HEAD(&dchunk->map_extend_list);
|
2009-08-14 08:00:51 +02:00
|
|
|
dchunk->base_addr = base_addr;
|
2009-03-06 06:33:59 +01:00
|
|
|
dchunk->map = dmap;
|
|
|
|
dchunk->map_alloc = ARRAY_SIZE(dmap);
|
2009-07-04 01:10:59 +02:00
|
|
|
dchunk->immutable = true;
|
2009-07-04 01:11:00 +02:00
|
|
|
bitmap_fill(dchunk->populated, pcpu_unit_pages);
|
2014-09-02 20:46:05 +02:00
|
|
|
dchunk->nr_populated = pcpu_unit_pages;
|
2009-03-06 06:33:59 +01:00
|
|
|
|
|
|
|
dchunk->contig_hint = dchunk->free_size = dyn_size;
|
percpu: store offsets instead of lengths in ->map[]
Current code keeps +-length for each area in chunk->map[]. It has
several unpleasant consequences:
* even if we know that first 50 areas are all in use, allocation
still needs to go through all those areas just to sum their sizes, just
to get the offset of free one.
* freeing needs to find the array entry referring to the area
in question; again, we need to sum the sizes until we reach the offset
we are interested in. Note that offsets are monotonic, so a simple
binary search would do here.
New data representation: array of <offset,in-use flag> pairs.
Each pair is represented by one int - we use offset|1 for <offset, in use>
and offset for <offset, free> (we make sure that all offsets are even).
In the end we put a sentry entry - <total size, in use>. The first
entry is <0, flag>; it would be possible to store together the flag
for Nth area and offset for N+1st, but that leads to much hairier code.
In other words, where the old variant would have
4, -8, -4, 4, -12, 100
(4 bytes free, 8 in use, 4 in use, 4 free, 12 in use, 100 free) we store
<0,0>, <4,1>, <12,1>, <16,0>, <20,1>, <32,0>, <132,1>
i.e.
0, 5, 13, 16, 21, 32, 133
This commit switches to new data representation and takes care of a couple
of low-hanging fruits in free_pcpu_area() - one is the switch to binary
search, another is not doing two memmove() when one would do. Speeding
the alloc side up (by keeping track of how many areas in the beginning are
known to be all in use) also becomes possible - that'll be done in the next
commit.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-03-07 03:13:18 +01:00
|
|
|
dchunk->map[0] = 1;
|
|
|
|
dchunk->map[1] = pcpu_reserved_chunk_limit;
|
|
|
|
dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
|
|
|
|
dchunk->map_used = 2;
|
2009-03-06 06:33:59 +01:00
|
|
|
}
|
|
|
|
|
2009-03-06 06:33:59 +01:00
|
|
|
/* link the first chunk in */
|
2009-04-02 06:19:54 +02:00
|
|
|
pcpu_first_chunk = dchunk ?: schunk;
|
2014-09-02 20:46:05 +02:00
|
|
|
pcpu_nr_empty_pop_pages +=
|
|
|
|
pcpu_count_occupied_pages(pcpu_first_chunk, 1);
|
2009-04-02 06:19:54 +02:00
|
|
|
pcpu_chunk_relocate(pcpu_first_chunk, -1);
|
2009-02-20 08:29:08 +01:00
|
|
|
|
|
|
|
/* we're done */
|
2009-08-14 08:00:51 +02:00
|
|
|
pcpu_base_addr = base_addr;
|
2009-08-14 08:00:51 +02:00
|
|
|
return 0;
|
2009-02-20 08:29:08 +01:00
|
|
|
}
|
2009-03-10 08:27:48 +01:00
|
|
|
|
2010-09-03 18:22:48 +02:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
2012-10-05 02:12:07 +02:00
|
|
|
const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
|
2009-08-14 08:00:50 +02:00
|
|
|
[PCPU_FC_AUTO] = "auto",
|
|
|
|
[PCPU_FC_EMBED] = "embed",
|
|
|
|
[PCPU_FC_PAGE] = "page",
|
|
|
|
};
|
2009-03-10 08:27:48 +01:00
|
|
|
|
2009-08-14 08:00:50 +02:00
|
|
|
enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
|
2009-03-10 08:27:48 +01:00
|
|
|
|
2009-08-14 08:00:50 +02:00
|
|
|
static int __init percpu_alloc_setup(char *str)
|
|
|
|
{
|
2012-11-24 22:17:13 +01:00
|
|
|
if (!str)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2009-08-14 08:00:50 +02:00
|
|
|
if (0)
|
|
|
|
/* nada */;
|
|
|
|
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
|
|
|
|
else if (!strcmp(str, "embed"))
|
|
|
|
pcpu_chosen_fc = PCPU_FC_EMBED;
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
|
|
|
|
else if (!strcmp(str, "page"))
|
|
|
|
pcpu_chosen_fc = PCPU_FC_PAGE;
|
|
|
|
#endif
|
|
|
|
else
|
2016-03-17 22:19:53 +01:00
|
|
|
pr_warn("unknown allocator %s specified\n", str);
|
2009-03-10 08:27:48 +01:00
|
|
|
|
2009-08-14 08:00:50 +02:00
|
|
|
return 0;
|
2009-03-10 08:27:48 +01:00
|
|
|
}
|
2009-08-14 08:00:50 +02:00
|
|
|
early_param("percpu_alloc", percpu_alloc_setup);
|
2009-03-10 08:27:48 +01:00
|
|
|
|
2010-09-09 18:00:15 +02:00
|
|
|
/*
 * The generic percpu setup relies on pcpu_embed_first_chunk().  Build
 * it when the arch config asks for it or when the generic setup is
 * going to be used.
 */
#if !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) || \
	defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK)
#define BUILD_EMBED_FIRST_CHUNK
#endif

/* build pcpu_page_first_chunk() iff needed by the arch config */
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
#define BUILD_PAGE_FIRST_CHUNK
#endif
|
|
|
|
|
|
|
|
/* pcpu_build_alloc_info() is used by both embed and page first chunk */
|
|
|
|
#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
|
|
|
|
/**
|
|
|
|
* pcpu_build_alloc_info - build alloc_info considering distances between CPUs
|
|
|
|
* @reserved_size: the size of reserved percpu area in bytes
|
|
|
|
* @dyn_size: minimum free size for dynamic allocation in bytes
|
|
|
|
* @atom_size: allocation atom size
|
|
|
|
* @cpu_distance_fn: callback to determine distance between cpus, optional
|
|
|
|
*
|
|
|
|
* This function determines grouping of units, their mappings to cpus
|
|
|
|
* and other parameters considering needed percpu size, allocation
|
|
|
|
* atom size and distances between CPUs.
|
|
|
|
*
|
2015-03-06 23:30:42 +01:00
|
|
|
* Groups are always multiples of atom size and CPUs which are of
|
2010-09-09 18:00:15 +02:00
|
|
|
* LOCAL_DISTANCE both ways are grouped together and share space for
|
|
|
|
* units in the same group. The returned configuration is guaranteed
|
|
|
|
* to have CPUs on different nodes on different groups and >=75% usage
|
|
|
|
* of allocated virtual address space.
|
|
|
|
*
|
|
|
|
* RETURNS:
|
|
|
|
* On success, pointer to the new allocation_info is returned. On
|
|
|
|
* failure, ERR_PTR value is returned.
|
|
|
|
*/
|
|
|
|
static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
|
|
|
|
size_t reserved_size, size_t dyn_size,
|
|
|
|
size_t atom_size,
|
|
|
|
pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
|
|
|
|
{
|
|
|
|
static int group_map[NR_CPUS] __initdata;
|
|
|
|
static int group_cnt[NR_CPUS] __initdata;
|
|
|
|
const size_t static_size = __per_cpu_end - __per_cpu_start;
|
|
|
|
int nr_groups = 1, nr_units = 0;
|
|
|
|
size_t size_sum, min_unit_size, alloc_size;
|
|
|
|
int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
|
|
|
|
int last_allocs, group, unit;
|
|
|
|
unsigned int cpu, tcpu;
|
|
|
|
struct pcpu_alloc_info *ai;
|
|
|
|
unsigned int *cpu_map;
|
|
|
|
|
|
|
|
/* this function may be called multiple times */
|
|
|
|
memset(group_map, 0, sizeof(group_map));
|
|
|
|
memset(group_cnt, 0, sizeof(group_cnt));
|
|
|
|
|
|
|
|
/* calculate size_sum and ensure dyn_size is enough for early alloc */
|
|
|
|
size_sum = PFN_ALIGN(static_size + reserved_size +
|
|
|
|
max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
|
|
|
|
dyn_size = size_sum - static_size - reserved_size;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine min_unit_size, alloc_size and max_upa such that
|
|
|
|
* alloc_size is multiple of atom_size and is the smallest
|
2011-03-31 03:57:33 +02:00
|
|
|
* which can accommodate 4k aligned segments which are equal to
|
2010-09-09 18:00:15 +02:00
|
|
|
* or larger than min_unit_size.
|
|
|
|
*/
|
|
|
|
min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
|
|
|
|
|
|
|
|
alloc_size = roundup(min_unit_size, atom_size);
|
|
|
|
upa = alloc_size / min_unit_size;
|
2015-11-06 03:46:43 +01:00
|
|
|
while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
|
2010-09-09 18:00:15 +02:00
|
|
|
upa--;
|
|
|
|
max_upa = upa;
|
|
|
|
|
|
|
|
/* group cpus according to their proximity */
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
|
|
group = 0;
|
|
|
|
next_group:
|
|
|
|
for_each_possible_cpu(tcpu) {
|
|
|
|
if (cpu == tcpu)
|
|
|
|
break;
|
|
|
|
if (group_map[tcpu] == group && cpu_distance_fn &&
|
|
|
|
(cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
|
|
|
|
cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
|
|
|
|
group++;
|
|
|
|
nr_groups = max(nr_groups, group + 1);
|
|
|
|
goto next_group;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
group_map[cpu] = group;
|
|
|
|
group_cnt[group]++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Expand unit size until address space usage goes over 75%
|
|
|
|
* and then as much as possible without using more address
|
|
|
|
* space.
|
|
|
|
*/
|
|
|
|
last_allocs = INT_MAX;
|
|
|
|
for (upa = max_upa; upa; upa--) {
|
|
|
|
int allocs = 0, wasted = 0;
|
|
|
|
|
2015-11-06 03:46:43 +01:00
|
|
|
if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
|
2010-09-09 18:00:15 +02:00
|
|
|
continue;
|
|
|
|
|
|
|
|
for (group = 0; group < nr_groups; group++) {
|
|
|
|
int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
|
|
|
|
allocs += this_allocs;
|
|
|
|
wasted += this_allocs * upa - group_cnt[group];
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't accept if wastage is over 1/3. The
|
|
|
|
* greater-than comparison ensures upa==1 always
|
|
|
|
* passes the following check.
|
|
|
|
*/
|
|
|
|
if (wasted > num_possible_cpus() / 3)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* and then don't consume more memory */
|
|
|
|
if (allocs > last_allocs)
|
|
|
|
break;
|
|
|
|
last_allocs = allocs;
|
|
|
|
best_upa = upa;
|
|
|
|
}
|
|
|
|
upa = best_upa;
|
|
|
|
|
|
|
|
/* allocate and fill alloc_info */
|
|
|
|
for (group = 0; group < nr_groups; group++)
|
|
|
|
nr_units += roundup(group_cnt[group], upa);
|
|
|
|
|
|
|
|
ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
|
|
|
|
if (!ai)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
cpu_map = ai->groups[0].cpu_map;
|
|
|
|
|
|
|
|
for (group = 0; group < nr_groups; group++) {
|
|
|
|
ai->groups[group].cpu_map = cpu_map;
|
|
|
|
cpu_map += roundup(group_cnt[group], upa);
|
|
|
|
}
|
|
|
|
|
|
|
|
ai->static_size = static_size;
|
|
|
|
ai->reserved_size = reserved_size;
|
|
|
|
ai->dyn_size = dyn_size;
|
|
|
|
ai->unit_size = alloc_size / upa;
|
|
|
|
ai->atom_size = atom_size;
|
|
|
|
ai->alloc_size = alloc_size;
|
|
|
|
|
|
|
|
for (group = 0, unit = 0; group_cnt[group]; group++) {
|
|
|
|
struct pcpu_group_info *gi = &ai->groups[group];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize base_offset as if all groups are located
|
|
|
|
* back-to-back. The caller should update this to
|
|
|
|
* reflect actual allocation.
|
|
|
|
*/
|
|
|
|
gi->base_offset = unit * ai->unit_size;
|
|
|
|
|
|
|
|
for_each_possible_cpu(cpu)
|
|
|
|
if (group_map[cpu] == group)
|
|
|
|
gi->cpu_map[gi->nr_units++] = cpu;
|
|
|
|
gi->nr_units = roundup(gi->nr_units, upa);
|
|
|
|
unit += gi->nr_units;
|
|
|
|
}
|
|
|
|
BUG_ON(unit != nr_units);
|
|
|
|
|
|
|
|
return ai;
|
|
|
|
}
|
|
|
|
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
|
|
|
|
|
|
|
|
#if defined(BUILD_EMBED_FIRST_CHUNK)
|
2009-03-10 08:27:48 +01:00
|
|
|
/**
|
|
|
|
* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
|
|
|
|
* @reserved_size: the size of reserved percpu area in bytes
|
2010-06-27 18:49:59 +02:00
|
|
|
* @dyn_size: minimum free size for dynamic allocation in bytes
|
2009-08-14 08:00:52 +02:00
|
|
|
* @atom_size: allocation atom size
|
|
|
|
* @cpu_distance_fn: callback to determine distance between cpus, optional
|
|
|
|
* @alloc_fn: function to allocate percpu page
|
2011-03-31 03:57:33 +02:00
|
|
|
* @free_fn: function to free percpu page
|
2009-03-10 08:27:48 +01:00
|
|
|
*
|
|
|
|
* This is a helper to ease setting up embedded first percpu chunk and
|
|
|
|
* can be called where pcpu_setup_first_chunk() is expected.
|
|
|
|
*
|
|
|
|
* If this function is used to setup the first chunk, it is allocated
|
2009-08-14 08:00:52 +02:00
|
|
|
* by calling @alloc_fn and used as-is without being mapped into
|
|
|
|
* vmalloc area. Allocations are always whole multiples of @atom_size
|
|
|
|
* aligned to @atom_size.
|
|
|
|
*
|
|
|
|
* This enables the first chunk to piggy back on the linear physical
|
|
|
|
* mapping which often uses larger page size. Please note that this
|
|
|
|
* can result in very sparse cpu->unit mapping on NUMA machines thus
|
|
|
|
* requiring large vmalloc address space. Don't use this allocator if
|
|
|
|
* vmalloc space is not orders of magnitude larger than distances
|
|
|
|
* between node memory addresses (ie. 32bit NUMA machines).
|
2009-03-10 08:27:48 +01:00
|
|
|
*
|
2010-06-27 18:49:59 +02:00
|
|
|
* @dyn_size specifies the minimum dynamic area size.
|
2009-03-10 08:27:48 +01:00
|
|
|
*
|
|
|
|
* If the needed size is smaller than the minimum or specified unit
|
2009-08-14 08:00:52 +02:00
|
|
|
* size, the leftover is returned using @free_fn.
|
2009-03-10 08:27:48 +01:00
|
|
|
*
|
|
|
|
* RETURNS:
|
2009-08-14 08:00:51 +02:00
|
|
|
* 0 on success, -errno on failure.
|
2009-03-10 08:27:48 +01:00
|
|
|
*/
|
2010-06-27 18:49:59 +02:00
|
|
|
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
|
2009-08-14 08:00:52 +02:00
|
|
|
size_t atom_size,
|
|
|
|
pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
|
|
|
|
pcpu_fc_alloc_fn_t alloc_fn,
|
|
|
|
pcpu_fc_free_fn_t free_fn)
|
2009-03-10 08:27:48 +01:00
|
|
|
{
|
2009-08-14 08:00:52 +02:00
|
|
|
void *base = (void *)ULONG_MAX;
|
|
|
|
void **areas = NULL;
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
struct pcpu_alloc_info *ai;
|
2016-10-05 15:19:11 +02:00
|
|
|
size_t size_sum, areas_size;
|
|
|
|
unsigned long max_distance;
|
mm/percpu.c: fix potential memory leakage for pcpu_embed_first_chunk()
in order to ensure the percpu group areas within a chunk aren't
distributed too sparsely, pcpu_embed_first_chunk() goes to error handling
path when a chunk spans over 3/4 VMALLOC area, however, during the error
handling, it forget to free the memory allocated for all percpu groups by
going to label @out_free other than @out_free_areas.
it will cause memory leakage issue if the rare scene really happens, in
order to fix the issue, we check chunk spanned area immediately after
completing memory allocation for all percpu groups, we go to label
@out_free_areas to free the memory then return if the checking is failed.
in order to verify the approach, we dump all memory allocated then
enforce the jump then dump all memory freed, the result is okay after
checking whether we free all memory we allocate in this function.
BTW, The approach is chosen after thinking over the below scenes
- we don't go to label @out_free directly to fix this issue since we
maybe free several allocated memory blocks twice
- the aim of jumping after pcpu_setup_first_chunk() is bypassing free
usable memory other than handling error, moreover, the function does
not return error code in any case, it either panics due to BUG_ON()
or return 0.
Signed-off-by: zijun_hu <zijun_hu@htc.com>
Tested-by: zijun_hu <zijun_hu@htc.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-10-05 15:30:24 +02:00
|
|
|
int group, i, highest_group, rc;
|
2009-03-10 08:27:48 +01:00
|
|
|
|
2009-08-14 08:00:52 +02:00
|
|
|
ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
|
|
|
|
cpu_distance_fn);
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguos area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
if (IS_ERR(ai))
|
|
|
|
return PTR_ERR(ai);
|
2009-03-10 08:27:48 +01:00
|
|
|
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguos area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
|
2009-08-14 08:00:52 +02:00
|
|
|
areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
|
2009-06-22 04:56:24 +02:00
|
|
|
|
2014-01-22 00:50:40 +01:00
|
|
|
areas = memblock_virt_alloc_nopanic(areas_size, 0);
|
2009-08-14 08:00:52 +02:00
|
|
|
if (!areas) {
|
2009-08-14 08:00:51 +02:00
|
|
|
rc = -ENOMEM;
|
2009-08-14 08:00:52 +02:00
|
|
|
goto out_free;
|
2009-06-22 04:56:24 +02:00
|
|
|
}
|
2009-03-10 08:27:48 +01:00
|
|
|
|
mm/percpu.c: fix potential memory leakage for pcpu_embed_first_chunk()
in order to ensure the percpu group areas within a chunk aren't
distributed too sparsely, pcpu_embed_first_chunk() goes to error handling
path when a chunk spans over 3/4 VMALLOC area, however, during the error
handling, it forget to free the memory allocated for all percpu groups by
going to label @out_free other than @out_free_areas.
it will cause memory leakage issue if the rare scene really happens, in
order to fix the issue, we check chunk spanned area immediately after
completing memory allocation for all percpu groups, we go to label
@out_free_areas to free the memory then return if the checking is failed.
in order to verify the approach, we dump all memory allocated then
enforce the jump then dump all memory freed, the result is okay after
checking whether we free all memory we allocate in this function.
BTW, The approach is chosen after thinking over the below scenes
- we don't go to label @out_free directly to fix this issue since we
maybe free several allocated memory blocks twice
- the aim of jumping after pcpu_setup_first_chunk() is bypassing free
usable memory other than handling error, moreover, the function does
not return error code in any case, it either panics due to BUG_ON()
or return 0.
Signed-off-by: zijun_hu <zijun_hu@htc.com>
Tested-by: zijun_hu <zijun_hu@htc.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-10-05 15:30:24 +02:00
|
|
|
/* allocate, copy and determine base address & max_distance */
|
|
|
|
highest_group = 0;
|
2009-08-14 08:00:52 +02:00
|
|
|
for (group = 0; group < ai->nr_groups; group++) {
|
|
|
|
struct pcpu_group_info *gi = &ai->groups[group];
|
|
|
|
unsigned int cpu = NR_CPUS;
|
|
|
|
void *ptr;
|
|
|
|
|
|
|
|
for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
|
|
|
|
cpu = gi->cpu_map[i];
|
|
|
|
BUG_ON(cpu == NR_CPUS);
|
|
|
|
|
|
|
|
/* allocate space for the whole group */
|
|
|
|
ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
|
|
|
|
if (!ptr) {
|
|
|
|
rc = -ENOMEM;
|
|
|
|
goto out_free_areas;
|
|
|
|
}
|
2011-09-26 18:12:53 +02:00
|
|
|
/* kmemleak tracks the percpu allocations separately */
|
|
|
|
kmemleak_free(ptr);
|
2009-08-14 08:00:52 +02:00
|
|
|
areas[group] = ptr;
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguos area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
|
2009-08-14 08:00:52 +02:00
|
|
|
base = min(ptr, base);
|
mm/percpu.c: fix potential memory leakage for pcpu_embed_first_chunk()
in order to ensure the percpu group areas within a chunk aren't
distributed too sparsely, pcpu_embed_first_chunk() goes to error handling
path when a chunk spans over 3/4 VMALLOC area, however, during the error
handling, it forget to free the memory allocated for all percpu groups by
going to label @out_free other than @out_free_areas.
it will cause memory leakage issue if the rare scene really happens, in
order to fix the issue, we check chunk spanned area immediately after
completing memory allocation for all percpu groups, we go to label
@out_free_areas to free the memory then return if the checking is failed.
in order to verify the approach, we dump all memory allocated then
enforce the jump then dump all memory freed, the result is okay after
checking whether we free all memory we allocate in this function.
BTW, The approach is chosen after thinking over the below scenes
- we don't go to label @out_free directly to fix this issue since we
maybe free several allocated memory blocks twice
- the aim of jumping after pcpu_setup_first_chunk() is bypassing free
usable memory other than handling error, moreover, the function does
not return error code in any case, it either panics due to BUG_ON()
or return 0.
Signed-off-by: zijun_hu <zijun_hu@htc.com>
Tested-by: zijun_hu <zijun_hu@htc.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-10-05 15:30:24 +02:00
|
|
|
if (ptr > areas[highest_group])
|
|
|
|
highest_group = group;
|
|
|
|
}
|
|
|
|
max_distance = areas[highest_group] - base;
|
|
|
|
max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
|
|
|
|
|
|
|
|
/* warn if maximum distance is further than 75% of vmalloc space */
|
|
|
|
if (max_distance > VMALLOC_TOTAL * 3 / 4) {
|
|
|
|
pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
|
|
|
|
max_distance, VMALLOC_TOTAL);
|
|
|
|
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
|
|
|
|
/* and fail if we have fallback */
|
|
|
|
rc = -EINVAL;
|
|
|
|
goto out_free_areas;
|
|
|
|
#endif
|
2012-04-27 17:42:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Copy data and free unused parts. This should happen after all
|
|
|
|
* allocations are complete; otherwise, we may end up with
|
|
|
|
* overlapping groups.
|
|
|
|
*/
|
|
|
|
for (group = 0; group < ai->nr_groups; group++) {
|
|
|
|
struct pcpu_group_info *gi = &ai->groups[group];
|
|
|
|
void *ptr = areas[group];
|
2009-08-14 08:00:52 +02:00
|
|
|
|
|
|
|
for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
|
|
|
|
if (gi->cpu_map[i] == NR_CPUS) {
|
|
|
|
/* unused unit, free whole */
|
|
|
|
free_fn(ptr, ai->unit_size);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* copy and return the unused part */
|
|
|
|
memcpy(ptr, __per_cpu_load, ai->static_size);
|
|
|
|
free_fn(ptr + size_sum, ai->unit_size - size_sum);
|
|
|
|
}
|
2009-06-22 04:56:24 +02:00
|
|
|
}
|
2009-03-10 08:27:48 +01:00
|
|
|
|
2009-08-14 08:00:52 +02:00
|
|
|
/* base address is now known, determine group base offsets */
|
2009-09-24 11:46:01 +02:00
|
|
|
for (group = 0; group < ai->nr_groups; group++) {
|
2009-08-14 08:00:52 +02:00
|
|
|
ai->groups[group].base_offset = areas[group] - base;
|
2009-09-24 11:46:01 +02:00
|
|
|
}
|
2009-08-14 08:00:52 +02:00
|
|
|
|
2016-03-17 22:19:53 +01:00
|
|
|
pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguos area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
|
|
|
|
ai->dyn_size, ai->unit_size);
|
2009-07-04 01:10:59 +02:00
|
|
|
|
2009-08-14 08:00:51 +02:00
|
|
|
rc = pcpu_setup_first_chunk(ai, base);
|
2009-08-14 08:00:52 +02:00
|
|
|
goto out_free;
|
|
|
|
|
|
|
|
out_free_areas:
|
|
|
|
for (group = 0; group < ai->nr_groups; group++)
|
2013-09-17 16:57:34 +02:00
|
|
|
if (areas[group])
|
|
|
|
free_fn(areas[group],
|
|
|
|
ai->groups[group].nr_units * ai->unit_size);
|
2009-08-14 08:00:52 +02:00
|
|
|
out_free:
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguos area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
pcpu_free_alloc_info(ai);
|
2009-08-14 08:00:52 +02:00
|
|
|
if (areas)
|
2014-01-22 00:50:40 +01:00
|
|
|
memblock_free_early(__pa(areas), areas_size);
|
2009-08-14 08:00:51 +02:00
|
|
|
return rc;
|
2009-07-04 01:10:59 +02:00
|
|
|
}
|
2010-09-09 18:00:15 +02:00
|
|
|
#endif /* BUILD_EMBED_FIRST_CHUNK */
|
2009-07-04 01:10:59 +02:00
|
|
|
|
2010-09-09 18:00:15 +02:00
|
|
|
#ifdef BUILD_PAGE_FIRST_CHUNK
|
2009-07-04 01:10:59 +02:00
|
|
|
/**
|
2009-08-14 08:00:49 +02:00
|
|
|
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
|
2009-07-04 01:10:59 +02:00
|
|
|
* @reserved_size: the size of reserved percpu area in bytes
|
|
|
|
* @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
|
2011-03-31 03:57:33 +02:00
|
|
|
* @free_fn: function to free percpu page, always called with PAGE_SIZE
|
2009-07-04 01:10:59 +02:00
|
|
|
* @populate_pte_fn: function to populate pte
|
|
|
|
*
|
2009-08-14 08:00:49 +02:00
|
|
|
* This is a helper to ease setting up page-remapped first percpu
|
|
|
|
* chunk and can be called where pcpu_setup_first_chunk() is expected.
|
2009-07-04 01:10:59 +02:00
|
|
|
*
|
|
|
|
* This is the basic allocator. Static percpu area is allocated
|
|
|
|
* page-by-page into vmalloc area.
|
|
|
|
*
|
|
|
|
* RETURNS:
|
2009-08-14 08:00:51 +02:00
|
|
|
* 0 on success, -errno on failure.
|
2009-07-04 01:10:59 +02:00
|
|
|
*/
|
2009-08-14 08:00:51 +02:00
|
|
|
int __init pcpu_page_first_chunk(size_t reserved_size,
|
|
|
|
pcpu_fc_alloc_fn_t alloc_fn,
|
|
|
|
pcpu_fc_free_fn_t free_fn,
|
|
|
|
pcpu_fc_populate_pte_fn_t populate_pte_fn)
|
2009-07-04 01:10:59 +02:00
|
|
|
{
|
2009-07-04 01:10:59 +02:00
|
|
|
static struct vm_struct vm;
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
struct pcpu_alloc_info *ai;
|
2009-08-14 08:00:49 +02:00
|
|
|
char psize_str[16];
|
2009-07-04 01:11:00 +02:00
|
|
|
int unit_pages;
|
2009-07-04 01:10:59 +02:00
|
|
|
size_t pages_size;
|
2009-07-04 01:11:00 +02:00
|
|
|
struct page **pages;
|
2009-08-14 08:00:51 +02:00
|
|
|
int unit, i, j, rc;
|
2016-12-13 01:45:02 +01:00
|
|
|
int upa;
|
|
|
|
int nr_g0_units;
|
2009-07-04 01:10:59 +02:00
|
|
|
|
2009-08-14 08:00:49 +02:00
|
|
|
snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
|
|
|
|
|
2010-06-27 18:49:59 +02:00
|
|
|
ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
if (IS_ERR(ai))
|
|
|
|
return PTR_ERR(ai);
|
|
|
|
BUG_ON(ai->nr_groups != 1);
|
2016-12-13 01:45:02 +01:00
|
|
|
upa = ai->alloc_size/ai->unit_size;
|
|
|
|
nr_g0_units = roundup(num_possible_cpus(), upa);
|
|
|
|
if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) {
|
|
|
|
pcpu_free_alloc_info(ai);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
|
|
|
|
unit_pages = ai->unit_size >> PAGE_SHIFT;
|
2009-07-04 01:10:59 +02:00
|
|
|
|
|
|
|
/* unaligned allocations can't be freed, round up to page size */
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
|
|
|
|
sizeof(pages[0]));
|
2014-01-22 00:50:40 +01:00
|
|
|
pages = memblock_virt_alloc(pages_size, 0);
|
2009-07-04 01:10:59 +02:00
|
|
|
|
2009-07-04 01:10:59 +02:00
|
|
|
/* allocate pages */
|
2009-07-04 01:10:59 +02:00
|
|
|
j = 0;
|
2016-12-13 01:45:02 +01:00
|
|
|
for (unit = 0; unit < num_possible_cpus(); unit++) {
|
|
|
|
unsigned int cpu = ai->groups[0].cpu_map[unit];
|
2009-07-04 01:11:00 +02:00
|
|
|
for (i = 0; i < unit_pages; i++) {
|
2009-07-04 01:10:59 +02:00
|
|
|
void *ptr;
|
|
|
|
|
2009-08-14 08:00:50 +02:00
|
|
|
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
|
2009-07-04 01:10:59 +02:00
|
|
|
if (!ptr) {
|
2016-03-17 22:19:53 +01:00
|
|
|
pr_warn("failed to allocate %s page for cpu%u\n",
|
2016-12-13 01:45:02 +01:00
|
|
|
psize_str, cpu);
|
2009-07-04 01:10:59 +02:00
|
|
|
goto enomem;
|
|
|
|
}
|
2011-09-26 18:12:53 +02:00
|
|
|
/* kmemleak tracks the percpu allocations separately */
|
|
|
|
kmemleak_free(ptr);
|
2009-07-04 01:11:00 +02:00
|
|
|
pages[j++] = virt_to_page(ptr);
|
2009-07-04 01:10:59 +02:00
|
|
|
}
|
2016-12-13 01:45:02 +01:00
|
|
|
}
|
2009-07-04 01:10:59 +02:00
|
|
|
|
2009-07-04 01:10:59 +02:00
|
|
|
/* allocate vm area, map the pages and copy static data */
|
|
|
|
vm.flags = VM_ALLOC;
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
vm.size = num_possible_cpus() * ai->unit_size;
|
2009-07-04 01:10:59 +02:00
|
|
|
vm_area_register_early(&vm, PAGE_SIZE);
|
|
|
|
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
for (unit = 0; unit < num_possible_cpus(); unit++) {
|
2009-08-14 08:00:50 +02:00
|
|
|
unsigned long unit_addr =
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
(unsigned long)vm.addr + unit * ai->unit_size;
|
2009-07-04 01:10:59 +02:00
|
|
|
|
2009-07-04 01:11:00 +02:00
|
|
|
for (i = 0; i < unit_pages; i++)
|
2009-07-04 01:10:59 +02:00
|
|
|
populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
|
|
|
|
|
|
|
|
/* pte already populated, the following shouldn't fail */
|
2009-08-14 08:00:51 +02:00
|
|
|
rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
|
|
|
|
unit_pages);
|
|
|
|
if (rc < 0)
|
|
|
|
panic("failed to map percpu area, err=%d\n", rc);
|
2009-03-10 08:27:48 +01:00
|
|
|
|
2009-07-04 01:10:59 +02:00
|
|
|
/*
|
|
|
|
* FIXME: Archs with virtual cache should flush local
|
|
|
|
* cache for the linear mapping here - something
|
|
|
|
* equivalent to flush_cache_vmap() on the local cpu.
|
|
|
|
* flush_cache_vmap() can't be used as most supporting
|
|
|
|
* data structures are not set up yet.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* copy static data */
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
|
2009-03-10 08:27:48 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/* we're ready, commit */
|
2016-03-17 22:19:53 +01:00
|
|
|
pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n",
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
unit_pages, psize_str, vm.addr, ai->static_size,
|
|
|
|
ai->reserved_size, ai->dyn_size);
|
2009-07-04 01:10:59 +02:00
|
|
|
|
2009-08-14 08:00:51 +02:00
|
|
|
rc = pcpu_setup_first_chunk(ai, vm.addr);
|
2009-07-04 01:10:59 +02:00
|
|
|
goto out_free_ar;
|
|
|
|
|
|
|
|
enomem:
|
|
|
|
while (--j >= 0)
|
2009-07-04 01:11:00 +02:00
|
|
|
free_fn(page_address(pages[j]), PAGE_SIZE);
|
2009-08-14 08:00:51 +02:00
|
|
|
rc = -ENOMEM;
|
2009-07-04 01:10:59 +02:00
|
|
|
out_free_ar:
|
2014-01-22 00:50:40 +01:00
|
|
|
memblock_free_early(__pa(pages), pages_size);
|
percpu: introduce pcpu_alloc_info and pcpu_group_info
Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguous area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array. For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem. Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.
This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus. pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.
pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.
* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
help dealing with pcpu_alloc_info.
* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
generalized and renamed to pcpu_build_alloc_info().
@cpu_distance_fn may be NULL indicating that all cpus are of
LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
generalized and renamed to pcpu_dump_alloc_info(). It now also
prints which group each alloc unit belongs to.
* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
separate parameters. All first chunk allocators are updated to use
pcpu_build_alloc_info() to build alloc_info and call
pcpu_setup_first_chunk() with it. This has the side effect of
packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are
possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.
* x86 setup_pcpu_lpage() is updated to deal with alloc_info.
* sparc64 setup_per_cpu_areas() is updated to build alloc_info.
Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus. It mostly changes how information is passed among
initialization functions and makes room for more flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
2009-08-14 08:00:51 +02:00
|
|
|
pcpu_free_alloc_info(ai);
|
2009-08-14 08:00:51 +02:00
|
|
|
return rc;
|
2009-07-04 01:10:59 +02:00
|
|
|
}
|
2010-09-09 18:00:15 +02:00
|
|
|
#endif /* BUILD_PAGE_FIRST_CHUNK */
|
2009-07-04 01:10:59 +02:00
|
|
|
|
2010-09-03 18:22:48 +02:00
|
|
|
#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
|
2009-03-30 12:07:44 +02:00
|
|
|
/*
|
2010-09-03 18:22:48 +02:00
|
|
|
* Generic SMP percpu area setup.
|
2009-03-30 12:07:44 +02:00
|
|
|
*
|
|
|
|
* The embedding helper is used because its behavior closely resembles
|
|
|
|
* the original non-dynamic generic percpu area setup. This is
|
|
|
|
* important because many archs have addressing restrictions and might
|
|
|
|
* fail if the percpu area is located far away from the previous
|
|
|
|
* location. As an added bonus, in non-NUMA cases, embedding is
|
|
|
|
* generally a good idea TLB-wise because percpu area can piggy back
|
|
|
|
* on the physical linear memory mapping which uses large page
|
|
|
|
* mappings on applicable archs.
|
|
|
|
*/
|
|
|
|
/*
 * Per-cpu offset table, indexed by cpu number.  The generic
 * setup_per_cpu_areas() in this file fills each entry with
 * (pcpu_base_addr - __per_cpu_start) + pcpu_unit_offsets[cpu].
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
|
|
|
|
EXPORT_SYMBOL(__per_cpu_offset);
|
|
|
|
|
2009-08-14 08:00:52 +02:00
|
|
|
/*
 * Default first chunk allocation callback, passed to
 * pcpu_embed_first_chunk() by the generic setup_per_cpu_areas().
 *
 * @cpu:   cpu the memory is for; not used by this implementation
 * @size:  number of bytes to allocate
 * @align: required alignment of the allocation
 *
 * Returns whatever memblock_virt_alloc_from_nopanic() returns.  The
 * third argument, __pa(MAX_DMA_ADDRESS), is presumably the lowest
 * acceptable physical address - confirm against the memblock API.
 */
static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
				       size_t align)
{
	void *area;

	area = memblock_virt_alloc_from_nopanic(size, align,
						__pa(MAX_DMA_ADDRESS));
	return area;
}
|
2009-03-10 08:27:48 +01:00
|
|
|
|
2009-08-14 08:00:52 +02:00
|
|
|
/*
 * Default first chunk free callback, the counterpart of
 * pcpu_dfl_fc_alloc().
 *
 * @ptr:  virtual address of the area to release
 * @size: size of the area in bytes
 *
 * The area is handed back via memblock_free_early(), which takes the
 * physical address, hence the __pa() conversion.
 */
static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
	memblock_free_early(__pa(ptr), size);
}
|
|
|
|
|
2009-03-30 12:07:44 +02:00
|
|
|
/*
 * Generic SMP implementation: build the first chunk with the embedding
 * helper and then publish the per-cpu offsets in __per_cpu_offset[].
 * Panics if the first chunk cannot be set up.
 */
void __init setup_per_cpu_areas(void)
{
	unsigned long base_delta;
	unsigned int cpu;
	int err;

	/*
	 * The module reserve is requested unconditionally to match what
	 * the legacy allocator used to provide.
	 */
	err = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
				     PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
				     pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
	if (err < 0)
		panic("Failed to initialize percpu areas.");

	/* distance from the static percpu section to the chunk base */
	base_delta = (unsigned long)pcpu_base_addr -
		     (unsigned long)__per_cpu_start;

	for_each_possible_cpu(cpu) {
		__per_cpu_offset[cpu] = base_delta + pcpu_unit_offsets[cpu];
	}
}
|
2010-09-03 18:22:48 +02:00
|
|
|
#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
|
|
|
|
|
|
|
|
#else /* CONFIG_SMP */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* UP percpu area setup.
|
|
|
|
*
|
|
|
|
* UP always uses km-based percpu allocator with identity mapping.
|
|
|
|
* Static percpu variables are indistinguishable from the usual static
|
|
|
|
* variables and don't require any special preparation.
|
|
|
|
*/
|
|
|
|
/*
 * UP implementation: describe a single group containing a single unit
 * mapped to cpu0, back it with one memblock allocation and hand it to
 * pcpu_setup_first_chunk().  Panics if either allocation or the first
 * chunk setup fails.
 */
void __init setup_per_cpu_areas(void)
{
	const size_t unit_size =
		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
					 PERCPU_DYNAMIC_RESERVE));
	struct pcpu_alloc_info *ai;
	void *fc;

	ai = pcpu_alloc_alloc_info(1, 1);
	fc = memblock_virt_alloc_from_nopanic(unit_size,
					      PAGE_SIZE,
					      __pa(MAX_DMA_ADDRESS));
	if (!ai || !fc)
		panic("Failed to allocate memory for percpu areas.");
	/* kmemleak tracks the percpu allocations separately */
	kmemleak_free(fc);

	/* the whole unit is dynamic area; there is exactly one unit */
	ai->dyn_size = unit_size;
	ai->unit_size = unit_size;
	ai->atom_size = unit_size;
	ai->alloc_size = unit_size;
	ai->groups[0].nr_units = 1;
	ai->groups[0].cpu_map[0] = 0;

	if (pcpu_setup_first_chunk(ai, fc) < 0)
		panic("Failed to initialize percpu areas.");

	/*
	 * @ai is only a temporary description of the layout.  Release it
	 * once the first chunk is set up, as the other first chunk
	 * builders in this file do, instead of leaking it.
	 */
	pcpu_free_alloc_info(ai);
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_SMP */
|
2010-06-27 18:50:00 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* First and reserved chunks are initialized with temporary allocation
|
|
|
|
* map in initdata so that they can be used before slab is online.
|
|
|
|
* This function is called after slab is brought up and replaces those
|
|
|
|
* with properly allocated maps.
|
|
|
|
*/
|
|
|
|
/*
 * Replace the temporary allocation maps of the first and reserved
 * chunks with maps obtained from pcpu_mem_zalloc().  Each old map's
 * contents are copied over and the chunk is switched to the new map
 * while holding pcpu_lock.
 */
void __init percpu_init_late(void)
{
	struct pcpu_chunk *early_chunks[] =
		{ pcpu_first_chunk, pcpu_reserved_chunk, NULL };
	struct pcpu_chunk **pos;
	unsigned long irq_flags;

	/* the array is NULL terminated; stop at the first NULL entry */
	for (pos = early_chunks; *pos; pos++) {
		struct pcpu_chunk *chunk = *pos;
		int *new_map;
		const size_t map_bytes =
			PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(new_map[0]);

		BUILD_BUG_ON(map_bytes > PAGE_SIZE);

		new_map = pcpu_mem_zalloc(map_bytes);
		BUG_ON(!new_map);

		/* copy and switch under the lock so the map stays coherent */
		spin_lock_irqsave(&pcpu_lock, irq_flags);
		memcpy(new_map, chunk->map, map_bytes);
		chunk->map = new_map;
		spin_unlock_irqrestore(&pcpu_lock, irq_flags);
	}
}
|
2014-09-02 20:46:05 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Percpu allocator is initialized early during boot when neither slab nor
|
|
|
|
* workqueue is available. Plug async management until everything is up
|
|
|
|
* and running.
|
|
|
|
*/
|
|
|
|
/*
 * Initcall that sets pcpu_async_enabled.  Registered through
 * subsys_initcall(); always reports success.
 */
static int __init percpu_enable_async(void)
{
	const int ret = 0;

	pcpu_async_enabled = true;

	return ret;
}
|
|
|
|
subsys_initcall(percpu_enable_async);
|