mm/memblock: add extra "flags" to memblock to allow selection of memory based on attribute

Some high end Intel Xeon systems report uncorrectable memory errors as a
recoverable machine check.  Linux has included code for some time to
process these and just signal the affected processes (or even recover
completely if the error was in a read only page that can be replaced by
reading from disk).

But we have no recovery path for errors encountered during kernel code
execution.  Except for some very specific cases were are unlikely to ever
be able to recover.

Enter memory mirroring. Actually 3rd generation of memory mirroing.

Gen1: All memory is mirrored
	Pro: No s/w enabling - h/w just gets good data from other side of the
	     mirror
	Con: Halves effective memory capacity available to OS/applications

Gen2: Partial memory mirror - just mirror memory begind some memory controllers
	Pro: Keep more of the capacity
	Con: Nightmare to enable. Have to choose between allocating from
	     mirrored memory for safety vs. NUMA local memory for performance

Gen3: Address range partial memory mirror - some mirror on each memory
      controller
	Pro: Can tune the amount of mirror and keep NUMA performance
	Con: I have to write memory management code to implement

The current plan is just to use mirrored memory for kernel allocations.
This has been broken into two phases:

1) This patch series - find the mirrored memory, use it for boot time
   allocations

2) Wade into mm/page_alloc.c and define a ZONE_MIRROR to pick up the
   unused mirrored memory from mm/memblock.c and only give it out to
   select kernel allocations (this is still being scoped because
   page_alloc.c is scary).

This patch (of 3):

Add extra "flags" to memblock to allow selection of memory based on
attribute.  No functional changes

Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Xiexiuqi <xiexiuqi@huawei.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Tony Luck 2015-06-24 16:58:09 -07:00 committed by Linus Torvalds
parent 6afdb859b7
commit fc6daaf931
10 changed files with 83 additions and 47 deletions

View File

@ -33,11 +33,12 @@ static struct memblock_type oldmem_type = {
};
#define for_each_dump_mem_range(i, nid, p_start, p_end, p_nid) \
for (i = 0, __next_mem_range(&i, nid, &memblock.physmem, \
for (i = 0, __next_mem_range(&i, nid, MEMBLOCK_NONE, \
&memblock.physmem, \
&oldmem_type, p_start, \
p_end, p_nid); \
i != (u64)ULLONG_MAX; \
__next_mem_range(&i, nid, &memblock.physmem, \
__next_mem_range(&i, nid, MEMBLOCK_NONE, &memblock.physmem,\
&oldmem_type, \
p_start, p_end, p_nid))

View File

@ -1966,7 +1966,8 @@ static phys_addr_t __init available_memory(void)
phys_addr_t pa_start, pa_end;
u64 i;
for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL)
for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start,
&pa_end, NULL)
available = available + (pa_end - pa_start);
return available;
@ -1992,7 +1993,8 @@ static void __init reduce_memory(phys_addr_t limit_ram)
if (limit_ram >= avail_ram)
return;
for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL) {
for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start,
&pa_end, NULL) {
phys_addr_t region_size = pa_end - pa_start;
phys_addr_t clip_start = pa_start;

View File

@ -91,7 +91,8 @@ void __init setup_bios_corruption_check(void)
corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
NULL) {
start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
PAGE_SIZE, corruption_check_size);
end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),

View File

@ -1123,7 +1123,8 @@ void __init memblock_find_dma_reserve(void)
nr_pages += end_pfn - start_pfn;
}
for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
NULL) {
start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
if (start_pfn < end_pfn)

View File

@ -433,7 +433,7 @@ void __init add_highpages_with_active_regions(int nid,
phys_addr_t start, end;
u64 i;
for_each_free_mem_range(i, nid, &start, &end, NULL) {
for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
start_pfn, end_pfn);
unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),

View File

@ -21,7 +21,10 @@
#define INIT_PHYSMEM_REGIONS 4
/* Definition of memblock flags. */
#define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */
enum {
MEMBLOCK_NONE = 0x0, /* No special request */
MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */
};
struct memblock_region {
phys_addr_t base;
@ -61,7 +64,7 @@ extern bool movable_node_enabled;
phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
phys_addr_t start, phys_addr_t end,
int nid);
int nid, ulong flags);
phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align);
phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
@ -85,11 +88,13 @@ int memblock_remove_range(struct memblock_type *type,
phys_addr_t base,
phys_addr_t size);
void __next_mem_range(u64 *idx, int nid, struct memblock_type *type_a,
void __next_mem_range(u64 *idx, int nid, ulong flags,
struct memblock_type *type_a,
struct memblock_type *type_b, phys_addr_t *out_start,
phys_addr_t *out_end, int *out_nid);
void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
struct memblock_type *type_a,
struct memblock_type *type_b, phys_addr_t *out_start,
phys_addr_t *out_end, int *out_nid);
@ -100,16 +105,17 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
* @type_a: ptr to memblock_type to iterate
* @type_b: ptr to memblock_type which excludes from the iteration
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*/
#define for_each_mem_range(i, type_a, type_b, nid, \
#define for_each_mem_range(i, type_a, type_b, nid, flags, \
p_start, p_end, p_nid) \
for (i = 0, __next_mem_range(&i, nid, type_a, type_b, \
for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b, \
p_start, p_end, p_nid); \
i != (u64)ULLONG_MAX; \
__next_mem_range(&i, nid, type_a, type_b, \
__next_mem_range(&i, nid, flags, type_a, type_b, \
p_start, p_end, p_nid))
/**
@ -119,17 +125,18 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
* @type_a: ptr to memblock_type to iterate
* @type_b: ptr to memblock_type which excludes from the iteration
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*/
#define for_each_mem_range_rev(i, type_a, type_b, nid, \
#define for_each_mem_range_rev(i, type_a, type_b, nid, flags, \
p_start, p_end, p_nid) \
for (i = (u64)ULLONG_MAX, \
__next_mem_range_rev(&i, nid, type_a, type_b, \
__next_mem_range_rev(&i, nid, flags, type_a, type_b,\
p_start, p_end, p_nid); \
i != (u64)ULLONG_MAX; \
__next_mem_range_rev(&i, nid, type_a, type_b, \
__next_mem_range_rev(&i, nid, flags, type_a, type_b, \
p_start, p_end, p_nid))
#ifdef CONFIG_MOVABLE_NODE
@ -181,13 +188,14 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
* @flags: pick from blocks based on memory attributes
*
* Walks over free (memory && !reserved) areas of memblock. Available as
* soon as memblock is initialized.
*/
#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid) \
#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \
for_each_mem_range(i, &memblock.memory, &memblock.reserved, \
nid, p_start, p_end, p_nid)
nid, flags, p_start, p_end, p_nid)
/**
* for_each_free_mem_range_reverse - rev-iterate through free memblock areas
@ -196,13 +204,15 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
* @flags: pick from blocks based on memory attributes
*
* Walks over free (memory && !reserved) areas of memblock in reverse
* order. Available as soon as memblock is initialized.
*/
#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \
#define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end, \
p_nid) \
for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
nid, p_start, p_end, p_nid)
nid, flags, p_start, p_end, p_nid)
static inline void memblock_set_region_flags(struct memblock_region *r,
unsigned long flags)
@ -273,7 +283,8 @@ static inline bool memblock_bottom_up(void) { return false; }
#define MEMBLOCK_ALLOC_ACCESSIBLE 0
phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
phys_addr_t start, phys_addr_t end);
phys_addr_t start, phys_addr_t end,
ulong flags);
phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
phys_addr_t max_addr);
phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,

View File

@ -316,13 +316,15 @@ int __init cma_declare_contiguous(phys_addr_t base,
*/
if (base < highmem_start && limit > highmem_start) {
addr = memblock_alloc_range(size, alignment,
highmem_start, limit);
highmem_start, limit,
MEMBLOCK_NONE);
limit = highmem_start;
}
if (!addr) {
addr = memblock_alloc_range(size, alignment, base,
limit);
limit,
MEMBLOCK_NONE);
if (!addr) {
ret = -ENOMEM;
goto err;

View File

@ -107,6 +107,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
* @flags: pick from blocks based on memory attributes
*
* Utility called from memblock_find_in_range_node(), find free area bottom-up.
*
@ -115,12 +116,13 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
*/
static phys_addr_t __init_memblock
__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align, int nid)
phys_addr_t size, phys_addr_t align, int nid,
ulong flags)
{
phys_addr_t this_start, this_end, cand;
u64 i;
for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
@ -139,6 +141,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
* @flags: pick from blocks based on memory attributes
*
* Utility called from memblock_find_in_range_node(), find free area top-down.
*
@ -147,12 +150,14 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
*/
static phys_addr_t __init_memblock
__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align, int nid)
phys_addr_t size, phys_addr_t align, int nid,
ulong flags)
{
phys_addr_t this_start, this_end, cand;
u64 i;
for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end,
NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
@ -174,6 +179,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
* @flags: pick from blocks based on memory attributes
*
* Find @size free area aligned to @align in the specified range and node.
*
@ -190,7 +196,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
*/
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
phys_addr_t end, int nid)
phys_addr_t end, int nid, ulong flags)
{
phys_addr_t kernel_end, ret;
@ -215,7 +221,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
/* ok, try bottom-up allocation first */
ret = __memblock_find_range_bottom_up(bottom_up_start, end,
size, align, nid);
size, align, nid, flags);
if (ret)
return ret;
@ -233,7 +239,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
"memory hotunplug may be affected\n");
}
return __memblock_find_range_top_down(start, end, size, align, nid);
return __memblock_find_range_top_down(start, end, size, align, nid,
flags);
}
/**
@ -253,7 +260,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t align)
{
return memblock_find_in_range_node(size, align, start, end,
NUMA_NO_NODE);
NUMA_NO_NODE, MEMBLOCK_NONE);
}
static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@ -782,6 +789,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
* __next__mem_range - next function for for_each_free_mem_range() etc.
* @idx: pointer to u64 loop variable
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
* @type_a: pointer to memblock_type from where the range is taken
* @type_b: pointer to memblock_type which excludes memory from being taken
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@ -803,7 +811,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
* As both region arrays are sorted, the function advances the two indices
* in lockstep and returns each intersection.
*/
void __init_memblock __next_mem_range(u64 *idx, int nid,
void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
struct memblock_type *type_a,
struct memblock_type *type_b,
phys_addr_t *out_start,
@ -895,6 +903,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
*
* @idx: pointer to u64 loop variable
* @nid: nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
* @type_a: pointer to memblock_type from where the range is taken
* @type_b: pointer to memblock_type which excludes memory from being taken
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@ -903,7 +912,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
*
* Reverse of __next_mem_range().
*/
void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
struct memblock_type *type_a,
struct memblock_type *type_b,
phys_addr_t *out_start,
@ -1050,14 +1059,15 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
phys_addr_t end, int nid)
phys_addr_t end, int nid, ulong flags)
{
phys_addr_t found;
if (!align)
align = SMP_CACHE_BYTES;
found = memblock_find_in_range_node(size, align, start, end, nid);
found = memblock_find_in_range_node(size, align, start, end, nid,
flags);
if (found && !memblock_reserve(found, size)) {
/*
* The min_count is set to 0 so that memblock allocations are
@ -1070,26 +1080,30 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
}
phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
phys_addr_t start, phys_addr_t end)
phys_addr_t start, phys_addr_t end,
ulong flags)
{
return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
flags);
}
static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t max_addr,
int nid)
int nid, ulong flags)
{
return memblock_alloc_range_nid(size, align, 0, max_addr, nid);
return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags);
}
phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
{
return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
nid, MEMBLOCK_NONE);
}
phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE,
MEMBLOCK_NONE);
}
phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@ -1173,13 +1187,14 @@ static void * __init memblock_virt_alloc_internal(
again:
alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
nid);
nid, MEMBLOCK_NONE);
if (alloc)
goto done;
if (nid != NUMA_NO_NODE) {
alloc = memblock_find_in_range_node(size, align, min_addr,
max_addr, NUMA_NO_NODE);
max_addr, NUMA_NO_NODE,
MEMBLOCK_NONE);
if (alloc)
goto done;
}

View File

@ -74,7 +74,8 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
u64 i;
phys_addr_t this_start, this_end;
for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &this_start,
&this_end, NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
if (this_start < this_end) {

View File

@ -41,7 +41,8 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
if (limit > memblock.current_limit)
limit = memblock.current_limit;
addr = memblock_find_in_range_node(size, align, goal, limit, nid);
addr = memblock_find_in_range_node(size, align, goal, limit, nid,
MEMBLOCK_NONE);
if (!addr)
return NULL;
@ -121,7 +122,8 @@ static unsigned long __init free_low_memory_core_early(void)
memblock_clear_hotplug(0, -1);
for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
NULL)
count += __free_memory_core(start, end);
#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK