diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild index 2938934c6518..e256592eb66e 100644 --- a/arch/s390/Kbuild +++ b/arch/s390/Kbuild @@ -6,3 +6,4 @@ obj-$(CONFIG_S390_HYPFS_FS) += hypfs/ obj-$(CONFIG_APPLDATA_BASE) += appldata/ obj-y += net/ obj-$(CONFIG_PCI) += pci/ +obj-$(CONFIG_NUMA) += numa/ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 91e8954f1237..25510adb07d3 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -153,6 +153,10 @@ config S390 select TTY select VIRT_CPU_ACCOUNTING select VIRT_TO_BUS + select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_WANTS_PROT_NUMA_PROT_NONE + select HAVE_ARCH_EARLY_PFN_TO_NID + config SCHED_OMIT_FRAME_POINTER def_bool y @@ -386,6 +390,39 @@ config HOTPLUG_CPU config SCHED_SMT def_bool n +# Some NUMA nodes have memory ranges that span +# other nodes. Even though a pfn is valid and +# between a node's start and end pfns, it may not +# reside on that node. See memmap_init_zone() +# for details. <- They meant memory holes! +config NODES_SPAN_OTHER_NODES + def_bool NUMA + +config NUMA + bool "NUMA support" + depends on SMP && 64BIT && SCHED_TOPOLOGY + default n + help + Enable NUMA support + + This option adds NUMA support to the kernel. + + An operation mode can be selected by appending + numa= to the kernel command line. + + The default behaviour is identical to appending numa=plain to + the command line. This will create just one node with all + available memory and all CPUs in it. + +config NODES_SHIFT + int "Maximum NUMA nodes (as a power of 2)" + range 1 10 + depends on NUMA + default "4" + help + Specify the maximum number of NUMA nodes available on the target + system. Increases memory reserved to accommodate various tables. + config SCHED_MC def_bool n diff --git a/arch/s390/include/asm/mmzone.h b/arch/s390/include/asm/mmzone.h new file mode 100644 index 000000000000..a9e834e60b84 --- /dev/null +++ b/arch/s390/include/asm/mmzone.h @@ -0,0 +1,16 @@ +/* + * NUMA support for s390 + * + * Copyright IBM Corp. 2015 + */ + +#ifndef _ASM_S390_MMZONE_H +#define _ASM_S390_MMZONE_H + +#ifdef CONFIG_NUMA + +extern struct pglist_data *node_data[]; +#define NODE_DATA(nid) (node_data[nid]) + +#endif /* CONFIG_NUMA */ +#endif /* _ASM_S390_MMZONE_H */ diff --git a/arch/s390/include/asm/numa.h b/arch/s390/include/asm/numa.h new file mode 100644 index 000000000000..ea4edbfba9f6 --- /dev/null +++ b/arch/s390/include/asm/numa.h @@ -0,0 +1,31 @@ +/* + * NUMA support for s390 + * + * Declare the NUMA core code structures and functions. + * + * Copyright IBM Corp. 2015 + */ + +#ifndef _ASM_S390_NUMA_H +#define _ASM_S390_NUMA_H + +#ifdef CONFIG_NUMA + +#include +#include + +void numa_setup(void); +int numa_pfn_to_nid(unsigned long pfn); +int __node_distance(int a, int b); +void numa_update_cpu_topology(void); + +extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; +extern int numa_debug_enabled; + +#else + +static inline void numa_setup(void) { } +static inline void numa_update_cpu_topology(void) { } + +#endif /* CONFIG_NUMA */ +#endif /* _ASM_S390_NUMA_H */ diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 9b6545e8f685..34d960353a08 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -192,4 +192,20 @@ void zpci_debug_init_device(struct zpci_dev *); void zpci_debug_exit_device(struct zpci_dev *); void zpci_debug_info(struct zpci_dev *, struct seq_file *); +#ifdef CONFIG_NUMA + +/* Returns the node based on PCI bus */ +static inline int __pcibus_to_node(const struct pci_bus *bus) +{ + return NUMA_NO_NODE; +} + +static inline const struct cpumask * +cpumask_of_pcibus(const struct pci_bus *bus) +{ + return cpu_online_mask; +} + +#endif /* CONFIG_NUMA */ + #endif diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 4990f6c66288..27ebde643933 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -2,6 +2,7 @@ #define _ASM_S390_TOPOLOGY_H #include +#include struct sysinfo_15_1_x; struct cpu; @@ -13,6 +14,7 @@ struct cpu_topology_s390 { unsigned short core_id; unsigned short socket_id; unsigned short book_id; + unsigned short node_id; cpumask_t thread_mask; cpumask_t core_mask; cpumask_t book_mask; @@ -52,6 +54,43 @@ static inline void topology_expect_change(void) { } #define POLARIZATION_VM (2) #define POLARIZATION_VH (3) +#define SD_BOOK_INIT SD_CPU_INIT + +#ifdef CONFIG_NUMA + +#define cpu_to_node cpu_to_node +static inline int cpu_to_node(int cpu) +{ + return per_cpu(cpu_topology, cpu).node_id; +} + +/* Returns a pointer to the cpumask of CPUs on node 'node'. */ +#define cpumask_of_node cpumask_of_node +static inline const struct cpumask *cpumask_of_node(int node) +{ + return node_to_cpumask_map[node]; +} + +/* + * Returns the number of the node containing node 'node'. This + * architecture is flat, so it is a pretty simple function! + */ +#define parent_node(node) (node) + +#define pcibus_to_node(bus) __pcibus_to_node(bus) + +#define node_distance(a, b) __node_distance(a, b) + +#else /* !CONFIG_NUMA */ + +#define numa_node_id numa_node_id +static inline int numa_node_id(void) +{ + return 0; +} + +#endif /* CONFIG_NUMA */ + #include #endif /* _ASM_S390_TOPOLOGY_H */ diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index 91f56b1d8156..ec2bfc83a1e9 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -11,12 +11,12 @@ #define __IGNORE_time -/* Ignore NUMA system calls. Not wired up on s390. */ -#define __IGNORE_mbind -#define __IGNORE_get_mempolicy -#define __IGNORE_set_mempolicy -#define __IGNORE_migrate_pages -#define __IGNORE_move_pages +/* NUMA system calls */ +#define _ARCH_WANT_mbind +#define __ARCH_WANT_get_mempolicy +#define __ARCH_WANT_set_mempolicy +#define __ARCH_WANT_migrate_pages +#define __ARCH_WANT_move_pages /* Ignore system calls that are also reachable via sys_socket */ #define __IGNORE_recvmmsg diff --git a/arch/s390/include/uapi/asm/unistd.h b/arch/s390/include/uapi/asm/unistd.h index 67878af257a0..59d2bb4e2d0c 100644 --- a/arch/s390/include/uapi/asm/unistd.h +++ b/arch/s390/include/uapi/asm/unistd.h @@ -204,9 +204,9 @@ #define __NR_statfs64 265 #define __NR_fstatfs64 266 #define __NR_remap_file_pages 267 -/* Number 268 is reserved for new sys_mbind */ -/* Number 269 is reserved for new sys_get_mempolicy */ -/* Number 270 is reserved for new sys_set_mempolicy */ +#define __NR_mbind 268 +#define __NR_get_mempolicy 269 +#define __NR_set_mempolicy 270 #define __NR_mq_open 271 #define __NR_mq_unlink 272 #define __NR_mq_timedsend 273 @@ -223,7 +223,7 @@ #define __NR_inotify_init 284 #define __NR_inotify_add_watch 285 #define __NR_inotify_rm_watch 286 -/* Number 287 is reserved for new sys_migrate_pages */ +#define __NR_migrate_pages 287 #define __NR_openat 288 #define __NR_mkdirat 289 #define __NR_mknodat 290 @@ -245,7 +245,7 @@ #define __NR_sync_file_range 307 #define __NR_tee 308 #define __NR_vmsplice 309 -/* Number 310 is reserved for new sys_move_pages */ +#define __NR_move_pages 310 #define __NR_getcpu 311 #define __NR_epoll_pwait 312 #define __NR_utimes 313 diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 85a1d4770c9c..af6b0236ccf3 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -62,6 +62,7 @@ #include #include #include +#include #include "entry.h" /* @@ -879,6 +880,7 @@ void __init setup_arch(char **cmdline_p) setup_lowcore(); smp_fill_possible_mask(); cpu_init(); + numa_setup(); /* * Setup capabilities (ELF_HWCAP & ELF_PLATFORM). diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 1acad02681c4..f3f4a137aef6 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -276,9 +276,9 @@ SYSCALL(sys_ni_syscall,compat_sys_s390_fadvise64_64) SYSCALL(sys_statfs64,compat_sys_statfs64) SYSCALL(sys_fstatfs64,compat_sys_fstatfs64) SYSCALL(sys_remap_file_pages,compat_sys_remap_file_pages) -NI_SYSCALL /* 268 sys_mbind */ -NI_SYSCALL /* 269 sys_get_mempolicy */ -NI_SYSCALL /* 270 sys_set_mempolicy */ +SYSCALL(sys_mbind,compat_sys_mbind) +SYSCALL(sys_get_mempolicy,compat_sys_get_mempolicy) +SYSCALL(sys_set_mempolicy,compat_sys_set_mempolicy) SYSCALL(sys_mq_open,compat_sys_mq_open) SYSCALL(sys_mq_unlink,compat_sys_mq_unlink) SYSCALL(sys_mq_timedsend,compat_sys_mq_timedsend) @@ -295,7 +295,7 @@ SYSCALL(sys_ioprio_get,compat_sys_ioprio_get) SYSCALL(sys_inotify_init,sys_inotify_init) SYSCALL(sys_inotify_add_watch,compat_sys_inotify_add_watch) /* 285 */ SYSCALL(sys_inotify_rm_watch,compat_sys_inotify_rm_watch) -NI_SYSCALL /* 287 sys_migrate_pages */ +SYSCALL(sys_migrate_pages,compat_sys_migrate_pages) SYSCALL(sys_openat,compat_sys_openat) SYSCALL(sys_mkdirat,compat_sys_mkdirat) SYSCALL(sys_mknodat,compat_sys_mknodat) /* 290 */ @@ -318,7 +318,7 @@ SYSCALL(sys_splice,compat_sys_splice) SYSCALL(sys_sync_file_range,compat_sys_s390_sync_file_range) SYSCALL(sys_tee,compat_sys_tee) SYSCALL(sys_vmsplice,compat_sys_vmsplice) -NI_SYSCALL /* 310 sys_move_pages */ +SYSCALL(sys_move_pages,compat_sys_move_pages) SYSCALL(sys_getcpu,compat_sys_getcpu) SYSCALL(sys_epoll_pwait,compat_sys_epoll_pwait) SYSCALL(sys_utimes,compat_sys_utimes) diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 5728c5bd44a8..0f5f8b09c903 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -18,7 +18,10 @@ #include #include #include +#include +#include #include +#include #define PTF_HORIZONTAL (0UL) #define PTF_VERTICAL (1UL) @@ -260,6 +263,7 @@ static void update_cpu_masks(void) } } spin_unlock_irqrestore(&topology_lock, flags); + numa_update_cpu_topology(); } void store_topology(struct sysinfo_15_1_x *info) @@ -274,21 +278,21 @@ int arch_update_cpu_topology(void) { struct sysinfo_15_1_x *info = tl_info; struct device *dev; - int cpu; + int cpu, rc = 0; - if (!MACHINE_HAS_TOPOLOGY) { - update_cpu_masks(); - topology_update_polarization_simple(); - return 0; + if (MACHINE_HAS_TOPOLOGY) { + rc = 1; + store_topology(info); + tl_to_masks(info); } - store_topology(info); - tl_to_masks(info); update_cpu_masks(); + if (!MACHINE_HAS_TOPOLOGY) + topology_update_polarization_simple(); for_each_online_cpu(cpu) { dev = get_cpu_device(cpu); kobject_uevent(&dev->kobj, KOBJ_CHANGE); } - return 1; + return rc; } static void topology_work_fn(struct work_struct *work) @@ -450,7 +454,6 @@ static struct sched_domain_topology_level s390_topology[] = { { cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, { cpu_book_mask, SD_INIT_NAME(BOOK) }, - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, }; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index dc4db08286e9..2963b563621c 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -139,7 +139,7 @@ void __init mem_init(void) cpumask_set_cpu(0, mm_cpumask(&init_mm)); atomic_set(&init_mm.context.attach_count, 1); - max_mapnr = max_low_pfn; + set_max_mapnr(max_low_pfn); high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); /* Setup guest page hinting */ diff --git a/arch/s390/numa/Makefile b/arch/s390/numa/Makefile new file mode 100644 index 000000000000..7e94c8f491f7 --- /dev/null +++ b/arch/s390/numa/Makefile @@ -0,0 +1 @@ +obj-y += numa.o diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c new file mode 100644 index 000000000000..0416a3671e33 --- /dev/null +++ b/arch/s390/numa/numa.c @@ -0,0 +1,180 @@ +/* + * NUMA support for s390 + * + * Implement NUMA core code. + * + * Copyright IBM Corp. 2015 + */ + +#define KMSG_COMPONENT "numa" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "numa_mode.h" + +pg_data_t *node_data[MAX_NUMNODES]; +EXPORT_SYMBOL(node_data); + +cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; +EXPORT_SYMBOL(node_to_cpumask_map); + +const struct numa_mode numa_mode_plain = { + .name = "plain", +}; + +static const struct numa_mode *mode = &numa_mode_plain; + +int numa_pfn_to_nid(unsigned long pfn) +{ + return mode->__pfn_to_nid ? mode->__pfn_to_nid(pfn) : 0; +} + +void numa_update_cpu_topology(void) +{ + if (mode->update_cpu_topology) + mode->update_cpu_topology(); +} + +int __node_distance(int a, int b) +{ + return mode->distance ? mode->distance(a, b) : 0; +} + +int numa_debug_enabled; + +/* + * alloc_node_data() - Allocate node data + */ +static __init pg_data_t *alloc_node_data(void) +{ + pg_data_t *res; + + res = (pg_data_t *) memblock_alloc(sizeof(pg_data_t), 1); + if (!res) + panic("Could not allocate memory for node data!\n"); + memset(res, 0, sizeof(pg_data_t)); + return res; +} + +/* + * numa_setup_memory() - Assign bootmem to nodes + * + * The memory is first added to memblock without any respect to nodes. + * This is fixed before remaining memblock memory is handed over to the + * buddy allocator. + * An important side effect is that large bootmem allocations might easily + * cross node boundaries, which can be needed for large allocations with + * smaller memory stripes in each node (i.e. when using NUMA emulation). + * + * Memory defines nodes: + * Therefore this routine also sets the nodes online with memory. + */ +static void __init numa_setup_memory(void) +{ + unsigned long cur_base, align, end_of_dram; + int nid = 0; + + end_of_dram = memblock_end_of_DRAM(); + align = mode->align ? mode->align() : ULONG_MAX; + + /* + * Step through all available memory and assign it to the nodes + * indicated by the mode implementation. + * All nodes which are seen here will be set online. + */ + cur_base = 0; + do { + nid = numa_pfn_to_nid(PFN_DOWN(cur_base)); + node_set_online(nid); + memblock_set_node(cur_base, align, &memblock.memory, nid); + cur_base += align; + } while (cur_base < end_of_dram); + + /* Allocate and fill out node_data */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + NODE_DATA(nid) = alloc_node_data(); + + for_each_online_node(nid) { + unsigned long start_pfn, end_pfn; + unsigned long t_start, t_end; + int i; + + start_pfn = ULONG_MAX; + end_pfn = 0; + for_each_mem_pfn_range(i, nid, &t_start, &t_end, NULL) { + if (t_start < start_pfn) + start_pfn = t_start; + if (t_end > end_pfn) + end_pfn = t_end; + } + NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; + NODE_DATA(nid)->node_id = nid; + } +} + +/* + * numa_setup() - Earliest initialization + * + * Assign the mode and call the mode's setup routine. + */ +void __init numa_setup(void) +{ + pr_info("NUMA mode: %s\n", mode->name); + if (mode->setup) + mode->setup(); + numa_setup_memory(); + memblock_dump_all(); +} + + +/* + * numa_init_early() - Initialization initcall + * + * This runs when only one CPU is online and before the first + * topology update is called for by the scheduler. + */ +static int __init numa_init_early(void) +{ + /* Attach all possible CPUs to node 0 for now. */ + cpumask_copy(node_to_cpumask_map[0], cpu_possible_mask); + return 0; +} +early_initcall(numa_init_early); + +/* + * numa_init_late() - Initialization initcall + * + * Register NUMA nodes. + */ +static int __init numa_init_late(void) +{ + int nid; + + for_each_online_node(nid) + register_one_node(nid); + return 0; +} +device_initcall(numa_init_late); + +static int __init parse_debug(char *parm) +{ + numa_debug_enabled = 1; + return 0; +} +early_param("numa_debug", parse_debug); + +static int __init parse_numa(char *parm) +{ + if (strcmp(parm, numa_mode_plain.name) == 0) + mode = &numa_mode_plain; + return 0; +} +early_param("numa", parse_numa); diff --git a/arch/s390/numa/numa_mode.h b/arch/s390/numa/numa_mode.h new file mode 100644 index 000000000000..775659848011 --- /dev/null +++ b/arch/s390/numa/numa_mode.h @@ -0,0 +1,23 @@ +/* + * NUMA support for s390 + * + * Define declarations used for communication between NUMA mode + * implementations and NUMA core functionality. + * + * Copyright IBM Corp. 2015 + */ +#ifndef __S390_NUMA_MODE_H +#define __S390_NUMA_MODE_H + +struct numa_mode { + char *name; /* Name of mode */ + void (*setup)(void); /* Initizalize mode */ + void (*update_cpu_topology)(void); /* Called by topology code */ + int (*__pfn_to_nid)(unsigned long pfn); /* PFN to node ID */ + unsigned long (*align)(void); /* Minimum node alignment */ + int (*distance)(int a, int b); /* Distance between two nodes */ +}; + +extern const struct numa_mode numa_mode_plain; + +#endif /* __S390_NUMA_MODE_H */