diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0629121f2c0b..16052117131e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -106,6 +106,7 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
  */
 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 				       struct zone *zone,
 				       enum lru_list lru);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fc62c714f3b6..1520efd1c7c4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -231,6 +231,11 @@ struct mem_cgroup {
 	 * reclaimed from.
 	 */
 	int last_scanned_child;
+	int last_scanned_node;
+#if MAX_NUMNODES > 1
+	nodemask_t	scan_nodes;
+	unsigned long	next_scan_node_update;
+#endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
@@ -624,18 +629,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	preempt_enable();
 }
 
+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+	struct mem_cgroup_per_zone *mz;
+	u64 total = 0;
+	int zid;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		mz = mem_cgroup_zoneinfo(mem, nid, zid);
+		total += MEM_CGROUP_ZSTAT(mz, idx);
+	}
+	return total;
+}
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 					enum lru_list idx)
 {
-	int nid, zid;
-	struct mem_cgroup_per_zone *mz;
+	int nid;
 	u64 total = 0;
 
 	for_each_online_node(nid)
-		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-			mz = mem_cgroup_zoneinfo(mem, nid, zid);
-			total += MEM_CGROUP_ZSTAT(mz, idx);
-		}
+		total += mem_cgroup_get_zonestat_node(mem, nid, idx);
 	return total;
 }
 
@@ -1418,6 +1432,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 	return ret;
 }
 
+#if MAX_NUMNODES > 1
+
+/*
+ * Always updating the nodemask is not very good - even if we have an empty
+ * list or the wrong list here, we can start from some node and traverse all
+ * nodes based on the zonelist. So update the list loosely once per 10 secs.
+ *
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+	int nid;
+
+	if (time_after(mem->next_scan_node_update, jiffies))
+		return;
+
+	mem->next_scan_node_update = jiffies + 10*HZ;
+	/* make a nodemask where this memcg uses memory from */
+	mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+			continue;
+
+		if (total_swap_pages &&
+		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+		     mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+			continue;
+		node_clear(nid, mem->scan_nodes);
+	}
+}
+
+/*
+ * Select a node to start reclaim from. Because what we need is just to
+ * reduce the usage counter, starting from anywhere is OK. Reclaiming
+ * from the current node has both pros and cons.
+ *
+ * Freeing memory from the current node means freeing memory from a node
+ * we'll use or have used, so it may disturb that node's LRU. And if several
+ * threads hit their limits, they will contend on one node. But freeing from
+ * a remote node costs more for memory reclaim because of memory latency.
+ *
+ * For now, we use round-robin. A better algorithm is welcome.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	int node;
+
+	mem_cgroup_may_update_nodemask(mem);
+	node = mem->last_scanned_node;
+
+	node = next_node(node, mem->scan_nodes);
+	if (node == MAX_NUMNODES)
+		node = first_node(mem->scan_nodes);
+	/*
+	 * We call this when we hit the limit, not when pages are added to LRU.
+	 * The LRUs may hold no pages because all pages are UNEVICTABLE, or the
+	 * memcg is too small and no pages are on any LRU. In that case, we
+	 * use the current node.
+	 */
+	if (unlikely(node == MAX_NUMNODES))
+		node = numa_node_id();
+
+	mem->last_scanned_node = node;
+	return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	return 0;
+}
+#endif
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -4606,6 +4695,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		res_counter_init(&mem->memsw, NULL);
 	}
 	mem->last_scanned_child = 0;
+	mem->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&mem->oom_notify);
 
 	if (parent)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 884ae08c16cc..b0875871820d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2226,6 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
+	int nid;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
@@ -2242,7 +2243,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.gfp_mask = sc.gfp_mask,
 	};
 
-	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+	/*
+	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't care
+	 * which node the pages come from. So the node where we start the scan
+	 * does not need to be the current node.
+	 */
+	nid = mem_cgroup_select_victim_node(mem_cont);
+
+	zonelist = NODE_DATA(nid)->node_zonelists;
 
 	trace_mm_vmscan_memcg_reclaim_begin(0, sc.may_writepage,
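
For reference, a minimal userspace sketch of the round-robin selection that mem_cgroup_select_victim_node() implements. MAX_NODES, scan_nodes and next_node_from() below are simplified stand-ins for MAX_NUMNODES, mem->scan_nodes and the kernel's nodemask helpers, not the real API:

/*
 * Sketch of the round-robin victim-node walk, using a plain bitmask
 * instead of nodemask_t. Stand-in names; not the kernel interfaces.
 */
#include <stdio.h>

#define MAX_NODES 8

static unsigned int scan_nodes = 0x05;		/* memcg has pages on nodes 0 and 2 */
static int last_scanned_node = MAX_NODES;	/* "no node scanned yet", as in the patch */

/* first node set in 'mask' at or after 'start'; MAX_NODES if none */
static int next_node_from(int start, unsigned int mask)
{
	int nid;

	for (nid = start; nid >= 0 && nid < MAX_NODES; nid++)
		if (mask & (1u << nid))
			return nid;
	return MAX_NODES;
}

static int select_victim_node(void)
{
	/* advance past the node used last time */
	int node = next_node_from(last_scanned_node + 1, scan_nodes);

	if (node == MAX_NODES)			/* wrap around to the first set node */
		node = next_node_from(0, scan_nodes);
	if (node == MAX_NODES)			/* mask empty: fall back to the "current" node */
		node = 0;

	last_scanned_node = node;
	return node;
}

int main(void)
{
	int i;

	for (i = 0; i < 6; i++)
		printf("reclaim pass %d starts at node %d\n", i, select_victim_node());
	return 0;	/* prints nodes 0, 2, 0, 2, 0, 2 */
}

The kernel version does the same walk with next_node()/first_node() over mem->scan_nodes, which mem_cgroup_may_update_nodemask() refreshes at most once per 10 seconds, and falls back to numa_node_id() when the mask turns out to be empty.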