diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 661a2c679f64..26a38b7c7739 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -152,10 +152,12 @@ struct mem_cgroup_stat_cpu {
 };
 
 struct mem_cgroup_reclaim_iter {
-	/* css_id of the last scanned hierarchy member */
-	int position;
+	/* last scanned hierarchy member with elevated css ref count */
+	struct mem_cgroup *last_visited;
 	/* scan generation, increased every round-trip */
 	unsigned int generation;
+	/* lock to protect the position and generation */
+	spinlock_t iter_lock;
 };
 
 /*
@@ -1089,7 +1091,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
 	struct mem_cgroup *memcg = NULL;
-	int id = 0;
+	struct mem_cgroup *last_visited = NULL;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1098,7 +1100,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		root = root_mem_cgroup;
 
 	if (prev && !reclaim)
-		id = css_id(&prev->css);
+		last_visited = prev;
 
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
@@ -1106,9 +1108,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		return root;
 	}
 
+	rcu_read_lock();
 	while (!memcg) {
 		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
-		struct cgroup_subsys_state *css;
+		struct cgroup_subsys_state *css = NULL;
 
 		if (reclaim) {
 			int nid = zone_to_nid(reclaim->zone);
@@ -1117,31 +1120,74 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 
 			mz = mem_cgroup_zoneinfo(root, nid, zid);
 			iter = &mz->reclaim_iter[reclaim->priority];
-			if (prev && reclaim->generation != iter->generation)
-				goto out_css_put;
-			id = iter->position;
+			spin_lock(&iter->iter_lock);
+			last_visited = iter->last_visited;
+			if (prev && reclaim->generation != iter->generation) {
+				if (last_visited) {
+					css_put(&last_visited->css);
+					iter->last_visited = NULL;
+				}
+				spin_unlock(&iter->iter_lock);
+				goto out_unlock;
+			}
 		}
 
-		rcu_read_lock();
-		css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
-		if (css) {
-			if (css == &root->css || css_tryget(css))
-				memcg = mem_cgroup_from_css(css);
-		} else
-			id = 0;
-		rcu_read_unlock();
+		/*
+		 * Root is not visited by cgroup iterators so it needs an
+		 * explicit visit.
+		 */
+		if (!last_visited) {
+			css = &root->css;
+		} else {
+			struct cgroup *prev_cgroup, *next_cgroup;
+
+			prev_cgroup = (last_visited == root) ? NULL
+				: last_visited->css.cgroup;
+			next_cgroup = cgroup_next_descendant_pre(prev_cgroup,
+					root->css.cgroup);
+			if (next_cgroup)
+				css = cgroup_subsys_state(next_cgroup,
+						mem_cgroup_subsys_id);
+		}
+
+		/*
+		 * Even if we found a group we have to make sure it is alive.
+		 * css && !memcg means that the groups should be skipped and
+		 * we should continue the tree walk.
+		 * last_visited css is safe to use because it is protected by
+		 * css_get and the tree walk is rcu safe.
+		 */
+		if (css == &root->css || (css && css_tryget(css)))
+			memcg = mem_cgroup_from_css(css);
 
 		if (reclaim) {
-			iter->position = id;
+			struct mem_cgroup *curr = memcg;
+
+			if (last_visited)
+				css_put(&last_visited->css);
+
+			if (css && !memcg)
+				curr = mem_cgroup_from_css(css);
+
+			/* make sure that the cached memcg is not removed */
+			if (curr)
+				css_get(&curr->css);
+			iter->last_visited = curr;
+
 			if (!css)
 				iter->generation++;
 			else if (!prev && memcg)
 				reclaim->generation = iter->generation;
+			spin_unlock(&iter->iter_lock);
+		} else if (css && !memcg) {
+			last_visited = mem_cgroup_from_css(css);
 		}
 
 		if (prev && !css)
-			goto out_css_put;
+			goto out_unlock;
 	}
+out_unlock:
+	rcu_read_unlock();
 out_css_put:
 	if (prev && prev != root)
 		css_put(&prev->css);
@@ -5929,8 +5975,12 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 		return 1;
 
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+		int prio;
+
 		mz = &pn->zoneinfo[zone];
 		lruvec_init(&mz->lruvec);
+		for (prio = 0; prio < DEF_PRIORITY + 1; prio++)
+			spin_lock_init(&mz->reclaim_iter[prio].iter_lock);
 		mz->usage_in_excess = 0;
 		mz->on_tree = false;
 		mz->memcg = memcg;
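
For reference, the shared iterator state (iter->last_visited, iter->generation, iter_lock) matters because mem_cgroup_iter() is driven concurrently by reclaimers through a per-zone, per-priority cookie. The sketch below shows roughly how the shrink_zone() path in mm/vmscan.c of this era consumes the API; it is a simplified illustration, not part of this patch, and the surrounding helpers (shrink_lruvec(), global_reclaim(), mem_cgroup_zone_lruvec()) are abbreviated from that code.

/* Hedged sketch of the caller side, assuming the 3.8-era vmscan helpers. */
static void shrink_zone_sketch(struct zone *zone, struct scan_control *sc)
{
	struct mem_cgroup *root = sc->target_mem_cgroup;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.zone = zone,
		.priority = sc->priority,
	};
	struct mem_cgroup *memcg;

	/* First call: prev == NULL, so the cached iter->last_visited is used. */
	memcg = mem_cgroup_iter(root, NULL, &reclaim);
	do {
		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);

		shrink_lruvec(lruvec, sc);

		/* Limit reclaim historically stops after a single group. */
		if (!global_reclaim(sc)) {
			mem_cgroup_iter_break(root, memcg);
			break;
		}
		/* Passing memcg back as prev drops its css reference. */
		memcg = mem_cgroup_iter(root, memcg, &reclaim);
	} while (memcg);
}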