From 79bd9814e5ec9a288d6599f53aeac0b548fdfe52 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:42 -0500 Subject: [PATCH 1/7] cgroup, memcg: move cgroup_event implementation to memcg cgroup_event is way over-designed and tries to build a generic flexible event mechanism into cgroup - fully customizable event specification for each user of the interface. This is utterly unnecessary and overboard especially in the light of the planned unified hierarchy as there's gonna be single agent. Simply generating events at fixed points, or if that's too restrictive, configureable cadence or single set of configureable points should be enough. Thankfully, memcg is the only user and gets to keep it. Replacing it with something simpler on sane_behavior is strongly recommended. This patch moves cgroup_event and "cgroup.event_control" implementation to mm/memcontrol.c. Clearing of events on cgroup destruction is moved from cgroup_destroy_locked() to mem_cgroup_css_offline(), which shouldn't make any noticeable difference. cgroup_css() and __file_cft() are exported to enable the move; however, this will soon be reverted once the event code is updated to be memcg specific. Note that "cgroup.event_control" will now exist only on the hierarchy with memcg attached to it. While this change is visible to userland, it is unlikely to be noticeable as the file has never been meaningful outside memcg. Aside from the above change, this is pure code relocation. v2: Per Li Zefan's comments, init/Kconfig updated accordingly and poll.h inclusion moved from cgroup.c to memcontrol.c. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh --- include/linux/cgroup.h | 5 + init/Kconfig | 3 +- kernel/cgroup.c | 253 +---------------------------------------- mm/memcontrol.c | 248 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 257 insertions(+), 252 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3561d305b1e0..40c2427806c9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -907,6 +907,11 @@ unsigned short css_id(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); +/* XXX: temporary */ +struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + struct cgroup_subsys *ss); +struct cftype *__file_cft(struct file *file); + #else /* !CONFIG_CGROUPS */ static inline int cgroup_init_early(void) { return 0; } diff --git a/init/Kconfig b/init/Kconfig index 3ecd8a1178f1..3ca5b8110b0c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -861,7 +861,6 @@ config NUMA_BALANCING menuconfig CGROUPS boolean "Control Group support" - depends on EVENTFD help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory @@ -928,6 +927,7 @@ config MEMCG bool "Memory Resource Controller for Control Groups" depends on RESOURCE_COUNTERS select MM_OWNER + select EVENTFD help Provides a memory resource controller that manages both anonymous memory and page cache. (See Documentation/cgroups/memory.txt) @@ -1167,7 +1167,6 @@ config UIDGID_STRICT_TYPE_CHECKS config SCHED_AUTOGROUP bool "Automatic process group scheduling" - select EVENTFD select CGROUPS select CGROUP_SCHED select FAIR_GROUP_SCHED diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8bd9cfdc70d7..4bccaa7dda35 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -56,11 +56,8 @@ #include #include #include /* TODO: replace with more sophisticated array */ -#include -#include #include /* used in cgroup_attach_task */ #include -#include #include @@ -156,36 +153,6 @@ struct css_id { unsigned short stack[0]; /* Array of Length (depth+1) */ }; -/* - * cgroup_event represents events which userspace want to receive. - */ -struct cgroup_event { - /* - * css which the event belongs to. - */ - struct cgroup_subsys_state *css; - /* - * Control file which the event associated. - */ - struct cftype *cft; - /* - * eventfd to signal userspace about the event. - */ - struct eventfd_ctx *eventfd; - /* - * Each of these stored in a list by the cgroup. - */ - struct list_head list; - /* - * All fields below needed to unregister event when - * userspace closes eventfd. - */ - poll_table pt; - wait_queue_head_t *wqh; - wait_queue_t wait; - struct work_struct remove; -}; - /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@ -235,8 +202,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], * keep accessing it outside the said locks. This function may return * %NULL if @cgrp doesn't have @subsys_id enabled. */ -static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) +struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) { if (ss) return rcu_dereference_check(cgrp->subsys[ss->subsys_id], @@ -2663,7 +2630,7 @@ static const struct inode_operations cgroup_dir_inode_operations = { /* * Check if a file is a control file */ -static inline struct cftype *__file_cft(struct file *file) +struct cftype *__file_cft(struct file *file) { if (file_inode(file)->i_fop != &cgroup_file_operations) return ERR_PTR(-EINVAL); @@ -3949,202 +3916,6 @@ static void cgroup_dput(struct cgroup *cgrp) deactivate_super(sb); } -/* - * Unregister event and free resources. - * - * Gets called from workqueue. - */ -static void cgroup_event_remove(struct work_struct *work) -{ - struct cgroup_event *event = container_of(work, struct cgroup_event, - remove); - struct cgroup_subsys_state *css = event->css; - - remove_wait_queue(event->wqh, &event->wait); - - event->cft->unregister_event(css, event->cft, event->eventfd); - - /* Notify userspace the event is going away. */ - eventfd_signal(event->eventfd, 1); - - eventfd_ctx_put(event->eventfd); - kfree(event); - css_put(css); -} - -/* - * Gets called on POLLHUP on eventfd when user closes it. - * - * Called with wqh->lock held and interrupts disabled. - */ -static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, - int sync, void *key) -{ - struct cgroup_event *event = container_of(wait, - struct cgroup_event, wait); - struct cgroup *cgrp = event->css->cgroup; - unsigned long flags = (unsigned long)key; - - if (flags & POLLHUP) { - /* - * If the event has been detached at cgroup removal, we - * can simply return knowing the other side will cleanup - * for us. - * - * We can't race against event freeing since the other - * side will require wqh->lock via remove_wait_queue(), - * which we hold. - */ - spin_lock(&cgrp->event_list_lock); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - /* - * We are in atomic context, but cgroup_event_remove() - * may sleep, so we have to call it in workqueue. - */ - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - } - - return 0; -} - -static void cgroup_event_ptable_queue_proc(struct file *file, - wait_queue_head_t *wqh, poll_table *pt) -{ - struct cgroup_event *event = container_of(pt, - struct cgroup_event, pt); - - event->wqh = wqh; - add_wait_queue(wqh, &event->wait); -} - -/* - * Parse input and register new cgroup event handler. - * - * Input must be in format ' '. - * Interpretation of args is defined by control file implementation. - */ -static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, - struct cftype *cft, const char *buffer) -{ - struct cgroup *cgrp = dummy_css->cgroup; - struct cgroup_event *event; - struct cgroup_subsys_state *cfile_css; - unsigned int efd, cfd; - struct fd efile; - struct fd cfile; - char *endp; - int ret; - - efd = simple_strtoul(buffer, &endp, 10); - if (*endp != ' ') - return -EINVAL; - buffer = endp + 1; - - cfd = simple_strtoul(buffer, &endp, 10); - if ((*endp != ' ') && (*endp != '\0')) - return -EINVAL; - buffer = endp + 1; - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return -ENOMEM; - - INIT_LIST_HEAD(&event->list); - init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); - init_waitqueue_func_entry(&event->wait, cgroup_event_wake); - INIT_WORK(&event->remove, cgroup_event_remove); - - efile = fdget(efd); - if (!efile.file) { - ret = -EBADF; - goto out_kfree; - } - - event->eventfd = eventfd_ctx_fileget(efile.file); - if (IS_ERR(event->eventfd)) { - ret = PTR_ERR(event->eventfd); - goto out_put_efile; - } - - cfile = fdget(cfd); - if (!cfile.file) { - ret = -EBADF; - goto out_put_eventfd; - } - - /* the process need read permission on control file */ - /* AV: shouldn't we check that it's been opened for read instead? */ - ret = inode_permission(file_inode(cfile.file), MAY_READ); - if (ret < 0) - goto out_put_cfile; - - event->cft = __file_cft(cfile.file); - if (IS_ERR(event->cft)) { - ret = PTR_ERR(event->cft); - goto out_put_cfile; - } - - if (!event->cft->ss) { - ret = -EBADF; - goto out_put_cfile; - } - - /* - * Determine the css of @cfile, verify it belongs to the same - * cgroup as cgroup.event_control, and associate @event with it. - * Remaining events are automatically removed on cgroup destruction - * but the removal is asynchronous, so take an extra ref. - */ - rcu_read_lock(); - - ret = -EINVAL; - event->css = cgroup_css(cgrp, event->cft->ss); - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); - if (event->css && event->css == cfile_css && css_tryget(event->css)) - ret = 0; - - rcu_read_unlock(); - if (ret) - goto out_put_cfile; - - if (!event->cft->register_event || !event->cft->unregister_event) { - ret = -EINVAL; - goto out_put_css; - } - - ret = event->cft->register_event(event->css, event->cft, - event->eventfd, buffer); - if (ret) - goto out_put_css; - - efile.file->f_op->poll(efile.file, &event->pt); - - spin_lock(&cgrp->event_list_lock); - list_add(&event->list, &cgrp->event_list); - spin_unlock(&cgrp->event_list_lock); - - fdput(cfile); - fdput(efile); - - return 0; - -out_put_css: - css_put(event->css); -out_put_cfile: - fdput(cfile); -out_put_eventfd: - eventfd_ctx_put(event->eventfd); -out_put_efile: - fdput(efile); -out_kfree: - kfree(event); - - return ret; -} - static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -4169,11 +3940,6 @@ static struct cftype cgroup_base_files[] = { .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, - { - .name = "cgroup.event_control", - .write_string = cgroup_write_event_control, - .mode = S_IWUGO, - }, { .name = "cgroup.clone_children", .flags = CFTYPE_INSANE, @@ -4666,7 +4432,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct dentry *d = cgrp->dentry; - struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; struct cgroup *child; bool empty; @@ -4741,18 +4506,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) dget(d); cgroup_d_remove_dir(d); - /* - * Unregister events and notify userspace. - * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace. - */ - spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { - list_del_init(&event->list); - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - return 0; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 13b9d0f221b8..02dae3292668 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,7 @@ #include #include #include +#include #include "internal.h" #include #include @@ -226,6 +228,36 @@ struct mem_cgroup_eventfd_list { struct eventfd_ctx *eventfd; }; +/* + * cgroup_event represents events which userspace want to receive. + */ +struct cgroup_event { + /* + * css which the event belongs to. + */ + struct cgroup_subsys_state *css; + /* + * Control file which the event associated. + */ + struct cftype *cft; + /* + * eventfd to signal userspace about the event. + */ + struct eventfd_ctx *eventfd; + /* + * Each of these stored in a list by the cgroup. + */ + struct list_head list; + /* + * All fields below needed to unregister event when + * userspace closes eventfd. + */ + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct remove; +}; + static void mem_cgroup_threshold(struct mem_cgroup *memcg); static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); @@ -5947,6 +5979,202 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) } #endif +/* + * Unregister event and free resources. + * + * Gets called from workqueue. + */ +static void cgroup_event_remove(struct work_struct *work) +{ + struct cgroup_event *event = container_of(work, struct cgroup_event, + remove); + struct cgroup_subsys_state *css = event->css; + + remove_wait_queue(event->wqh, &event->wait); + + event->cft->unregister_event(css, event->cft, event->eventfd); + + /* Notify userspace the event is going away. */ + eventfd_signal(event->eventfd, 1); + + eventfd_ctx_put(event->eventfd); + kfree(event); + css_put(css); +} + +/* + * Gets called on POLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. + */ +static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct cgroup_event *event = container_of(wait, + struct cgroup_event, wait); + struct cgroup *cgrp = event->css->cgroup; + unsigned long flags = (unsigned long)key; + + if (flags & POLLHUP) { + /* + * If the event has been detached at cgroup removal, we + * can simply return knowing the other side will cleanup + * for us. + * + * We can't race against event freeing since the other + * side will require wqh->lock via remove_wait_queue(), + * which we hold. + */ + spin_lock(&cgrp->event_list_lock); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + /* + * We are in atomic context, but cgroup_event_remove() + * may sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); + } + + return 0; +} + +static void cgroup_event_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) +{ + struct cgroup_event *event = container_of(pt, + struct cgroup_event, pt); + + event->wqh = wqh; + add_wait_queue(wqh, &event->wait); +} + +/* + * Parse input and register new cgroup event handler. + * + * Input must be in format ' '. + * Interpretation of args is defined by control file implementation. + */ +static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, + struct cftype *cft, const char *buffer) +{ + struct cgroup *cgrp = dummy_css->cgroup; + struct cgroup_event *event; + struct cgroup_subsys_state *cfile_css; + unsigned int efd, cfd; + struct fd efile; + struct fd cfile; + char *endp; + int ret; + + efd = simple_strtoul(buffer, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buffer = endp + 1; + + cfd = simple_strtoul(buffer, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buffer = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, cgroup_event_wake); + INIT_WORK(&event->remove, cgroup_event_remove); + + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; + goto out_kfree; + } + + event->eventfd = eventfd_ctx_fileget(efile.file); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto out_put_efile; + } + + cfile = fdget(cfd); + if (!cfile.file) { + ret = -EBADF; + goto out_put_eventfd; + } + + /* the process need read permission on control file */ + /* AV: shouldn't we check that it's been opened for read instead? */ + ret = inode_permission(file_inode(cfile.file), MAY_READ); + if (ret < 0) + goto out_put_cfile; + + event->cft = __file_cft(cfile.file); + if (IS_ERR(event->cft)) { + ret = PTR_ERR(event->cft); + goto out_put_cfile; + } + + if (!event->cft->ss) { + ret = -EBADF; + goto out_put_cfile; + } + + /* + * Determine the css of @cfile, verify it belongs to the same + * cgroup as cgroup.event_control, and associate @event with it. + * Remaining events are automatically removed on cgroup destruction + * but the removal is asynchronous, so take an extra ref. + */ + rcu_read_lock(); + + ret = -EINVAL; + event->css = cgroup_css(cgrp, event->cft->ss); + cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); + if (event->css && event->css == cfile_css && css_tryget(event->css)) + ret = 0; + + rcu_read_unlock(); + if (ret) + goto out_put_cfile; + + if (!event->cft->register_event || !event->cft->unregister_event) { + ret = -EINVAL; + goto out_put_css; + } + + ret = event->cft->register_event(event->css, event->cft, + event->eventfd, buffer); + if (ret) + goto out_put_css; + + efile.file->f_op->poll(efile.file, &event->pt); + + spin_lock(&cgrp->event_list_lock); + list_add(&event->list, &cgrp->event_list); + spin_unlock(&cgrp->event_list_lock); + + fdput(cfile); + fdput(efile); + + return 0; + +out_put_css: + css_put(event->css); +out_put_cfile: + fdput(cfile); +out_put_eventfd: + eventfd_ctx_put(event->eventfd); +out_put_efile: + fdput(efile); +out_kfree: + kfree(event); + + return ret; +} + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -5993,6 +6221,12 @@ static struct cftype mem_cgroup_files[] = { .write_u64 = mem_cgroup_hierarchy_write, .read_u64 = mem_cgroup_hierarchy_read, }, + { + .name = "cgroup.event_control", + .write_string = cgroup_write_event_control, + .flags = CFTYPE_NO_PREFIX, + .mode = S_IWUGO, + }, { .name = "swappiness", .read_u64 = mem_cgroup_swappiness_read, @@ -6326,6 +6560,20 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct cgroup *cgrp = css->cgroup; + struct cgroup_event *event, *tmp; + + /* + * Unregister events and notify userspace. + * Notify userspace about cgroup removing only after rmdir of cgroup + * directory to avoid race between userspace and kernelspace. + */ + spin_lock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + list_del_init(&event->list); + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); kmem_cgroup_css_offline(memcg); From b5557c4c3b1a38074d7001b87c2482eda3a0834a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:42 -0500 Subject: [PATCH 2/7] memcg: cgroup_write_event_control() now knows @css is for memcg @css for cgroup_write_event_control() is now always for memcg and the target file should be a memcg file too. Drop code which assumes @css is dummy_css and the target file may belong to different subsystems. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov --- mm/memcontrol.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 02dae3292668..d00368110b08 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6056,10 +6056,10 @@ static void cgroup_event_ptable_queue_proc(struct file *file, * Input must be in format ' '. * Interpretation of args is defined by control file implementation. */ -static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, +static int cgroup_write_event_control(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct cgroup *cgrp = dummy_css->cgroup; + struct cgroup *cgrp = css->cgroup; struct cgroup_event *event; struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; @@ -6082,6 +6082,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, if (!event) return -ENOMEM; + event->css = css; INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); @@ -6117,23 +6118,17 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, goto out_put_cfile; } - if (!event->cft->ss) { - ret = -EBADF; - goto out_put_cfile; - } - /* - * Determine the css of @cfile, verify it belongs to the same - * cgroup as cgroup.event_control, and associate @event with it. - * Remaining events are automatically removed on cgroup destruction - * but the removal is asynchronous, so take an extra ref. + * Verify @cfile should belong to @css. Also, remaining events are + * automatically removed on cgroup destruction but the removal is + * asynchronous, so take an extra ref on @css. */ rcu_read_lock(); ret = -EINVAL; - event->css = cgroup_css(cgrp, event->cft->ss); - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); - if (event->css && event->css == cfile_css && css_tryget(event->css)) + cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, + &mem_cgroup_subsys); + if (cfile_css == css && css_tryget(css)) ret = 0; rcu_read_unlock(); @@ -6145,7 +6140,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, goto out_put_css; } - ret = event->cft->register_event(event->css, event->cft, + ret = event->cft->register_event(css, event->cft, event->eventfd, buffer); if (ret) goto out_put_css; @@ -6162,7 +6157,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, return 0; out_put_css: - css_put(event->css); + css_put(css); out_put_cfile: fdput(cfile); out_put_eventfd: From fba94807837850e211f8975e1970e23e7804ff4d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:43 -0500 Subject: [PATCH 3/7] cgroup, memcg: move cgroup->event_list[_lock] and event callbacks into memcg cgroup_event is being moved from cgroup core to memcg and the implementation is already moved by the previous patch. This patch moves the data fields and callbacks. * cgroup->event_list[_lock] are moved to mem_cgroup. * cftype->[un]register_event() are moved to cgroup_event. This makes it impossible for individual cftype definitions to specify their event callbacks. This is worked around by simply hard-coding filename to event callback mapping in cgroup_write_event_control(). This is awkward and inflexible, which is actually desirable given that we don't want to grow more usages of this feature. * eventfd_ctx declaration is removed from cgroup.h, which makes vmpressure.h miss eventfd_ctx declaration. Include eventfd.h from vmpressure.h. v2: Use file name from dentry instead of cftype. This will allow removing all cftype handling in the function. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh --- include/linux/cgroup.h | 24 ----------- include/linux/vmpressure.h | 1 + kernel/cgroup.c | 2 - mm/memcontrol.c | 87 ++++++++++++++++++++++++++------------ 4 files changed, 61 insertions(+), 53 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 40c2427806c9..612adc5b87c5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -29,7 +29,6 @@ struct cgroup_subsys; struct inode; struct cgroup; struct css_id; -struct eventfd_ctx; extern int cgroup_init_early(void); extern int cgroup_init(void); @@ -239,10 +238,6 @@ struct cgroup { struct rcu_head rcu_head; struct work_struct destroy_work; - /* List of events which userspace want to receive */ - struct list_head event_list; - spinlock_t event_list_lock; - /* directory xattrs */ struct simple_xattrs xattrs; }; @@ -506,25 +501,6 @@ struct cftype { int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); int (*release)(struct inode *inode, struct file *file); - - /* - * register_event() callback will be used to add new userspace - * waiter for changes related to the cftype. Implement it if - * you want to provide this functionality. Use eventfd_signal() - * on eventfd to send notification to userspace. - */ - int (*register_event)(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, - const char *args); - /* - * unregister_event() callback will be called when userspace - * closes the eventfd or on cgroup removing. - * This callback must be implemented, if you want provide - * notification functionality. - */ - void (*unregister_event)(struct cgroup_subsys_state *css, - struct cftype *cft, - struct eventfd_ctx *eventfd); }; /* diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 3f3788d49362..9dd1914f1a6c 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -7,6 +7,7 @@ #include #include #include +#include struct vmpressure { unsigned long scanned; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4bccaa7dda35..feda7c54fa6b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1352,8 +1352,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; - INIT_LIST_HEAD(&cgrp->event_list); - spin_lock_init(&cgrp->event_list_lock); simple_xattrs_init(&cgrp->xattrs); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d00368110b08..2fcacb18404b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -248,6 +248,22 @@ struct cgroup_event { * Each of these stored in a list by the cgroup. */ struct list_head list; + /* + * register_event() callback will be used to add new userspace + * waiter for changes related to this event. Use eventfd_signal() + * on eventfd to send notification to userspace. + */ + int (*register_event)(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd, + const char *args); + /* + * unregister_event() callback will be called when userspace closes + * the eventfd or on cgroup removing. This callback must be set, + * if you want provide notification functionality. + */ + void (*unregister_event)(struct cgroup_subsys_state *css, + struct cftype *cft, + struct eventfd_ctx *eventfd); /* * All fields below needed to unregister event when * userspace closes eventfd. @@ -362,6 +378,10 @@ struct mem_cgroup { atomic_t numainfo_updating; #endif + /* List of events which userspace want to receive */ + struct list_head event_list; + spinlock_t event_list_lock; + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; @@ -5992,7 +6012,7 @@ static void cgroup_event_remove(struct work_struct *work) remove_wait_queue(event->wqh, &event->wait); - event->cft->unregister_event(css, event->cft, event->eventfd); + event->unregister_event(css, event->cft, event->eventfd); /* Notify userspace the event is going away. */ eventfd_signal(event->eventfd, 1); @@ -6012,7 +6032,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, { struct cgroup_event *event = container_of(wait, struct cgroup_event, wait); - struct cgroup *cgrp = event->css->cgroup; + struct mem_cgroup *memcg = mem_cgroup_from_css(event->css); unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { @@ -6025,7 +6045,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, * side will require wqh->lock via remove_wait_queue(), * which we hold. */ - spin_lock(&cgrp->event_list_lock); + spin_lock(&memcg->event_list_lock); if (!list_empty(&event->list)) { list_del_init(&event->list); /* @@ -6034,7 +6054,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, */ schedule_work(&event->remove); } - spin_unlock(&cgrp->event_list_lock); + spin_unlock(&memcg->event_list_lock); } return 0; @@ -6059,12 +6079,13 @@ static void cgroup_event_ptable_queue_proc(struct file *file, static int cgroup_write_event_control(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct cgroup *cgrp = css->cgroup; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct cgroup_event *event; struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; struct fd efile; struct fd cfile; + const char *name; char *endp; int ret; @@ -6118,6 +6139,31 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } + /* + * Determine the event callbacks and set them in @event. This used + * to be done via struct cftype but cgroup core no longer knows + * about these events. The following is crude but the whole thing + * is for compatibility anyway. + */ + name = cfile.file->f_dentry->d_name.name; + + if (!strcmp(name, "memory.usage_in_bytes")) { + event->register_event = mem_cgroup_usage_register_event; + event->unregister_event = mem_cgroup_usage_unregister_event; + } else if (!strcmp(name, "memory.oom_control")) { + event->register_event = mem_cgroup_oom_register_event; + event->unregister_event = mem_cgroup_oom_unregister_event; + } else if (!strcmp(name, "memory.pressure_level")) { + event->register_event = vmpressure_register_event; + event->unregister_event = vmpressure_unregister_event; + } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { + event->register_event = mem_cgroup_usage_register_event; + event->unregister_event = mem_cgroup_usage_unregister_event; + } else { + ret = -EINVAL; + goto out_put_cfile; + } + /* * Verify @cfile should belong to @css. Also, remaining events are * automatically removed on cgroup destruction but the removal is @@ -6135,21 +6181,15 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, if (ret) goto out_put_cfile; - if (!event->cft->register_event || !event->cft->unregister_event) { - ret = -EINVAL; - goto out_put_css; - } - - ret = event->cft->register_event(css, event->cft, - event->eventfd, buffer); + ret = event->register_event(css, event->cft, event->eventfd, buffer); if (ret) goto out_put_css; efile.file->f_op->poll(efile.file, &event->pt); - spin_lock(&cgrp->event_list_lock); - list_add(&event->list, &cgrp->event_list); - spin_unlock(&cgrp->event_list_lock); + spin_lock(&memcg->event_list_lock); + list_add(&event->list, &memcg->event_list); + spin_unlock(&memcg->event_list_lock); fdput(cfile); fdput(efile); @@ -6175,8 +6215,6 @@ static struct cftype mem_cgroup_files[] = { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), .read = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "max_usage_in_bytes", @@ -6236,14 +6274,10 @@ static struct cftype mem_cgroup_files[] = { .name = "oom_control", .read_map = mem_cgroup_oom_control_read, .write_u64 = mem_cgroup_oom_control_write, - .register_event = mem_cgroup_oom_register_event, - .unregister_event = mem_cgroup_oom_unregister_event, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, { .name = "pressure_level", - .register_event = vmpressure_register_event, - .unregister_event = vmpressure_unregister_event, }, #ifdef CONFIG_NUMA { @@ -6291,8 +6325,6 @@ static struct cftype memsw_cgroup_files[] = { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), .read = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "memsw.max_usage_in_bytes", @@ -6483,6 +6515,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); vmpressure_init(&memcg->vmpressure); + INIT_LIST_HEAD(&memcg->event_list); + spin_lock_init(&memcg->event_list_lock); return &memcg->css; @@ -6555,7 +6589,6 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct cgroup *cgrp = css->cgroup; struct cgroup_event *event, *tmp; /* @@ -6563,12 +6596,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) * Notify userspace about cgroup removing only after rmdir of cgroup * directory to avoid race between userspace and kernelspace. */ - spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + spin_lock(&memcg->event_list_lock); + list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { list_del_init(&event->list); schedule_work(&event->remove); } - spin_unlock(&cgrp->event_list_lock); + spin_unlock(&memcg->event_list_lock); kmem_cgroup_css_offline(memcg); From 347c4a8747104a945ecced358944e42879176ca5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:43 -0500 Subject: [PATCH 4/7] memcg: remove cgroup_event->cft The only use of cgroup_event->cft is distinguishing "usage_in_bytes" and "memsw.usgae_in_bytes" for mem_cgroup_usage_[un]register_event(), which can be done by adding an explicit argument to the function and implementing two wrappers so that the two cases can be distinguished from the function alone. Remove cgroup_event->cft and the related code including [un]register_events() methods. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko --- include/linux/vmpressure.h | 2 -- mm/memcontrol.c | 65 ++++++++++++++++++++------------------ mm/vmpressure.c | 14 ++------ 3 files changed, 38 insertions(+), 43 deletions(-) diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 9dd1914f1a6c..b048365a7ed9 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -36,11 +36,9 @@ extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); extern int vmpressure_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args); extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd); #else static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2fcacb18404b..3c93dcfd26da 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -236,10 +236,6 @@ struct cgroup_event { * css which the event belongs to. */ struct cgroup_subsys_state *css; - /* - * Control file which the event associated. - */ - struct cftype *cft; /* * eventfd to signal userspace about the event. */ @@ -254,15 +250,13 @@ struct cgroup_event { * on eventfd to send notification to userspace. */ int (*register_event)(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, - const char *args); + struct eventfd_ctx *eventfd, const char *args); /* * unregister_event() callback will be called when userspace closes * the eventfd or on cgroup removing. This callback must be set, * if you want provide notification functionality. */ void (*unregister_event)(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd); /* * All fields below needed to unregister event when @@ -5688,13 +5682,12 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) mem_cgroup_oom_notify_cb(iter); } -static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +static int __mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd, const char *args, enum res_type type) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - enum res_type type = MEMFILE_TYPE(cft->private); u64 threshold, usage; int i, size, ret; @@ -5771,13 +5764,24 @@ unlock: return ret; } -static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd) +static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(css, eventfd, args, _MEM); +} + +static int memsw_cgroup_usage_register_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(css, eventfd, args, _MEMSWAP); +} + +static void __mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd, enum res_type type) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - enum res_type type = MEMFILE_TYPE(cft->private); u64 usage; int i, j, size; @@ -5850,14 +5854,24 @@ unlock: mutex_unlock(&memcg->thresholds_lock); } +static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(css, eventfd, _MEM); +} + +static void memsw_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(css, eventfd, _MEMSWAP); +} + static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) + struct eventfd_ctx *eventfd, const char *args) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *event; - enum res_type type = MEMFILE_TYPE(cft->private); - BUG_ON(type != _OOM_TYPE); event = kmalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; @@ -5876,13 +5890,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, } static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd) + struct eventfd_ctx *eventfd) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *ev, *tmp; - enum res_type type = MEMFILE_TYPE(cft->private); - - BUG_ON(type != _OOM_TYPE); spin_lock(&memcg_oom_lock); @@ -6012,7 +6023,7 @@ static void cgroup_event_remove(struct work_struct *work) remove_wait_queue(event->wqh, &event->wait); - event->unregister_event(css, event->cft, event->eventfd); + event->unregister_event(css, event->eventfd); /* Notify userspace the event is going away. */ eventfd_signal(event->eventfd, 1); @@ -6133,12 +6144,6 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, if (ret < 0) goto out_put_cfile; - event->cft = __file_cft(cfile.file); - if (IS_ERR(event->cft)) { - ret = PTR_ERR(event->cft); - goto out_put_cfile; - } - /* * Determine the event callbacks and set them in @event. This used * to be done via struct cftype but cgroup core no longer knows @@ -6157,8 +6162,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, event->register_event = vmpressure_register_event; event->unregister_event = vmpressure_unregister_event; } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { - event->register_event = mem_cgroup_usage_register_event; - event->unregister_event = mem_cgroup_usage_unregister_event; + event->register_event = memsw_cgroup_usage_register_event; + event->unregister_event = memsw_cgroup_usage_unregister_event; } else { ret = -EINVAL; goto out_put_cfile; @@ -6181,7 +6186,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, if (ret) goto out_put_cfile; - ret = event->register_event(css, event->cft, event->eventfd, buffer); + ret = event->register_event(css, event->eventfd, buffer); if (ret) goto out_put_css; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index e0f62837c3f4..0f25a996d150 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -279,7 +279,6 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd * @css: css that is interested in vmpressure notifications - * @cft: cgroup control files handle * @eventfd: eventfd context to link notifications with * @args: event arguments (used to set up a pressure level threshold) * @@ -289,13 +288,10 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or * "critical"). * - * This function should not be used directly, just pass it to (struct - * cftype).register_event, and then cgroup core will handle everything by - * itself. + * To be used as memcg event method. */ int vmpressure_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, - const char *args) + struct eventfd_ctx *eventfd, const char *args) { struct vmpressure *vmpr = css_to_vmpressure(css); struct vmpressure_event *ev; @@ -326,19 +322,15 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, /** * vmpressure_unregister_event() - Unbind eventfd from vmpressure * @css: css handle - * @cft: cgroup control files handle * @eventfd: eventfd context that was used to link vmpressure with the @cg * * This function does internal manipulations to detach the @eventfd from * the vmpressure notifications, and then frees internal resources * associated with the @eventfd (but the @eventfd itself is not freed). * - * This function should not be used directly, just pass it to (struct - * cftype).unregister_event, and then cgroup core will handle everything - * by itself. + * To be used as memcg event method. */ void vmpressure_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd) { struct vmpressure *vmpr = css_to_vmpressure(css); From 59b6f87344ab5eb3057e5844b8cd8a39e668f477 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:43 -0500 Subject: [PATCH 5/7] memcg: make cgroup_event deal with mem_cgroup instead of cgroup_subsys_state cgroup_event is now memcg specific. Replace cgroup_event->css with ->memcg and convert [un]register_event() callbacks to take mem_cgroup pointer instead of cgroup_subsys_state one. This simplifies the code slightly and makes css_to_vmpressure() unnecessary which is removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko --- include/linux/vmpressure.h | 5 ++-- mm/memcontrol.c | 53 ++++++++++++++++---------------------- mm/vmpressure.c | 12 ++++----- 3 files changed, 30 insertions(+), 40 deletions(-) diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index b048365a7ed9..3e4535876d37 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -34,11 +34,10 @@ extern void vmpressure_init(struct vmpressure *vmpr); extern void vmpressure_cleanup(struct vmpressure *vmpr); extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); -extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); -extern int vmpressure_register_event(struct cgroup_subsys_state *css, +extern int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args); -extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, +extern void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd); #else static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3c93dcfd26da..42f2843af1a7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -233,9 +233,9 @@ struct mem_cgroup_eventfd_list { */ struct cgroup_event { /* - * css which the event belongs to. + * memcg which the event belongs to. */ - struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; /* * eventfd to signal userspace about the event. */ @@ -249,14 +249,14 @@ struct cgroup_event { * waiter for changes related to this event. Use eventfd_signal() * on eventfd to send notification to userspace. */ - int (*register_event)(struct cgroup_subsys_state *css, + int (*register_event)(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args); /* * unregister_event() callback will be called when userspace closes * the eventfd or on cgroup removing. This callback must be set, * if you want provide notification functionality. */ - void (*unregister_event)(struct cgroup_subsys_state *css, + void (*unregister_event)(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd); /* * All fields below needed to unregister event when @@ -535,11 +535,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; } -struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) -{ - return &mem_cgroup_from_css(css)->vmpressure; -} - static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); @@ -5682,10 +5677,9 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) mem_cgroup_oom_notify_cb(iter); } -static int __mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args, enum res_type type) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; u64 threshold, usage; @@ -5764,22 +5758,21 @@ unlock: return ret; } -static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) { - return __mem_cgroup_usage_register_event(css, eventfd, args, _MEM); + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); } -static int memsw_cgroup_usage_register_event(struct cgroup_subsys_state *css, +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) { - return __mem_cgroup_usage_register_event(css, eventfd, args, _MEMSWAP); + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); } -static void __mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, enum res_type type) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; u64 usage; @@ -5854,22 +5847,21 @@ unlock: mutex_unlock(&memcg->thresholds_lock); } -static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { - return __mem_cgroup_usage_unregister_event(css, eventfd, _MEM); + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); } -static void memsw_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { - return __mem_cgroup_usage_unregister_event(css, eventfd, _MEMSWAP); + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); } -static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *event; event = kmalloc(sizeof(*event), GFP_KERNEL); @@ -5889,10 +5881,9 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, return 0; } -static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *ev, *tmp; spin_lock(&memcg_oom_lock); @@ -6019,18 +6010,18 @@ static void cgroup_event_remove(struct work_struct *work) { struct cgroup_event *event = container_of(work, struct cgroup_event, remove); - struct cgroup_subsys_state *css = event->css; + struct mem_cgroup *memcg = event->memcg; remove_wait_queue(event->wqh, &event->wait); - event->unregister_event(css, event->eventfd); + event->unregister_event(memcg, event->eventfd); /* Notify userspace the event is going away. */ eventfd_signal(event->eventfd, 1); eventfd_ctx_put(event->eventfd); kfree(event); - css_put(css); + css_put(&memcg->css); } /* @@ -6043,7 +6034,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, { struct cgroup_event *event = container_of(wait, struct cgroup_event, wait); - struct mem_cgroup *memcg = mem_cgroup_from_css(event->css); + struct mem_cgroup *memcg = event->memcg; unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { @@ -6114,7 +6105,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, if (!event) return -ENOMEM; - event->css = css; + event->memcg = memcg; INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); @@ -6186,7 +6177,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, if (ret) goto out_put_cfile; - ret = event->register_event(css, event->eventfd, buffer); + ret = event->register_event(memcg, event->eventfd, buffer); if (ret) goto out_put_css; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 0f25a996d150..196970a4541f 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -278,7 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd - * @css: css that is interested in vmpressure notifications + * @memcg: memcg that is interested in vmpressure notifications * @eventfd: eventfd context to link notifications with * @args: event arguments (used to set up a pressure level threshold) * @@ -290,10 +290,10 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * * To be used as memcg event method. */ -int vmpressure_register_event(struct cgroup_subsys_state *css, +int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) { - struct vmpressure *vmpr = css_to_vmpressure(css); + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; int level; @@ -321,7 +321,7 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, /** * vmpressure_unregister_event() - Unbind eventfd from vmpressure - * @css: css handle + * @memcg: memcg handle * @eventfd: eventfd context that was used to link vmpressure with the @cg * * This function does internal manipulations to detach the @eventfd from @@ -330,10 +330,10 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, * * To be used as memcg event method. */ -void vmpressure_unregister_event(struct cgroup_subsys_state *css, +void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { - struct vmpressure *vmpr = css_to_vmpressure(css); + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; mutex_lock(&vmpr->events_lock); From 3bc942f372af383f49d56aab599469561a5e39ec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:44 -0500 Subject: [PATCH 6/7] memcg: rename cgroup_event to mem_cgroup_event cgroup_event is only available in memcg now. Let's brand it that way. While at it, add a comment encouraging deprecation of the feature and remove the respective section from cgroup documentation. This patch is cosmetic. v3: Typo update as per Li Zefan. v2: Index in cgroups.txt updated accordingly as suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko --- Documentation/cgroups/cgroups.txt | 20 ----------- mm/memcontrol.c | 57 ++++++++++++++++++++----------- 2 files changed, 37 insertions(+), 40 deletions(-) diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 638bf17ff869..821de56d1580 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -24,7 +24,6 @@ CONTENTS: 2.1 Basic Usage 2.2 Attaching processes 2.3 Mounting hierarchies by name - 2.4 Notification API 3. Kernel API 3.1 Overview 3.2 Synchronization @@ -472,25 +471,6 @@ you give a subsystem a name. The name of the subsystem appears as part of the hierarchy description in /proc/mounts and /proc//cgroups. -2.4 Notification API --------------------- - -There is mechanism which allows to get notifications about changing -status of a cgroup. - -To register a new notification handler you need to: - - create a file descriptor for event notification using eventfd(2); - - open a control file to be monitored (e.g. memory.usage_in_bytes); - - write " " to cgroup.event_control. - Interpretation of args is defined by control file implementation; - -eventfd will be woken up by control file implementation or when the -cgroup is removed. - -To unregister a notification handler just close eventfd. - -NOTE: Support of notifications should be implemented for the control -file. See documentation for the subsystem. 3. Kernel API ============= diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 42f2843af1a7..ec8582b3a232 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -231,7 +231,7 @@ struct mem_cgroup_eventfd_list { /* * cgroup_event represents events which userspace want to receive. */ -struct cgroup_event { +struct mem_cgroup_event { /* * memcg which the event belongs to. */ @@ -6001,15 +6001,28 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) } #endif +/* + * DO NOT USE IN NEW FILES. + * + * "cgroup.event_control" implementation. + * + * This is way over-engineered. It tries to support fully configurable + * events for each user. Such level of flexibility is completely + * unnecessary especially in the light of the planned unified hierarchy. + * + * Please deprecate this and replace with something simpler if at all + * possible. + */ + /* * Unregister event and free resources. * * Gets called from workqueue. */ -static void cgroup_event_remove(struct work_struct *work) +static void memcg_event_remove(struct work_struct *work) { - struct cgroup_event *event = container_of(work, struct cgroup_event, - remove); + struct mem_cgroup_event *event = + container_of(work, struct mem_cgroup_event, remove); struct mem_cgroup *memcg = event->memcg; remove_wait_queue(event->wqh, &event->wait); @@ -6029,11 +6042,11 @@ static void cgroup_event_remove(struct work_struct *work) * * Called with wqh->lock held and interrupts disabled. */ -static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, - int sync, void *key) +static int memcg_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) { - struct cgroup_event *event = container_of(wait, - struct cgroup_event, wait); + struct mem_cgroup_event *event = + container_of(wait, struct mem_cgroup_event, wait); struct mem_cgroup *memcg = event->memcg; unsigned long flags = (unsigned long)key; @@ -6062,27 +6075,29 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, return 0; } -static void cgroup_event_ptable_queue_proc(struct file *file, +static void memcg_event_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { - struct cgroup_event *event = container_of(pt, - struct cgroup_event, pt); + struct mem_cgroup_event *event = + container_of(pt, struct mem_cgroup_event, pt); event->wqh = wqh; add_wait_queue(wqh, &event->wait); } /* + * DO NOT USE IN NEW FILES. + * * Parse input and register new cgroup event handler. * * Input must be in format ' '. * Interpretation of args is defined by control file implementation. */ -static int cgroup_write_event_control(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) +static int memcg_write_event_control(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct cgroup_event *event; + struct mem_cgroup_event *event; struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; struct fd efile; @@ -6107,9 +6122,9 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, event->memcg = memcg; INIT_LIST_HEAD(&event->list); - init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); - init_waitqueue_func_entry(&event->wait, cgroup_event_wake); - INIT_WORK(&event->remove, cgroup_event_remove); + init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, memcg_event_wake); + INIT_WORK(&event->remove, memcg_event_remove); efile = fdget(efd); if (!efile.file) { @@ -6140,6 +6155,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, * to be done via struct cftype but cgroup core no longer knows * about these events. The following is crude but the whole thing * is for compatibility anyway. + * + * DO NOT ADD NEW FILES. */ name = cfile.file->f_dentry->d_name.name; @@ -6251,8 +6268,8 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_hierarchy_read, }, { - .name = "cgroup.event_control", - .write_string = cgroup_write_event_control, + .name = "cgroup.event_control", /* XXX: for compat */ + .write_string = memcg_write_event_control, .flags = CFTYPE_NO_PREFIX, .mode = S_IWUGO, }, @@ -6585,7 +6602,7 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct cgroup_event *event, *tmp; + struct mem_cgroup_event *event, *tmp; /* * Unregister events and notify userspace. From b36824c75c7855585d6476eef2b234f6e0e68872 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:44 -0500 Subject: [PATCH 7/7] cgroup: unexport cgroup_css() and remove __file_cft() Now that cgroup_event is made memcg specific, the temporarily exported functions are no longer necessary. Unexport cgroup_css() and remove __file_cft() which doesn't have any user left. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov --- include/linux/cgroup.h | 5 ----- kernel/cgroup.c | 14 ++------------ 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 612adc5b87c5..8d9fa8967c9e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -883,11 +883,6 @@ unsigned short css_id(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); -/* XXX: temporary */ -struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - struct cgroup_subsys *ss); -struct cftype *__file_cft(struct file *file); - #else /* !CONFIG_CGROUPS */ static inline int cgroup_init_early(void) { return 0; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index feda7c54fa6b..c0248e16461d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -202,8 +202,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], * keep accessing it outside the said locks. This function may return * %NULL if @cgrp doesn't have @subsys_id enabled. */ -struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) +static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) { if (ss) return rcu_dereference_check(cgrp->subsys[ss->subsys_id], @@ -2625,16 +2625,6 @@ static const struct inode_operations cgroup_dir_inode_operations = { .removexattr = cgroup_removexattr, }; -/* - * Check if a file is a control file - */ -struct cftype *__file_cft(struct file *file) -{ - if (file_inode(file)->i_fop != &cgroup_file_operations) - return ERR_PTR(-EINVAL); - return __d_cft(file->f_dentry); -} - static int cgroup_create_file(struct dentry *dentry, umode_t mode, struct super_block *sb) {