From a1d82aff5df760d933b6ea3a03805dbc2bd73eb8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:02 -0500 Subject: [PATCH 01/27] kernfs: make kernfs_open_file->mmapped a bitfield More kernfs_open_file->mutex synchronized flags are planned to be added. Convert ->mmapped to a bitfield in preparation. While at it, make kernfs_fop_mmap() use "true" instead of "1" on ->mmapped. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Acked-by: Acked-by: Zefan Li --- fs/kernfs/file.c | 2 +- include/linux/kernfs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 78219d5644e9..d0b6fb13ead0 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -516,7 +516,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) goto out_put; rc = 0; - of->mmapped = 1; + of->mmapped = true; of->vm_ops = vma->vm_ops; vma->vm_ops = &kernfs_vm_ops; out_put: diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 7056238fd9f5..afd4e5abc4fb 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -185,7 +185,7 @@ struct kernfs_open_file { char *prealloc_buf; size_t atomic_write_len; - bool mmapped; + bool mmapped:1; const struct vm_operations_struct *vm_ops; }; From 0e67db2f9fe91937e798e3d7d22c50a8438187e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:03 -0500 Subject: [PATCH 02/27] kernfs: add kernfs_ops->open/release() callbacks Add ->open/release() methods to kernfs_ops. ->open() is called when the file is opened and ->release() when the file is either released or severed. These callbacks can be used, for example, to manage persistent caching objects over multiple seq_file iterations. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Acked-by: Acked-by: Zefan Li --- fs/kernfs/dir.c | 2 +- fs/kernfs/file.c | 51 ++++++++++++++++++++++++++++++++----- fs/kernfs/kernfs-internal.h | 2 +- include/linux/kernfs.h | 10 ++++++++ 4 files changed, 57 insertions(+), 8 deletions(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index cf4c636ff4da..0b9d23bcceb5 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -468,7 +468,7 @@ static void kernfs_drain(struct kernfs_node *kn) rwsem_release(&kn->dep_map, 1, _RET_IP_); } - kernfs_unmap_bin_file(kn); + kernfs_drain_open_files(kn); mutex_lock(&kernfs_mutex); } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index d0b6fb13ead0..20c396291ac1 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -708,7 +708,8 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) if (error) goto err_free; - ((struct seq_file *)file->private_data)->private = of; + of->seq_file = file->private_data; + of->seq_file->private = of; /* seq_file clears PWRITE unconditionally, restore it if WRITE */ if (file->f_mode & FMODE_WRITE) @@ -717,13 +718,22 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) /* make sure we have open node struct */ error = kernfs_get_open_node(kn, of); if (error) - goto err_close; + goto err_seq_release; + + if (ops->open) { + /* nobody has access to @of yet, skip @of->mutex */ + error = ops->open(of); + if (error) + goto err_put_node; + } /* open succeeded, put active references */ kernfs_put_active(kn); return 0; -err_close: +err_put_node: + kernfs_put_open_node(kn, of); +err_seq_release: seq_release(inode, file); err_free: kfree(of->prealloc_buf); @@ -733,11 +743,32 @@ err_out: return error; } +/* used from release/drain to ensure that ->release() is called exactly once */ +static void kernfs_release_file(struct kernfs_node *kn, + struct kernfs_open_file *of) +{ + if (!(kn->flags & KERNFS_HAS_RELEASE)) + return; + + mutex_lock(&of->mutex); + if (!of->released) { + /* + * A file is never detached without being released and we + * need to be able to release files which are deactivated + * and being drained. Don't use kernfs_ops(). + */ + kn->attr.ops->release(of); + of->released = true; + } + mutex_unlock(&of->mutex); +} + static int kernfs_fop_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; struct kernfs_open_file *of = kernfs_of(filp); + kernfs_release_file(kn, of); kernfs_put_open_node(kn, of); seq_release(inode, filp); kfree(of->prealloc_buf); @@ -746,12 +777,12 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) return 0; } -void kernfs_unmap_bin_file(struct kernfs_node *kn) +void kernfs_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; struct kernfs_open_file *of; - if (!(kn->flags & KERNFS_HAS_MMAP)) + if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE))) return; spin_lock_irq(&kernfs_open_node_lock); @@ -763,10 +794,16 @@ void kernfs_unmap_bin_file(struct kernfs_node *kn) return; mutex_lock(&kernfs_open_file_mutex); + list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); - unmap_mapping_range(inode->i_mapping, 0, 0, 1); + + if (kn->flags & KERNFS_HAS_MMAP) + unmap_mapping_range(inode->i_mapping, 0, 0, 1); + + kernfs_release_file(kn, of); } + mutex_unlock(&kernfs_open_file_mutex); kernfs_put_open_node(kn, NULL); @@ -965,6 +1002,8 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, kn->flags |= KERNFS_HAS_SEQ_SHOW; if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; + if (ops->release) + kn->flags |= KERNFS_HAS_RELEASE; rc = kernfs_add_one(kn); if (rc) { diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index bfd551bbf231..3100987cf8ba 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -104,7 +104,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, */ extern const struct file_operations kernfs_file_fops; -void kernfs_unmap_bin_file(struct kernfs_node *kn); +void kernfs_drain_open_files(struct kernfs_node *kn); /* * symlink.c diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index afd4e5abc4fb..a9b11b8d06f2 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -46,6 +46,7 @@ enum kernfs_node_flag { KERNFS_SUICIDAL = 0x0400, KERNFS_SUICIDED = 0x0800, KERNFS_EMPTY_DIR = 0x1000, + KERNFS_HAS_RELEASE = 0x2000, }; /* @flags for kernfs_create_root() */ @@ -175,6 +176,7 @@ struct kernfs_open_file { /* published fields */ struct kernfs_node *kn; struct file *file; + struct seq_file *seq_file; void *priv; /* private fields, do not use outside kernfs proper */ @@ -186,10 +188,18 @@ struct kernfs_open_file { size_t atomic_write_len; bool mmapped:1; + bool released:1; const struct vm_operations_struct *vm_ops; }; struct kernfs_ops { + /* + * Optional open/release methods. Both are called with + * @of->seq_file populated. + */ + int (*open)(struct kernfs_open_file *of); + void (*release)(struct kernfs_open_file *of); + /* * Read is handled by either seq_file or raw_read(). * From e90cbebc3fa5caea4c8bfeb0d0157a0cee53efc7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:03 -0500 Subject: [PATCH 03/27] cgroup add cftype->open/release() callbacks Pipe the newly added kernfs->open/release() callbacks through cftype. While at it, as cleanup operations now can be performed from ->release() instead of ->seq_stop(), make the latter optional. Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- include/linux/cgroup-defs.h | 3 +++ kernel/cgroup.c | 24 +++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 861b4677fc5b..8a916dc96e84 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -388,6 +388,9 @@ struct cftype { struct list_head node; /* anchored at ss->cfts */ struct kernfs_ops *kf_ops; + int (*open)(struct kernfs_open_file *of); + void (*release)(struct kernfs_open_file *of); + /* * read_u64() is a shortcut for the common case of returning a * single integer. Use it in place of read() diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2ee9ec3051b2..87167e40c4fa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3503,6 +3503,23 @@ static int cgroup_events_show(struct seq_file *seq, void *v) return 0; } +static int cgroup_file_open(struct kernfs_open_file *of) +{ + struct cftype *cft = of->kn->priv; + + if (cft->open) + return cft->open(of); + return 0; +} + +static void cgroup_file_release(struct kernfs_open_file *of) +{ + struct cftype *cft = of->kn->priv; + + if (cft->release) + cft->release(of); +} + static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -3553,7 +3570,8 @@ static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { - seq_cft(seq)->seq_stop(seq, v); + if (seq_cft(seq)->seq_stop) + seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) @@ -3575,12 +3593,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) static struct kernfs_ops cgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, + .open = cgroup_file_open, + .release = cgroup_file_release, .write = cgroup_file_write, .seq_show = cgroup_seqfile_show, }; static struct kernfs_ops cgroup_kf_ops = { .atomic_write_len = PAGE_SIZE, + .open = cgroup_file_open, + .release = cgroup_file_release, .write = cgroup_file_write, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, From b4b90a8e86f2539a028d68077b45e8511dd2adb0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:04 -0500 Subject: [PATCH 04/27] cgroup: reimplement reading "cgroup.procs" on cgroup v2 On v1, "tasks" and "cgroup.procs" are expected to be sorted which makes the implementation expensive and unnecessarily complicated involving result cache management. v2 doesn't have the sorting requirement, so it can just iterate and print processes one by one. seq_files are either read sequentially or reset to position zero, so the implementation doesn't even need to worry about seeking. This keeps the css_task_iter across multiple read(2) calls and migrations of new processes always append won't miss processes which are newly migrated in before each read(2). Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup.c | 63 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 5 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 87167e40c4fa..fd684bfd154d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4426,6 +4426,60 @@ out_err: return ret; } +static void cgroup_procs_release(struct kernfs_open_file *of) +{ + if (of->priv) { + css_task_iter_end(of->priv); + kfree(of->priv); + } +} + +static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kernfs_open_file *of = s->private; + struct css_task_iter *it = of->priv; + struct task_struct *task; + + do { + task = css_task_iter_next(it); + } while (task && !thread_group_leader(task)); + + return task; +} + +static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) +{ + struct kernfs_open_file *of = s->private; + struct cgroup *cgrp = seq_css(s)->cgroup; + struct css_task_iter *it = of->priv; + + /* + * When a seq_file is seeked, it's always traversed sequentially + * from position 0, so we can simply keep iterating on !0 *pos. + */ + if (!it) { + if (WARN_ON_ONCE((*pos)++)) + return ERR_PTR(-EINVAL); + + it = kzalloc(sizeof(*it), GFP_KERNEL); + if (!it) + return ERR_PTR(-ENOMEM); + of->priv = it; + css_task_iter_start(&cgrp->self, it); + } else if (!(*pos)++) { + css_task_iter_end(it); + css_task_iter_start(&cgrp->self, it); + } + + return cgroup_procs_next(s, NULL, NULL); +} + +static int cgroup_procs_show(struct seq_file *s, void *v) +{ + seq_printf(s, "%d\n", task_tgid_vnr(v)); + return 0; +} + /* * Stuff for reading the 'tasks'/'procs' files. * @@ -4914,11 +4968,10 @@ static struct cftype cgroup_dfl_base_files[] = { { .name = "cgroup.procs", .file_offset = offsetof(struct cgroup, procs_file), - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = CGROUP_FILE_PROCS, + .release = cgroup_procs_release, + .seq_start = cgroup_procs_start, + .seq_next = cgroup_procs_next, + .seq_show = cgroup_procs_show, .write = cgroup_procs_write, }, { From 2fae98634334e256b67241970526ddfadc77db2b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:05 -0500 Subject: [PATCH 05/27] cgroup: remove cgroup_pid_fry() and friends cgroup_pid_fry() was added to mangle cgroup.procs pid listing order on v2 to make it clear that the output is not sorted. Now that v2 now uses a separate "cgroup.procs" read method, this is no longer used. Remove it. Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup.c | 43 +++++-------------------------------------- 1 file changed, 5 insertions(+), 38 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fd684bfd154d..b9e2d85d59b1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4618,42 +4618,12 @@ after: * sorted by task pointer. As pidlists can be fairly large, allocating one * per open file is dangerous, so cgroup had to implement shared pool of * pidlists keyed by cgroup and namespace. - * - * All this extra complexity was caused by the original implementation - * committing to an entirely unnecessary property. In the long term, we - * want to do away with it. Explicitly scramble sort order if on the - * default hierarchy so that no such expectation exists in the new - * interface. - * - * Scrambling is done by swapping every two consecutive bits, which is - * non-identity one-to-one mapping which disturbs sort order sufficiently. */ -static pid_t pid_fry(pid_t pid) -{ - unsigned a = pid & 0x55555555; - unsigned b = pid & 0xAAAAAAAA; - - return (a << 1) | (b >> 1); -} - -static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) -{ - if (cgroup_on_dfl(cgrp)) - return pid_fry(pid); - else - return pid; -} - static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } -static int fried_cmppid(const void *a, const void *b) -{ - return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); -} - static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, enum cgroup_filetype type) { @@ -4741,10 +4711,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ - if (cgroup_on_dfl(cgrp)) - sort(array, length, sizeof(pid_t), fried_cmppid, NULL); - else - sort(array, length, sizeof(pid_t), cmppid, NULL); + sort(array, length, sizeof(pid_t), cmppid, NULL); if (type == CGROUP_FILE_PROCS) length = pidlist_uniq(array, length); @@ -4876,10 +4843,10 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) while (index < end) { int mid = (index + end) / 2; - if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { + if (l->list[mid] == pid) { index = mid; break; - } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) + } else if (l->list[mid] <= pid) index = mid + 1; else end = mid; @@ -4890,7 +4857,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) return NULL; /* Update the abstract position to be the actual pid that we found */ iter = l->list + index; - *pos = cgroup_pid_fry(cgrp, *iter); + *pos = *iter; return iter; } @@ -4919,7 +4886,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) if (p >= end) { return NULL; } else { - *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p); + *pos = *p; return p; } } From 5f617ebbdf10abd49312a89e3b894b927c7367f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:05 -0500 Subject: [PATCH 06/27] cgroup: reorder css_set fields Reorder css_set fields so that they're roughly in the order of how hot they are. The rough order is 1. the actual csses 2. reference counter and the default cgroup pointer. 3. task lists and iterations 4. fields used during merge including css_set lookup 5. the rest Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- include/linux/cgroup-defs.h | 56 ++++++++++++++++++------------------- kernel/cgroup.c | 13 +++++---- 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8a916dc96e84..3c02404cfce9 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -148,14 +148,18 @@ struct cgroup_subsys_state { * set for a task. */ struct css_set { - /* Reference count */ + /* + * Set of subsystem states, one for each subsystem. This array is + * immutable after creation apart from the init_css_set during + * subsystem registration (at boot time). + */ + struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; + + /* reference count */ atomic_t refcount; - /* - * List running through all cgroup groups in the same hash - * slot. Protected by css_set_lock - */ - struct hlist_node hlist; + /* the default cgroup associated with this css_set */ + struct cgroup *dfl_cgrp; /* * Lists running through all tasks using this cgroup group. @@ -167,22 +171,30 @@ struct css_set { struct list_head tasks; struct list_head mg_tasks; + /* all css_task_iters currently walking this cset */ + struct list_head task_iters; + + /* + * On the default hierarhcy, ->subsys[ssid] may point to a css + * attached to an ancestor instead of the cgroup this css_set is + * associated with. The following node is anchored at + * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to + * iterate through all css's attached to a given cgroup. + */ + struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; + + /* + * List running through all cgroup groups in the same hash + * slot. Protected by css_set_lock + */ + struct hlist_node hlist; + /* * List of cgrp_cset_links pointing at cgroups referenced from this * css_set. Protected by css_set_lock. */ struct list_head cgrp_links; - /* the default cgroup associated with this css_set */ - struct cgroup *dfl_cgrp; - - /* - * Set of subsystem states, one for each subsystem. This array is - * immutable after creation apart from the init_css_set during - * subsystem registration (at boot time). - */ - struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; - /* * List of csets participating in the on-going migration either as * source or destination. Protected by cgroup_mutex. @@ -201,18 +213,6 @@ struct css_set { struct cgroup *mg_dst_cgrp; struct css_set *mg_dst_cset; - /* - * On the default hierarhcy, ->subsys[ssid] may point to a css - * attached to an ancestor instead of the cgroup this css_set is - * associated with. The following node is anchored at - * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to - * iterate through all css's attached to a given cgroup. - */ - struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; - - /* all css_task_iters currently walking this cset */ - struct list_head task_iters; - /* dead and being drained, ignore for migration */ bool dead; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b9e2d85d59b1..1a815f275849 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -647,12 +647,12 @@ struct cgrp_cset_link { */ struct css_set init_css_set = { .refcount = ATOMIC_INIT(1), - .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), + .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), - .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), }; static int css_set_count = 1; /* 1 for init_css_set */ @@ -1095,13 +1095,13 @@ static struct css_set *find_css_set(struct css_set *old_cset, } atomic_set(&cset->refcount, 1); - INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); - INIT_LIST_HEAD(&cset->mg_preload_node); - INIT_LIST_HEAD(&cset->mg_node); INIT_LIST_HEAD(&cset->task_iters); INIT_HLIST_NODE(&cset->hlist); + INIT_LIST_HEAD(&cset->cgrp_links); + INIT_LIST_HEAD(&cset->mg_preload_node); + INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ @@ -4384,6 +4384,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) struct task_struct *task; int ret; + if (cgroup_on_dfl(to)) + return -EINVAL; + if (!cgroup_may_migrate_to(to)) return -EBUSY; From 201af4c0fab02876ef0311e7f7b4083aa138930c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:05 -0500 Subject: [PATCH 07/27] cgroup: move cgroup files under kernel/cgroup/ They're growing to be too many and planned to get split further. Move them under their own directory. kernel/cgroup.c -> kernel/cgroup/cgroup.c kernel/cgroup_freezer.c -> kernel/cgroup/freezer.c kernel/cgroup_pids.c -> kernel/cgroup/pids.c kernel/cpuset.c -> kernel/cgroup/cpuset.c Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/Makefile | 5 +---- kernel/cgroup/Makefile | 5 +++++ kernel/{ => cgroup}/cgroup.c | 0 kernel/{ => cgroup}/cpuset.c | 0 kernel/{cgroup_freezer.c => cgroup/freezer.c} | 0 kernel/{cgroup_pids.c => cgroup/pids.c} | 0 6 files changed, 6 insertions(+), 4 deletions(-) create mode 100644 kernel/cgroup/Makefile rename kernel/{ => cgroup}/cgroup.c (100%) rename kernel/{ => cgroup}/cpuset.c (100%) rename kernel/{cgroup_freezer.c => cgroup/freezer.c} (100%) rename kernel/{cgroup_pids.c => cgroup/pids.c} (100%) diff --git a/kernel/Makefile b/kernel/Makefile index 12c679f769c6..b302b4731d16 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -64,10 +64,7 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o -obj-$(CONFIG_CGROUPS) += cgroup.o -obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o -obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o -obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CGROUPS) += cgroup/ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile new file mode 100644 index 000000000000..4d561a50a5ac --- /dev/null +++ b/kernel/cgroup/Makefile @@ -0,0 +1,5 @@ +obj-y := cgroup.o + +obj-$(CONFIG_CGROUP_FREEZER) += freezer.o +obj-$(CONFIG_CGROUP_PIDS) += pids.o +obj-$(CONFIG_CPUSETS) += cpuset.o diff --git a/kernel/cgroup.c b/kernel/cgroup/cgroup.c similarity index 100% rename from kernel/cgroup.c rename to kernel/cgroup/cgroup.c diff --git a/kernel/cpuset.c b/kernel/cgroup/cpuset.c similarity index 100% rename from kernel/cpuset.c rename to kernel/cgroup/cpuset.c diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup/freezer.c similarity index 100% rename from kernel/cgroup_freezer.c rename to kernel/cgroup/freezer.c diff --git a/kernel/cgroup_pids.c b/kernel/cgroup/pids.c similarity index 100% rename from kernel/cgroup_pids.c rename to kernel/cgroup/pids.c From 0a268dbd7932c78896f5a45c8a492b31729db6c0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:06 -0500 Subject: [PATCH 08/27] cgroup: move cgroup v1 specific code to kernel/cgroup/cgroup-v1.c cgroup.c is getting too unwieldy. Let's move out cgroup v1 specific code along with the debug controller into kernel/cgroup/cgroup-v1.c. v2: cgroup_mutex and css_set_lock made available in cgroup-internal.h regardless of CONFIG_PROVE_RCU. Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup/Makefile | 2 +- kernel/cgroup/cgroup-internal.h | 103 +++ kernel/cgroup/cgroup-v1.c | 1027 ++++++++++++++++++++++++++++ kernel/cgroup/cgroup.c | 1140 +------------------------------ 4 files changed, 1164 insertions(+), 1108 deletions(-) create mode 100644 kernel/cgroup/cgroup-internal.h create mode 100644 kernel/cgroup/cgroup-v1.c diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 4d561a50a5ac..719588cb18cd 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,4 +1,4 @@ -obj-y := cgroup.o +obj-y := cgroup.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h new file mode 100644 index 000000000000..dca3193bd9d2 --- /dev/null +++ b/kernel/cgroup/cgroup-internal.h @@ -0,0 +1,103 @@ +#ifndef __CGROUP_INTERNAL_H +#define __CGROUP_INTERNAL_H + +#include +#include +#include +#include + +/* + * A cgroup can be associated with multiple css_sets as different tasks may + * belong to different cgroups on different hierarchies. In the other + * direction, a css_set is naturally associated with multiple cgroups. + * This M:N relationship is represented by the following link structure + * which exists for each association and allows traversing the associations + * from both sides. + */ +struct cgrp_cset_link { + /* the cgroup and css_set this link associates */ + struct cgroup *cgrp; + struct css_set *cset; + + /* list of cgrp_cset_links anchored at cgrp->cset_links */ + struct list_head cset_link; + + /* list of cgrp_cset_links anchored at css_set->cgrp_links */ + struct list_head cgrp_link; +}; + +extern struct mutex cgroup_mutex; +extern spinlock_t css_set_lock; +extern struct cgroup_subsys *cgroup_subsys[]; +extern struct list_head cgroup_roots; +extern struct file_system_type cgroup_fs_type; + +/* iterate across the hierarchies */ +#define for_each_root(root) \ + list_for_each_entry((root), &cgroup_roots, root_list) + +/** + * for_each_subsys - iterate all enabled cgroup subsystems + * @ss: the iteration cursor + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + */ +#define for_each_subsys(ss, ssid) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) + +static inline bool cgroup_is_dead(const struct cgroup *cgrp) +{ + return !(cgrp->self.flags & CSS_ONLINE); +} + +static inline bool notify_on_release(const struct cgroup *cgrp) +{ + return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); +} + +bool cgroup_ssid_enabled(int ssid); +bool cgroup_on_dfl(const struct cgroup *cgrp); + +struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); +struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root); +struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline); +void cgroup_kn_unlock(struct kernfs_node *kn); +int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns); + +int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); + +bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); +void cgroup_migrate_finish(struct list_head *preloaded_csets); +void cgroup_migrate_add_src(struct css_set *src_cset, + struct cgroup *dst_cgrp, + struct list_head *preloaded_csets); +int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets); +int cgroup_migrate(struct task_struct *leader, bool threadgroup, + struct cgroup_root *root); + +int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, + bool threadgroup); +ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup); +ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + +void cgroup_lock_and_drain_offline(struct cgroup *cgrp); + +/* + * cgroup-v1.c + */ +extern spinlock_t release_agent_path_lock; +extern struct cftype cgroup_legacy_base_files[]; +extern const struct file_operations proc_cgroupstats_operations; + +bool cgroup_ssid_no_v1(int ssid); +void cgroup_pidlist_destroy_all(struct cgroup *cgrp); +int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str); +void cgroup_release_agent(struct work_struct *work); +void check_for_release(struct cgroup *cgrp); + +#endif /* __CGROUP_INTERNAL_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c new file mode 100644 index 000000000000..7af745a46f91 --- /dev/null +++ b/kernel/cgroup/cgroup-v1.c @@ -0,0 +1,1027 @@ +#include "cgroup-internal.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * pidlists linger the following amount before being destroyed. The goal + * is avoiding frequent destruction in the middle of consecutive read calls + * Expiring in the middle is a performance problem not a correctness one. + * 1 sec should be enough. + */ +#define CGROUP_PIDLIST_DESTROY_DELAY HZ + +/* Controllers blocked by the commandline in v1 */ +static u16 cgroup_no_v1_mask; + +/* + * pidlist destructions need to be flushed on cgroup destruction. Use a + * separate workqueue as flush domain. + */ +static struct workqueue_struct *cgroup_pidlist_destroy_wq; + +/* + * Protects cgroup_subsys->release_agent_path. Modifying it also requires + * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. + */ +DEFINE_SPINLOCK(release_agent_path_lock); + +bool cgroup_ssid_no_v1(int ssid) +{ + return cgroup_no_v1_mask & (1 << ssid); +} + +/** + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task + * @tsk: the task to be attached + */ +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) +{ + struct cgroup_root *root; + int retval = 0; + + mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + for_each_root(root) { + struct cgroup *from_cgrp; + + if (root == &cgrp_dfl_root) + continue; + + spin_lock_irq(&css_set_lock); + from_cgrp = task_cgroup_from_root(from, root); + spin_unlock_irq(&css_set_lock); + + retval = cgroup_attach_task(from_cgrp, tsk, false); + if (retval) + break; + } + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); + +/** + * cgroup_trasnsfer_tasks - move tasks from one cgroup to another + * @to: cgroup to which the tasks will be moved + * @from: cgroup in which the tasks currently reside + * + * Locking rules between cgroup_post_fork() and the migration path + * guarantee that, if a task is forking while being migrated, the new child + * is guaranteed to be either visible in the source cgroup after the + * parent's migration is complete or put into the target cgroup. No task + * can slip out of migration through forking. + */ +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) +{ + LIST_HEAD(preloaded_csets); + struct cgrp_cset_link *link; + struct css_task_iter it; + struct task_struct *task; + int ret; + + if (cgroup_on_dfl(to)) + return -EINVAL; + + if (!cgroup_may_migrate_to(to)) + return -EBUSY; + + mutex_lock(&cgroup_mutex); + + percpu_down_write(&cgroup_threadgroup_rwsem); + + /* all tasks in @from are being moved, all csets are source */ + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &from->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, to, &preloaded_csets); + spin_unlock_irq(&css_set_lock); + + ret = cgroup_migrate_prepare_dst(&preloaded_csets); + if (ret) + goto out_err; + + /* + * Migrate tasks one-by-one until @from is empty. This fails iff + * ->can_attach() fails. + */ + do { + css_task_iter_start(&from->self, &it); + task = css_task_iter_next(&it); + if (task) + get_task_struct(task); + css_task_iter_end(&it); + + if (task) { + ret = cgroup_migrate(task, false, to->root); + if (!ret) + trace_cgroup_transfer_tasks(to, task, false); + put_task_struct(task); + } + } while (task && !ret); +out_err: + cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); + return ret; +} + +/* + * Stuff for reading the 'tasks'/'procs' files. + * + * Reading this file can return large amounts of data if a cgroup has + * *lots* of attached tasks. So it may need several calls to read(), + * but we cannot guarantee that the information we produce is correct + * unless we produce it entirely atomically. + * + */ + +/* which pidlist file are we talking about? */ +enum cgroup_filetype { + CGROUP_FILE_PROCS, + CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. + */ +struct cgroup_pidlist { + /* + * used to find which pidlist is wanted. doesn't change as long as + * this particular list stays in the list. + */ + struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; + /* array of xids */ + pid_t *list; + /* how many elements the above list has */ + int length; + /* each of these stored in a list by its cgroup */ + struct list_head links; + /* pointer to the cgroup we belong to, for list removal purposes */ + struct cgroup *owner; + /* for delayed destruction */ + struct delayed_work destroy_dwork; +}; + +/* + * The following two functions "fix" the issue where there are more pids + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. + * TODO: replace with a kernel-wide solution to this problem + */ +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) +static void *pidlist_allocate(int count) +{ + if (PIDLIST_TOO_LARGE(count)) + return vmalloc(count * sizeof(pid_t)); + else + return kmalloc(count * sizeof(pid_t), GFP_KERNEL); +} + +static void pidlist_free(void *p) +{ + kvfree(p); +} + +/* + * Used to destroy all pidlists lingering waiting for destroy timer. None + * should be left afterwards. + */ +void cgroup_pidlist_destroy_all(struct cgroup *cgrp) +{ + struct cgroup_pidlist *l, *tmp_l; + + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); + mutex_unlock(&cgrp->pidlist_mutex); + + flush_workqueue(cgroup_pidlist_destroy_wq); + BUG_ON(!list_empty(&cgrp->pidlists)); +} + +static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, + destroy_dwork); + struct cgroup_pidlist *tofree = NULL; + + mutex_lock(&l->owner->pidlist_mutex); + + /* + * Destroy iff we didn't get queued again. The state won't change + * as destroy_dwork can only be queued while locked. + */ + if (!delayed_work_pending(dwork)) { + list_del(&l->links); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + tofree = l; + } + + mutex_unlock(&l->owner->pidlist_mutex); + kfree(tofree); +} + +/* + * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries + * Returns the number of unique elements. + */ +static int pidlist_uniq(pid_t *list, int length) +{ + int src, dest = 1; + + /* + * we presume the 0th element is unique, so i starts at 1. trivial + * edge cases first; no work needs to be done for either + */ + if (length == 0 || length == 1) + return length; + /* src and dest walk down the list; dest counts unique elements */ + for (src = 1; src < length; src++) { + /* find next unique element */ + while (list[src] == list[src-1]) { + src++; + if (src == length) + goto after; + } + /* dest always points to where the next unique element goes */ + list[dest] = list[src]; + dest++; + } +after: + return dest; +} + +/* + * The two pid files - task and cgroup.procs - guaranteed that the result + * is sorted, which forced this whole pidlist fiasco. As pid order is + * different per namespace, each namespace needs differently sorted list, + * making it impossible to use, for example, single rbtree of member tasks + * sorted by task pointer. As pidlists can be fairly large, allocating one + * per open file is dangerous, so cgroup had to implement shared pool of + * pidlists keyed by cgroup and namespace. + */ +static int cmppid(const void *a, const void *b) +{ + return *(pid_t *)a - *(pid_t *)b; +} + +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = task_active_pid_ns(current); + + lockdep_assert_held(&cgrp->pidlist_mutex); + + list_for_each_entry(l, &cgrp->pidlists, links) + if (l->key.type == type && l->key.ns == ns) + return l; + return NULL; +} + +/* + * find the appropriate pidlist for our purpose (given procs vs tasks) + * returns with the lock on that pidlist already held, and takes care + * of the use count, or returns NULL with no locks held if we're out of + * memory. + */ +static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + + lockdep_assert_held(&cgrp->pidlist_mutex); + + l = cgroup_pidlist_find(cgrp, type); + if (l) + return l; + + /* entry not found; create a new one */ + l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + if (!l) + return l; + + INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); + l->key.type = type; + /* don't need task_nsproxy() if we're looking at ourself */ + l->key.ns = get_pid_ns(task_active_pid_ns(current)); + l->owner = cgrp; + list_add(&l->links, &cgrp->pidlists); + return l; +} + +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + * + * Return the number of tasks in the cgroup. The returned number can be + * higher than the actual number of tasks due to css_set references from + * namespace roots and temporary usages. + */ +static int cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += atomic_read(&link->cset->refcount); + spin_unlock_irq(&css_set_lock); + return count; +} + +/* + * Load a cgroup's pidarray with either procs' tgids or tasks' pids + */ +static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, + struct cgroup_pidlist **lp) +{ + pid_t *array; + int length; + int pid, n = 0; /* used for populating the array */ + struct css_task_iter it; + struct task_struct *tsk; + struct cgroup_pidlist *l; + + lockdep_assert_held(&cgrp->pidlist_mutex); + + /* + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. + */ + length = cgroup_task_count(cgrp); + array = pidlist_allocate(length); + if (!array) + return -ENOMEM; + /* now, populate the array */ + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + if (unlikely(n == length)) + break; + /* get tgid or pid for procs or tasks file respectively */ + if (type == CGROUP_FILE_PROCS) + pid = task_tgid_vnr(tsk); + else + pid = task_pid_vnr(tsk); + if (pid > 0) /* make sure to only use valid results */ + array[n++] = pid; + } + css_task_iter_end(&it); + length = n; + /* now sort & (if procs) strip out duplicates */ + sort(array, length, sizeof(pid_t), cmppid, NULL); + if (type == CGROUP_FILE_PROCS) + length = pidlist_uniq(array, length); + + l = cgroup_pidlist_find_create(cgrp, type); + if (!l) { + pidlist_free(array); + return -ENOMEM; + } + + /* store array, freeing old if necessary */ + pidlist_free(l->list); + l->list = array; + l->length = length; + *lp = l; + return 0; +} + +/* + * seq_file methods for the tasks/procs files. The seq_file position is the + * next pid to display; the seq_file iterator is a pointer to the pid + * in the cgroup->l->list array. + */ + +static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) +{ + /* + * Initially we receive a position value that corresponds to + * one more than the last pid shown (or 0 on the first call or + * after a seek to the start). Use a binary-search to find the + * next pid to display, if any + */ + struct kernfs_open_file *of = s->private; + struct cgroup *cgrp = seq_css(s)->cgroup; + struct cgroup_pidlist *l; + enum cgroup_filetype type = seq_cft(s)->private; + int index = 0, pid = *pos; + int *iter, ret; + + mutex_lock(&cgrp->pidlist_mutex); + + /* + * !NULL @of->priv indicates that this isn't the first start() + * after open. If the matching pidlist is around, we can use that. + * Look for it. Note that @of->priv can't be used directly. It + * could already have been destroyed. + */ + if (of->priv) + of->priv = cgroup_pidlist_find(cgrp, type); + + /* + * Either this is the first start() after open or the matching + * pidlist has been destroyed inbetween. Create a new one. + */ + if (!of->priv) { + ret = pidlist_array_load(cgrp, type, + (struct cgroup_pidlist **)&of->priv); + if (ret) + return ERR_PTR(ret); + } + l = of->priv; + + if (pid) { + int end = l->length; + + while (index < end) { + int mid = (index + end) / 2; + if (l->list[mid] == pid) { + index = mid; + break; + } else if (l->list[mid] <= pid) + index = mid + 1; + else + end = mid; + } + } + /* If we're off the end of the array, we're done */ + if (index >= l->length) + return NULL; + /* Update the abstract position to be the actual pid that we found */ + iter = l->list + index; + *pos = *iter; + return iter; +} + +static void cgroup_pidlist_stop(struct seq_file *s, void *v) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + + if (l) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, + CGROUP_PIDLIST_DESTROY_DELAY); + mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); +} + +static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + pid_t *p = v; + pid_t *end = l->list + l->length; + /* + * Advance to the next pid in the array. If this goes off the + * end, we're done + */ + p++; + if (p >= end) { + return NULL; + } else { + *pos = *p; + return p; + } +} + +static int cgroup_pidlist_show(struct seq_file *s, void *v) +{ + seq_printf(s, "%d\n", *(int *)v); + + return 0; +} + +static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup_procs_write(of, buf, nbytes, off, false); +} + +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + spin_lock(&release_agent_path_lock); + strlcpy(cgrp->root->release_agent_path, strstrip(buf), + sizeof(cgrp->root->release_agent_path)); + spin_unlock(&release_agent_path_lock); + cgroup_kn_unlock(of->kn); + return nbytes; +} + +static int cgroup_release_agent_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + spin_lock(&release_agent_path_lock); + seq_puts(seq, cgrp->root->release_agent_path); + spin_unlock(&release_agent_path_lock); + seq_putc(seq, '\n'); + return 0; +} + +static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) +{ + seq_puts(seq, "0\n"); + return 0; +} + +static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return notify_on_release(css->cgroup); +} + +static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + else + clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + return 0; +} + +static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); +} + +static int cgroup_clone_children_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + else + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + return 0; +} + +/* cgroup core interface files for the legacy hierarchies */ +struct cftype cgroup_legacy_base_files[] = { + { + .name = "cgroup.procs", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, + .write = cgroup_procs_write, + }, + { + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, + }, + { + .name = "cgroup.sane_behavior", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_sane_behavior_show, + }, + { + .name = "tasks", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_TASKS, + .write = cgroup_tasks_write, + }, + { + .name = "notify_on_release", + .read_u64 = cgroup_read_notify_on_release, + .write_u64 = cgroup_write_notify_on_release, + }, + { + .name = "release_agent", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_release_agent_show, + .write = cgroup_release_agent_write, + .max_write_len = PATH_MAX - 1, + }, + { } /* terminate */ +}; + +/* Display information about each subsystem and each hierarchy */ +static int proc_cgroupstats_show(struct seq_file *m, void *v) +{ + struct cgroup_subsys *ss; + int i; + + seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); + /* + * ideally we don't want subsystems moving around while we do this. + * cgroup_mutex is also necessary to guarantee an atomic snapshot of + * subsys/hierarchy state. + */ + mutex_lock(&cgroup_mutex); + + for_each_subsys(ss, i) + seq_printf(m, "%s\t%d\t%d\t%d\n", + ss->legacy_name, ss->root->hierarchy_id, + atomic_read(&ss->root->nr_cgrps), + cgroup_ssid_enabled(i)); + + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroupstats_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_cgroupstats_show, NULL); +} + +const struct file_operations proc_cgroupstats_operations = { + .open = cgroupstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/** + * cgroupstats_build - build and fill cgroupstats + * @stats: cgroupstats to fill information into + * @dentry: A dentry entry belonging to the cgroup for which stats have + * been requested. + * + * Build and fill cgroupstats so that taskstats can export it to user + * space. + */ +int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) +{ + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct cgroup *cgrp; + struct css_task_iter it; + struct task_struct *tsk; + + /* it should be kernfs_node belonging to cgroupfs and is a directory */ + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) + return -EINVAL; + + mutex_lock(&cgroup_mutex); + + /* + * We aren't being called from kernfs and there's no guarantee on + * @kn->priv's validity. For this and css_tryget_online_from_dir(), + * @kn->priv is RCU safe. Let's do the RCU dancing. + */ + rcu_read_lock(); + cgrp = rcu_dereference(kn->priv); + if (!cgrp || cgroup_is_dead(cgrp)) { + rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); + return -ENOENT; + } + rcu_read_unlock(); + + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + switch (tsk->state) { + case TASK_RUNNING: + stats->nr_running++; + break; + case TASK_INTERRUPTIBLE: + stats->nr_sleeping++; + break; + case TASK_UNINTERRUPTIBLE: + stats->nr_uninterruptible++; + break; + case TASK_STOPPED: + stats->nr_stopped++; + break; + default: + if (delayacct_is_task_waiting_on_io(tsk)) + stats->nr_io_wait++; + break; + } + } + css_task_iter_end(&it); + + mutex_unlock(&cgroup_mutex); + return 0; +} + +void check_for_release(struct cgroup *cgrp) +{ + if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && + !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) + schedule_work(&cgrp->release_agent_work); +} + +/* + * Notify userspace when a cgroup is released, by running the + * configured release agent with the name of the cgroup (path + * relative to the root of cgroup file system) as the argument. + * + * Most likely, this user command will try to rmdir this cgroup. + * + * This races with the possibility that some other task will be + * attached to this cgroup before it is removed, or that some other + * user task will 'mkdir' a child cgroup of this cgroup. That's ok. + * The presumed 'rmdir' will fail quietly if this cgroup is no longer + * unused, and this cgroup will be reprieved from its death sentence, + * to continue to serve a useful existence. Next time it's released, + * we will get notified again, if it still has 'notify_on_release' set. + * + * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which + * means only wait until the task is successfully execve()'d. The + * separate release agent task is forked by call_usermodehelper(), + * then control in this thread returns here, without waiting for the + * release agent task. We don't bother to wait because the caller of + * this routine has no use for the exit status of the release agent + * task, so no sense holding our caller up for that. + */ +void cgroup_release_agent(struct work_struct *work) +{ + struct cgroup *cgrp = + container_of(work, struct cgroup, release_agent_work); + char *pathbuf = NULL, *agentbuf = NULL; + char *argv[3], *envp[3]; + int ret; + + mutex_lock(&cgroup_mutex); + + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); + if (!pathbuf || !agentbuf) + goto out; + + spin_lock_irq(&css_set_lock); + ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + spin_unlock_irq(&css_set_lock); + if (ret < 0 || ret >= PATH_MAX) + goto out; + + argv[0] = agentbuf; + argv[1] = pathbuf; + argv[2] = NULL; + + /* minimal command environment */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + mutex_unlock(&cgroup_mutex); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + goto out_free; +out: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(agentbuf); + kfree(pathbuf); +} + +/* + * cgroup_rename - Only allow simple rename of directories in place. + */ +int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) +{ + struct cgroup *cgrp = kn->priv; + int ret; + + if (kernfs_type(kn) != KERNFS_DIR) + return -ENOTDIR; + if (kn->parent != new_parent) + return -EIO; + + /* + * This isn't a proper migration and its usefulness is very + * limited. Disallow on the default hierarchy. + */ + if (cgroup_on_dfl(cgrp)) + return -EPERM; + + /* + * We're gonna grab cgroup_mutex which nests outside kernfs + * active_ref. kernfs_rename() doesn't require active_ref + * protection. Break them before grabbing cgroup_mutex. + */ + kernfs_break_active_protection(new_parent); + kernfs_break_active_protection(kn); + + mutex_lock(&cgroup_mutex); + + ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) + trace_cgroup_rename(cgrp); + + mutex_unlock(&cgroup_mutex); + + kernfs_unbreak_active_protection(kn); + kernfs_unbreak_active_protection(new_parent); + return ret; +} + +static int __init cgroup1_wq_init(void) +{ + /* + * Used to destroy pidlists and separate to serve as flush domain. + * Cap @max_active to 1 too. + */ + cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", + 0, 1); + BUG_ON(!cgroup_pidlist_destroy_wq); + return 0; +} +core_initcall(cgroup1_wq_init); + +static int __init cgroup_no_v1(char *str) +{ + struct cgroup_subsys *ss; + char *token; + int i; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + if (!strcmp(token, "all")) { + cgroup_no_v1_mask = U16_MAX; + break; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + + cgroup_no_v1_mask |= 1 << i; + } + } + return 1; +} +__setup("cgroup_no_v1=", cgroup_no_v1); + + +#ifdef CONFIG_CGROUP_DEBUG +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_css_free(struct cgroup_subsys_state *css) +{ + kfree(css); +} + +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return cgroup_task_count(css->cgroup); +} + +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = atomic_read(&task_css_set(current)->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + char *name_buf; + + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; + + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + cgroup_name(c, name_buf, NAME_MAX + 1); + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name_buf); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + kfree(name_buf); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct seq_file *seq, void *v) +{ + struct cgroup_subsys_state *css = seq_css(seq); + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { + struct css_set *cset = link->cset; + struct task_struct *task; + int count = 0; + + seq_printf(seq, "css_set %p\n", cset); + + list_for_each_entry(task, &cset->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + continue; + overflow: + seq_puts(seq, " ...\n"); + } + spin_unlock_irq(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return (!cgroup_is_populated(css->cgroup) && + !css_has_online_children(&css->cgroup->self)); +} + +static struct cftype debug_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .seq_show = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, + + { } /* terminate */ +}; + +struct cgroup_subsys debug_cgrp_subsys = { + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, + .legacy_cftypes = debug_files, +}; +#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1a815f275849..d34c170f87ef 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -28,15 +28,14 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include +#include "cgroup-internal.h" + #include #include #include #include #include -#include #include -#include #include #include #include @@ -47,14 +46,8 @@ #include #include #include -#include -#include -#include -#include #include -#include #include -#include /* TODO: replace with more sophisticated array */ #include #include #include @@ -67,14 +60,6 @@ #define CREATE_TRACE_POINTS #include -/* - * pidlists linger the following amount before being destroyed. The goal - * is avoiding frequent destruction in the middle of consecutive read calls - * Expiring in the middle is a performance problem not a correctness one. - * 1 sec should be enough. - */ -#define CGROUP_PIDLIST_DESTROY_DELAY HZ - #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) @@ -88,14 +73,12 @@ * These locks are exported if CONFIG_PROVE_RCU so that accessors in * cgroup.h can use them for lockdep annotations. */ -#ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); + +#ifdef CONFIG_PROVE_RCU EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); -#else -static DEFINE_MUTEX(cgroup_mutex); -static DEFINE_SPINLOCK(css_set_lock); #endif /* @@ -110,12 +93,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); -/* - * Protects cgroup_subsys->release_agent_path. Modifying it also requires - * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. - */ -static DEFINE_SPINLOCK(release_agent_path_lock); - struct percpu_rw_semaphore cgroup_threadgroup_rwsem; #define cgroup_assert_mutex_or_rcu_locked() \ @@ -131,15 +108,9 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem; */ static struct workqueue_struct *cgroup_destroy_wq; -/* - * pidlist destructions need to be flushed on cgroup destruction. Use a - * separate workqueue as flush domain. - */ -static struct workqueue_struct *cgroup_pidlist_destroy_wq; - /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, -static struct cgroup_subsys *cgroup_subsys[] = { +struct cgroup_subsys *cgroup_subsys[] = { #include }; #undef SUBSYS @@ -186,9 +157,6 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); */ static bool cgrp_dfl_visible; -/* Controllers blocked by the commandline in v1 */ -static u16 cgroup_no_v1_mask; - /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; @@ -196,8 +164,7 @@ static u16 cgrp_dfl_inhibit_ss_mask; static unsigned long cgrp_dfl_implicit_ss_mask; /* The list of hierarchy roots */ - -static LIST_HEAD(cgroup_roots); +LIST_HEAD(cgroup_roots); static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ @@ -235,10 +202,7 @@ static u16 have_canfork_callback __read_mostly; static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; -static struct cftype cgroup_legacy_base_files[]; -static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); -static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_advance(struct css_task_iter *it); @@ -259,7 +223,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, * is fine for individual subsystems but unsuitable for cgroup core. This * is slower static_key_enabled() based test indexed by @ssid. */ -static bool cgroup_ssid_enabled(int ssid) +bool cgroup_ssid_enabled(int ssid) { if (CGROUP_SUBSYS_COUNT == 0) return false; @@ -267,11 +231,6 @@ static bool cgroup_ssid_enabled(int ssid) return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } -static bool cgroup_ssid_no_v1(int ssid) -{ - return cgroup_no_v1_mask & (1 << ssid); -} - /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest @@ -325,7 +284,7 @@ static bool cgroup_ssid_no_v1(int ssid) * * - debug: disallowed on the default hierarchy. */ -static bool cgroup_on_dfl(const struct cgroup *cgrp) +bool cgroup_on_dfl(const struct cgroup *cgrp) { return cgrp->root == &cgrp_dfl_root; } @@ -481,12 +440,6 @@ out_unlock: return css; } -/* convenient tests for these bits */ -static inline bool cgroup_is_dead(const struct cgroup *cgrp) -{ - return !(cgrp->self.flags & CSS_ONLINE); -} - static void cgroup_get(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); @@ -518,11 +471,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) } EXPORT_SYMBOL_GPL(of_css); -static int notify_on_release(const struct cgroup *cgrp) -{ - return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); -} - /** * for_each_css - iterate all css's of a cgroup * @css: the iteration cursor @@ -552,15 +500,6 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else -/** - * for_each_subsys - iterate all enabled cgroup subsystems - * @ss: the iteration cursor - * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - */ -#define for_each_subsys(ss, ssid) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ - (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) - /** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor @@ -585,10 +524,6 @@ static int notify_on_release(const struct cgroup *cgrp) } \ } while (false) -/* iterate across the hierarchies */ -#define for_each_root(root) \ - list_for_each_entry((root), &cgroup_roots, root_list) - /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ @@ -615,29 +550,6 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else -static void cgroup_release_agent(struct work_struct *work); -static void check_for_release(struct cgroup *cgrp); - -/* - * A cgroup can be associated with multiple css_sets as different tasks may - * belong to different cgroups on different hierarchies. In the other - * direction, a css_set is naturally associated with multiple cgroups. - * This M:N relationship is represented by the following link structure - * which exists for each association and allows traversing the associations - * from both sides. - */ -struct cgrp_cset_link { - /* the cgroup and css_set this link associates */ - struct cgroup *cgrp; - struct css_set *cset; - - /* list of cgrp_cset_links anchored at cgrp->cset_links */ - struct list_head cset_link; - - /* list of cgrp_cset_links anchored at css_set->cgrp_links */ - struct list_head cgrp_link; -}; - /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state @@ -1138,7 +1050,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } -static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) +struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kf_root->kn->priv; @@ -1283,8 +1195,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, * Return the cgroup for "task" from the given hierarchy. Must be * called with cgroup_mutex and css_set_lock held. */ -static struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroup_root *root) +struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root) { /* * No need to lock the task - since we hold cgroup_mutex the @@ -1321,7 +1233,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, */ static struct kernfs_syscall_ops cgroup_kf_syscall_ops; -static const struct file_operations proc_cgroupstats_operations; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) @@ -1415,7 +1326,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) * inaccessible any time. If the caller intends to continue to access the * cgroup, it should pin it before invoking this function. */ -static void cgroup_kn_unlock(struct kernfs_node *kn) +void cgroup_kn_unlock(struct kernfs_node *kn) { struct cgroup *cgrp; @@ -1447,8 +1358,7 @@ static void cgroup_kn_unlock(struct kernfs_node *kn) * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ -static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, - bool drain_offline) +struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) { struct cgroup *cgrp; @@ -1559,7 +1469,7 @@ err: return ret; } -static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) +int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; @@ -1656,8 +1566,7 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int cgroup_show_options(struct seq_file *seq, - struct kernfs_root *kf_root) +static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; @@ -2311,7 +2220,7 @@ static void cgroup_kill_sb(struct super_block *sb) kernfs_kill_sb(sb); } -static struct file_system_type cgroup_fs_type = { +struct file_system_type cgroup_fs_type = { .name = "cgroup", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, @@ -2325,8 +2234,8 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); @@ -2616,7 +2525,7 @@ out_release_tset: * zero for migration destination cgroups with tasks so that child cgroups * don't compete against tasks. */ -static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) +bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) { return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || !dst_cgrp->subtree_control; @@ -2629,7 +2538,7 @@ static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. */ -static void cgroup_migrate_finish(struct list_head *preloaded_csets) +void cgroup_migrate_finish(struct list_head *preloaded_csets) { struct css_set *cset, *tmp_cset; @@ -2662,9 +2571,9 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) * into play and the preloaded css_sets are guaranteed to cover all * migrations. */ -static void cgroup_migrate_add_src(struct css_set *src_cset, - struct cgroup *dst_cgrp, - struct list_head *preloaded_csets) +void cgroup_migrate_add_src(struct css_set *src_cset, + struct cgroup *dst_cgrp, + struct list_head *preloaded_csets) { struct cgroup *src_cgrp; @@ -2709,7 +2618,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @preloaded_csets. */ -static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) +int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) { LIST_HEAD(csets); struct css_set *src_cset, *tmp_cset; @@ -2773,8 +2682,8 @@ err: * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. */ -static int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_root *root) +int cgroup_migrate(struct task_struct *leader, bool threadgroup, + struct cgroup_root *root) { struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct task_struct *task; @@ -2806,8 +2715,8 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, * * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. */ -static int cgroup_attach_task(struct cgroup *dst_cgrp, - struct task_struct *leader, bool threadgroup) +int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, + bool threadgroup) { LIST_HEAD(preloaded_csets); struct task_struct *task; @@ -2888,8 +2797,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, * function to attach either it or all tasks in its threadgroup. Will lock * cgroup_mutex and threadgroup. */ -static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off, bool threadgroup) +ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; struct cgroup_subsys *ss; @@ -2950,86 +2859,12 @@ out_unlock_threadgroup: return ret ?: nbytes; } -/** - * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' - * @from: attach to all cgroups of a given task - * @tsk: the task to be attached - */ -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) -{ - struct cgroup_root *root; - int retval = 0; - - mutex_lock(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - for_each_root(root) { - struct cgroup *from_cgrp; - - if (root == &cgrp_dfl_root) - continue; - - spin_lock_irq(&css_set_lock); - from_cgrp = task_cgroup_from_root(from, root); - spin_unlock_irq(&css_set_lock); - - retval = cgroup_attach_task(from_cgrp, tsk, false); - if (retval) - break; - } - percpu_up_write(&cgroup_threadgroup_rwsem); - mutex_unlock(&cgroup_mutex); - - return retval; -} -EXPORT_SYMBOL_GPL(cgroup_attach_task_all); - -static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - return __cgroup_procs_write(of, buf, nbytes, off, false); -} - -static ssize_t cgroup_procs_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off) { return __cgroup_procs_write(of, buf, nbytes, off, true); } -static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct cgroup *cgrp; - - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; - spin_lock(&release_agent_path_lock); - strlcpy(cgrp->root->release_agent_path, strstrip(buf), - sizeof(cgrp->root->release_agent_path)); - spin_unlock(&release_agent_path_lock); - cgroup_kn_unlock(of->kn); - return nbytes; -} - -static int cgroup_release_agent_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - spin_lock(&release_agent_path_lock); - seq_puts(seq, cgrp->root->release_agent_path); - spin_unlock(&release_agent_path_lock); - seq_putc(seq, '\n'); - return 0; -} - -static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) -{ - seq_puts(seq, "0\n"); - return 0; -} - static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; @@ -3131,7 +2966,7 @@ out_finish: * controller while the previous css is still around. This function grabs * cgroup_mutex and drains the previous css instances of @cgrp's subtree. */ -static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) +void cgroup_lock_and_drain_offline(struct cgroup *cgrp) __acquires(&cgroup_mutex) { struct cgroup *dsct; @@ -3610,48 +3445,6 @@ static struct kernfs_ops cgroup_kf_ops = { .seq_show = cgroup_seqfile_show, }; -/* - * cgroup_rename - Only allow simple rename of directories in place. - */ -static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str) -{ - struct cgroup *cgrp = kn->priv; - int ret; - - if (kernfs_type(kn) != KERNFS_DIR) - return -ENOTDIR; - if (kn->parent != new_parent) - return -EIO; - - /* - * This isn't a proper migration and its usefulness is very - * limited. Disallow on the default hierarchy. - */ - if (cgroup_on_dfl(cgrp)) - return -EPERM; - - /* - * We're gonna grab cgroup_mutex which nests outside kernfs - * active_ref. kernfs_rename() doesn't require active_ref - * protection. Break them before grabbing cgroup_mutex. - */ - kernfs_break_active_protection(new_parent); - kernfs_break_active_protection(kn); - - mutex_lock(&cgroup_mutex); - - ret = kernfs_rename(kn, new_parent, new_name_str); - if (!ret) - trace_cgroup_rename(cgrp); - - mutex_unlock(&cgroup_mutex); - - kernfs_unbreak_active_protection(kn); - kernfs_unbreak_active_protection(new_parent); - return ret; -} - /* set uid and gid of cgroup dirs and files to that of the creator */ static int cgroup_kn_set_ugid(struct kernfs_node *kn) { @@ -3947,26 +3740,6 @@ void cgroup_file_notify(struct cgroup_file *cfile) spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } -/** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - * - * Return the number of tasks in the cgroup. The returned number can be - * higher than the actual number of tasks due to css_set references from - * namespace roots and temporary usages. - */ -static int cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += atomic_read(&link->cset->refcount); - spin_unlock_irq(&css_set_lock); - return count; -} - /** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) @@ -4365,70 +4138,6 @@ void css_task_iter_end(struct css_task_iter *it) put_task_struct(it->cur_task); } -/** - * cgroup_trasnsfer_tasks - move tasks from one cgroup to another - * @to: cgroup to which the tasks will be moved - * @from: cgroup in which the tasks currently reside - * - * Locking rules between cgroup_post_fork() and the migration path - * guarantee that, if a task is forking while being migrated, the new child - * is guaranteed to be either visible in the source cgroup after the - * parent's migration is complete or put into the target cgroup. No task - * can slip out of migration through forking. - */ -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) -{ - LIST_HEAD(preloaded_csets); - struct cgrp_cset_link *link; - struct css_task_iter it; - struct task_struct *task; - int ret; - - if (cgroup_on_dfl(to)) - return -EINVAL; - - if (!cgroup_may_migrate_to(to)) - return -EBUSY; - - mutex_lock(&cgroup_mutex); - - percpu_down_write(&cgroup_threadgroup_rwsem); - - /* all tasks in @from are being moved, all csets are source */ - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &from->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, to, &preloaded_csets); - spin_unlock_irq(&css_set_lock); - - ret = cgroup_migrate_prepare_dst(&preloaded_csets); - if (ret) - goto out_err; - - /* - * Migrate tasks one-by-one until @from is empty. This fails iff - * ->can_attach() fails. - */ - do { - css_task_iter_start(&from->self, &it); - task = css_task_iter_next(&it); - if (task) - get_task_struct(task); - css_task_iter_end(&it); - - if (task) { - ret = cgroup_migrate(task, false, to->root); - if (!ret) - trace_cgroup_transfer_tasks(to, task, false); - put_task_struct(task); - } - } while (task && !ret); -out_err: - cgroup_migrate_finish(&preloaded_csets); - percpu_up_write(&cgroup_threadgroup_rwsem); - mutex_unlock(&cgroup_mutex); - return ret; -} - static void cgroup_procs_release(struct kernfs_open_file *of) { if (of->priv) { @@ -4483,456 +4192,6 @@ static int cgroup_procs_show(struct seq_file *s, void *v) return 0; } -/* - * Stuff for reading the 'tasks'/'procs' files. - * - * Reading this file can return large amounts of data if a cgroup has - * *lots* of attached tasks. So it may need several calls to read(), - * but we cannot guarantee that the information we produce is correct - * unless we produce it entirely atomically. - * - */ - -/* which pidlist file are we talking about? */ -enum cgroup_filetype { - CGROUP_FILE_PROCS, - CGROUP_FILE_TASKS, -}; - -/* - * A pidlist is a list of pids that virtually represents the contents of one - * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, - * a pair (one each for procs, tasks) for each pid namespace that's relevant - * to the cgroup. - */ -struct cgroup_pidlist { - /* - * used to find which pidlist is wanted. doesn't change as long as - * this particular list stays in the list. - */ - struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; - /* array of xids */ - pid_t *list; - /* how many elements the above list has */ - int length; - /* each of these stored in a list by its cgroup */ - struct list_head links; - /* pointer to the cgroup we belong to, for list removal purposes */ - struct cgroup *owner; - /* for delayed destruction */ - struct delayed_work destroy_dwork; -}; - -/* - * The following two functions "fix" the issue where there are more pids - * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. - * TODO: replace with a kernel-wide solution to this problem - */ -#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) -static void *pidlist_allocate(int count) -{ - if (PIDLIST_TOO_LARGE(count)) - return vmalloc(count * sizeof(pid_t)); - else - return kmalloc(count * sizeof(pid_t), GFP_KERNEL); -} - -static void pidlist_free(void *p) -{ - kvfree(p); -} - -/* - * Used to destroy all pidlists lingering waiting for destroy timer. None - * should be left afterwards. - */ -static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) -{ - struct cgroup_pidlist *l, *tmp_l; - - mutex_lock(&cgrp->pidlist_mutex); - list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) - mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); - mutex_unlock(&cgrp->pidlist_mutex); - - flush_workqueue(cgroup_pidlist_destroy_wq); - BUG_ON(!list_empty(&cgrp->pidlists)); -} - -static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, - destroy_dwork); - struct cgroup_pidlist *tofree = NULL; - - mutex_lock(&l->owner->pidlist_mutex); - - /* - * Destroy iff we didn't get queued again. The state won't change - * as destroy_dwork can only be queued while locked. - */ - if (!delayed_work_pending(dwork)) { - list_del(&l->links); - pidlist_free(l->list); - put_pid_ns(l->key.ns); - tofree = l; - } - - mutex_unlock(&l->owner->pidlist_mutex); - kfree(tofree); -} - -/* - * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries - * Returns the number of unique elements. - */ -static int pidlist_uniq(pid_t *list, int length) -{ - int src, dest = 1; - - /* - * we presume the 0th element is unique, so i starts at 1. trivial - * edge cases first; no work needs to be done for either - */ - if (length == 0 || length == 1) - return length; - /* src and dest walk down the list; dest counts unique elements */ - for (src = 1; src < length; src++) { - /* find next unique element */ - while (list[src] == list[src-1]) { - src++; - if (src == length) - goto after; - } - /* dest always points to where the next unique element goes */ - list[dest] = list[src]; - dest++; - } -after: - return dest; -} - -/* - * The two pid files - task and cgroup.procs - guaranteed that the result - * is sorted, which forced this whole pidlist fiasco. As pid order is - * different per namespace, each namespace needs differently sorted list, - * making it impossible to use, for example, single rbtree of member tasks - * sorted by task pointer. As pidlists can be fairly large, allocating one - * per open file is dangerous, so cgroup had to implement shared pool of - * pidlists keyed by cgroup and namespace. - */ -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} - -static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, - enum cgroup_filetype type) -{ - struct cgroup_pidlist *l; - /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = task_active_pid_ns(current); - - lockdep_assert_held(&cgrp->pidlist_mutex); - - list_for_each_entry(l, &cgrp->pidlists, links) - if (l->key.type == type && l->key.ns == ns) - return l; - return NULL; -} - -/* - * find the appropriate pidlist for our purpose (given procs vs tasks) - * returns with the lock on that pidlist already held, and takes care - * of the use count, or returns NULL with no locks held if we're out of - * memory. - */ -static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, - enum cgroup_filetype type) -{ - struct cgroup_pidlist *l; - - lockdep_assert_held(&cgrp->pidlist_mutex); - - l = cgroup_pidlist_find(cgrp, type); - if (l) - return l; - - /* entry not found; create a new one */ - l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); - if (!l) - return l; - - INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); - l->key.type = type; - /* don't need task_nsproxy() if we're looking at ourself */ - l->key.ns = get_pid_ns(task_active_pid_ns(current)); - l->owner = cgrp; - list_add(&l->links, &cgrp->pidlists); - return l; -} - -/* - * Load a cgroup's pidarray with either procs' tgids or tasks' pids - */ -static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, - struct cgroup_pidlist **lp) -{ - pid_t *array; - int length; - int pid, n = 0; /* used for populating the array */ - struct css_task_iter it; - struct task_struct *tsk; - struct cgroup_pidlist *l; - - lockdep_assert_held(&cgrp->pidlist_mutex); - - /* - * If cgroup gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cgroup users didn't - * show up until sometime later on. - */ - length = cgroup_task_count(cgrp); - array = pidlist_allocate(length); - if (!array) - return -ENOMEM; - /* now, populate the array */ - css_task_iter_start(&cgrp->self, &it); - while ((tsk = css_task_iter_next(&it))) { - if (unlikely(n == length)) - break; - /* get tgid or pid for procs or tasks file respectively */ - if (type == CGROUP_FILE_PROCS) - pid = task_tgid_vnr(tsk); - else - pid = task_pid_vnr(tsk); - if (pid > 0) /* make sure to only use valid results */ - array[n++] = pid; - } - css_task_iter_end(&it); - length = n; - /* now sort & (if procs) strip out duplicates */ - sort(array, length, sizeof(pid_t), cmppid, NULL); - if (type == CGROUP_FILE_PROCS) - length = pidlist_uniq(array, length); - - l = cgroup_pidlist_find_create(cgrp, type); - if (!l) { - pidlist_free(array); - return -ENOMEM; - } - - /* store array, freeing old if necessary */ - pidlist_free(l->list); - l->list = array; - l->length = length; - *lp = l; - return 0; -} - -/** - * cgroupstats_build - build and fill cgroupstats - * @stats: cgroupstats to fill information into - * @dentry: A dentry entry belonging to the cgroup for which stats have - * been requested. - * - * Build and fill cgroupstats so that taskstats can export it to user - * space. - */ -int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) -{ - struct kernfs_node *kn = kernfs_node_from_dentry(dentry); - struct cgroup *cgrp; - struct css_task_iter it; - struct task_struct *tsk; - - /* it should be kernfs_node belonging to cgroupfs and is a directory */ - if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || - kernfs_type(kn) != KERNFS_DIR) - return -EINVAL; - - mutex_lock(&cgroup_mutex); - - /* - * We aren't being called from kernfs and there's no guarantee on - * @kn->priv's validity. For this and css_tryget_online_from_dir(), - * @kn->priv is RCU safe. Let's do the RCU dancing. - */ - rcu_read_lock(); - cgrp = rcu_dereference(kn->priv); - if (!cgrp || cgroup_is_dead(cgrp)) { - rcu_read_unlock(); - mutex_unlock(&cgroup_mutex); - return -ENOENT; - } - rcu_read_unlock(); - - css_task_iter_start(&cgrp->self, &it); - while ((tsk = css_task_iter_next(&it))) { - switch (tsk->state) { - case TASK_RUNNING: - stats->nr_running++; - break; - case TASK_INTERRUPTIBLE: - stats->nr_sleeping++; - break; - case TASK_UNINTERRUPTIBLE: - stats->nr_uninterruptible++; - break; - case TASK_STOPPED: - stats->nr_stopped++; - break; - default: - if (delayacct_is_task_waiting_on_io(tsk)) - stats->nr_io_wait++; - break; - } - } - css_task_iter_end(&it); - - mutex_unlock(&cgroup_mutex); - return 0; -} - - -/* - * seq_file methods for the tasks/procs files. The seq_file position is the - * next pid to display; the seq_file iterator is a pointer to the pid - * in the cgroup->l->list array. - */ - -static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) -{ - /* - * Initially we receive a position value that corresponds to - * one more than the last pid shown (or 0 on the first call or - * after a seek to the start). Use a binary-search to find the - * next pid to display, if any - */ - struct kernfs_open_file *of = s->private; - struct cgroup *cgrp = seq_css(s)->cgroup; - struct cgroup_pidlist *l; - enum cgroup_filetype type = seq_cft(s)->private; - int index = 0, pid = *pos; - int *iter, ret; - - mutex_lock(&cgrp->pidlist_mutex); - - /* - * !NULL @of->priv indicates that this isn't the first start() - * after open. If the matching pidlist is around, we can use that. - * Look for it. Note that @of->priv can't be used directly. It - * could already have been destroyed. - */ - if (of->priv) - of->priv = cgroup_pidlist_find(cgrp, type); - - /* - * Either this is the first start() after open or the matching - * pidlist has been destroyed inbetween. Create a new one. - */ - if (!of->priv) { - ret = pidlist_array_load(cgrp, type, - (struct cgroup_pidlist **)&of->priv); - if (ret) - return ERR_PTR(ret); - } - l = of->priv; - - if (pid) { - int end = l->length; - - while (index < end) { - int mid = (index + end) / 2; - if (l->list[mid] == pid) { - index = mid; - break; - } else if (l->list[mid] <= pid) - index = mid + 1; - else - end = mid; - } - } - /* If we're off the end of the array, we're done */ - if (index >= l->length) - return NULL; - /* Update the abstract position to be the actual pid that we found */ - iter = l->list + index; - *pos = *iter; - return iter; -} - -static void cgroup_pidlist_stop(struct seq_file *s, void *v) -{ - struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; - - if (l) - mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, - CGROUP_PIDLIST_DESTROY_DELAY); - mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); -} - -static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; - pid_t *p = v; - pid_t *end = l->list + l->length; - /* - * Advance to the next pid in the array. If this goes off the - * end, we're done - */ - p++; - if (p >= end) { - return NULL; - } else { - *pos = *p; - return p; - } -} - -static int cgroup_pidlist_show(struct seq_file *s, void *v) -{ - seq_printf(s, "%d\n", *(int *)v); - - return 0; -} - -static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return notify_on_release(css->cgroup); -} - -static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - if (val) - set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); - else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); - return 0; -} - -static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); -} - -static int cgroup_clone_children_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - if (val) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); - else - clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); - return 0; -} - /* cgroup core interface files for the default hierarchy */ static struct cftype cgroup_dfl_base_files[] = { { @@ -4962,51 +4221,6 @@ static struct cftype cgroup_dfl_base_files[] = { { } /* terminate */ }; -/* cgroup core interface files for the legacy hierarchies */ -static struct cftype cgroup_legacy_base_files[] = { - { - .name = "cgroup.procs", - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = CGROUP_FILE_PROCS, - .write = cgroup_procs_write, - }, - { - .name = "cgroup.clone_children", - .read_u64 = cgroup_clone_children_read, - .write_u64 = cgroup_clone_children_write, - }, - { - .name = "cgroup.sane_behavior", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_sane_behavior_show, - }, - { - .name = "tasks", - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = CGROUP_FILE_TASKS, - .write = cgroup_tasks_write, - }, - { - .name = "notify_on_release", - .read_u64 = cgroup_read_notify_on_release, - .write_u64 = cgroup_write_notify_on_release, - }, - { - .name = "release_agent", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_release_agent_show, - .write = cgroup_release_agent_write, - .max_write_len = PATH_MAX - 1, - }, - { } /* terminate */ -}; - /* * css destruction is four-stage process. * @@ -5792,15 +5006,6 @@ static int __init cgroup_wq_init(void) */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); - - /* - * Used to destroy pidlists and separate to serve as flush domain. - * Cap @max_active to 1 too. - */ - cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", - 0, 1); - BUG_ON(!cgroup_pidlist_destroy_wq); - return 0; } core_initcall(cgroup_wq_init); @@ -5883,42 +5088,6 @@ out: return retval; } -/* Display information about each subsystem and each hierarchy */ -static int proc_cgroupstats_show(struct seq_file *m, void *v) -{ - struct cgroup_subsys *ss; - int i; - - seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); - /* - * ideally we don't want subsystems moving around while we do this. - * cgroup_mutex is also necessary to guarantee an atomic snapshot of - * subsys/hierarchy state. - */ - mutex_lock(&cgroup_mutex); - - for_each_subsys(ss, i) - seq_printf(m, "%s\t%d\t%d\t%d\n", - ss->legacy_name, ss->root->hierarchy_id, - atomic_read(&ss->root->nr_cgrps), - cgroup_ssid_enabled(i)); - - mutex_unlock(&cgroup_mutex); - return 0; -} - -static int cgroupstats_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_cgroupstats_show, NULL); -} - -static const struct file_operations proc_cgroupstats_operations = { - .open = cgroupstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. @@ -6098,76 +5267,6 @@ void cgroup_free(struct task_struct *task) put_css_set(cset); } -static void check_for_release(struct cgroup *cgrp) -{ - if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && - !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) - schedule_work(&cgrp->release_agent_work); -} - -/* - * Notify userspace when a cgroup is released, by running the - * configured release agent with the name of the cgroup (path - * relative to the root of cgroup file system) as the argument. - * - * Most likely, this user command will try to rmdir this cgroup. - * - * This races with the possibility that some other task will be - * attached to this cgroup before it is removed, or that some other - * user task will 'mkdir' a child cgroup of this cgroup. That's ok. - * The presumed 'rmdir' will fail quietly if this cgroup is no longer - * unused, and this cgroup will be reprieved from its death sentence, - * to continue to serve a useful existence. Next time it's released, - * we will get notified again, if it still has 'notify_on_release' set. - * - * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which - * means only wait until the task is successfully execve()'d. The - * separate release agent task is forked by call_usermodehelper(), - * then control in this thread returns here, without waiting for the - * release agent task. We don't bother to wait because the caller of - * this routine has no use for the exit status of the release agent - * task, so no sense holding our caller up for that. - */ -static void cgroup_release_agent(struct work_struct *work) -{ - struct cgroup *cgrp = - container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; - char *argv[3], *envp[3]; - int ret; - - mutex_lock(&cgroup_mutex); - - pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf) - goto out; - - spin_lock_irq(&css_set_lock); - ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - spin_unlock_irq(&css_set_lock); - if (ret < 0 || ret >= PATH_MAX) - goto out; - - argv[0] = agentbuf; - argv[1] = pathbuf; - argv[2] = NULL; - - /* minimal command environment */ - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[2] = NULL; - - mutex_unlock(&cgroup_mutex); - call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - goto out_free; -out: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(agentbuf); - kfree(pathbuf); -} - static int __init cgroup_disable(char *str) { struct cgroup_subsys *ss; @@ -6189,33 +5288,6 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); -static int __init cgroup_no_v1(char *str) -{ - struct cgroup_subsys *ss; - char *token; - int i; - - while ((token = strsep(&str, ",")) != NULL) { - if (!*token) - continue; - - if (!strcmp(token, "all")) { - cgroup_no_v1_mask = U16_MAX; - break; - } - - for_each_subsys(ss, i) { - if (strcmp(token, ss->name) && - strcmp(token, ss->legacy_name)) - continue; - - cgroup_no_v1_mask |= 1 << i; - } - } - return 1; -} -__setup("cgroup_no_v1=", cgroup_no_v1); - /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -6557,149 +5629,3 @@ void cgroup_bpf_update(struct cgroup *cgrp, mutex_unlock(&cgroup_mutex); } #endif /* CONFIG_CGROUP_BPF */ - -#ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state * -debug_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_css_free(struct cgroup_subsys_state *css) -{ - kfree(css); -} - -static u64 debug_taskcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return cgroup_task_count(css->cgroup); -} - -static u64 current_css_set_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return (u64)(unsigned long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = atomic_read(&task_css_set(current)->refcount); - rcu_read_unlock(); - return count; -} - -static int current_css_set_cg_links_read(struct seq_file *seq, void *v) -{ - struct cgrp_cset_link *link; - struct css_set *cset; - char *name_buf; - - name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!name_buf) - return -ENOMEM; - - spin_lock_irq(&css_set_lock); - rcu_read_lock(); - cset = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - cgroup_name(c, name_buf, NAME_MAX + 1); - seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name_buf); - } - rcu_read_unlock(); - spin_unlock_irq(&css_set_lock); - kfree(name_buf); - return 0; -} - -#define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct seq_file *seq, void *v) -{ - struct cgroup_subsys_state *css = seq_css(seq); - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { - struct css_set *cset = link->cset; - struct task_struct *task; - int count = 0; - - seq_printf(seq, "css_set %p\n", cset); - - list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - - list_for_each_entry(task, &cset->mg_tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - continue; - overflow: - seq_puts(seq, " ...\n"); - } - spin_unlock_irq(&css_set_lock); - return 0; -} - -static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - return (!cgroup_is_populated(css->cgroup) && - !css_has_online_children(&css->cgroup->self)); -} - -static struct cftype debug_files[] = { - { - .name = "taskcount", - .read_u64 = debug_taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "current_css_set_cg_links", - .seq_show = current_css_set_cg_links_read, - }, - - { - .name = "cgroup_css_links", - .seq_show = cgroup_css_links_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, - - { } /* terminate */ -}; - -struct cgroup_subsys debug_cgrp_subsys = { - .css_alloc = debug_css_alloc, - .css_free = debug_css_free, - .legacy_cftypes = debug_files, -}; -#endif /* CONFIG_CGROUP_DEBUG */ From 633feee310de6b6c3191011140b88fe772f560cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:07 -0500 Subject: [PATCH 09/27] cgroup: refactor mount path and clearly distinguish v1 and v2 paths While sharing some mechanisms, the mount paths of v1 and v2 are substantially different. Their implementations were mixed in cgroup_mount(). This patch splits them out so that they're easier to follow and organize. This patch causes one functional change - the WARN_ON(new_sb) gets lost. This is because the actual mounting gets moved to cgroup_do_mount() and thus @new_sb is no longer accessible by default to cgroup1_mount(). While we can add it as an explicit out parameter to cgroup_do_mount(), this part of code hasn't changed and the warning hasn't triggered for quite a while. Dropping it should be fine. Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup/cgroup.c | 148 +++++++++++++++++++++++------------------ 1 file changed, 83 insertions(+), 65 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d34c170f87ef..228dd9ae708b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1989,48 +1989,55 @@ out: return ret; } -static struct dentry *cgroup_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +static struct dentry *cgroup_do_mount(struct file_system_type *fs_type, + int flags, struct cgroup_root *root, + unsigned long magic, + struct cgroup_namespace *ns) { - bool is_v2 = fs_type == &cgroup2_fs_type; - struct super_block *pinned_sb = NULL; - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; - struct cgroup_subsys *ss; - struct cgroup_root *root; - struct cgroup_sb_opts opts; struct dentry *dentry; - int ret; - int i; bool new_sb; - get_cgroup_ns(ns); - - /* Check if the caller has permission to mount. */ - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { - put_cgroup_ns(ns); - return ERR_PTR(-EPERM); - } + dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); /* - * The first time anyone tries to mount a cgroup, enable the list - * linking each css_set to its tasks and fix up all existing tasks. + * In non-init cgroup namespace, instead of root cgroup's dentry, + * we return the dentry corresponding to the cgroupns->root_cgrp. */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); + if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { + struct dentry *nsdentry; + struct cgroup *cgrp; - if (is_v2) { - if (data) { - pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); - put_cgroup_ns(ns); - return ERR_PTR(-EINVAL); - } - cgrp_dfl_visible = true; - root = &cgrp_dfl_root; - cgroup_get(&root->cgrp); - goto out_mount; + mutex_lock(&cgroup_mutex); + spin_lock_irq(&css_set_lock); + + cgrp = cset_cgroup_from_root(ns->root_cset, root); + + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); + dput(dentry); + dentry = nsdentry; } + if (IS_ERR(dentry) || !new_sb) + cgroup_put(&root->cgrp); + + return dentry; +} + +static struct dentry *cgroup1_mount(struct file_system_type *fs_type, + int flags, void *data, + unsigned long magic, + struct cgroup_namespace *ns) +{ + struct super_block *pinned_sb = NULL; + struct cgroup_sb_opts opts; + struct cgroup_root *root; + struct cgroup_subsys *ss; + struct dentry *dentry; + int i, ret; + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* First find the desired set of subsystems */ @@ -2152,47 +2159,58 @@ out_free: kfree(opts.release_agent); kfree(opts.name); - if (ret) { - put_cgroup_ns(ns); + if (ret) return ERR_PTR(ret); - } -out_mount: - dentry = kernfs_mount(fs_type, flags, root->kf_root, - is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, - &new_sb); - /* - * In non-init cgroup namespace, instead of root cgroup's - * dentry, we return the dentry corresponding to the - * cgroupns->root_cgrp. - */ - if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { - struct dentry *nsdentry; - struct cgroup *cgrp; - - mutex_lock(&cgroup_mutex); - spin_lock_irq(&css_set_lock); - - cgrp = cset_cgroup_from_root(ns->root_cset, root); - - spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); - - nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); - dput(dentry); - dentry = nsdentry; - } - - if (IS_ERR(dentry) || !new_sb) - cgroup_put(&root->cgrp); + dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, + CGROUP_SUPER_MAGIC, ns); /* * If @pinned_sb, we're reusing an existing root and holding an * extra ref on its sb. Mount is complete. Put the extra ref. */ - if (pinned_sb) { - WARN_ON(new_sb); + if (pinned_sb) deactivate_super(pinned_sb); + + return dentry; +} + +static struct dentry *cgroup_mount(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data) +{ + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct dentry *dentry; + + get_cgroup_ns(ns); + + /* Check if the caller has permission to mount. */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { + put_cgroup_ns(ns); + return ERR_PTR(-EPERM); + } + + /* + * The first time anyone tries to mount a cgroup, enable the list + * linking each css_set to its tasks and fix up all existing tasks. + */ + if (!use_task_css_set_links) + cgroup_enable_task_cg_lists(); + + if (fs_type == &cgroup2_fs_type) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + put_cgroup_ns(ns); + return ERR_PTR(-EINVAL); + } + cgrp_dfl_visible = true; + cgroup_get(&cgrp_dfl_root.cgrp); + + dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, + CGROUP2_SUPER_MAGIC, ns); + } else { + dentry = cgroup1_mount(&cgroup_fs_type, flags, data, + CGROUP_SUPER_MAGIC, ns); } put_cgroup_ns(ns); From fa069904dd38c2d8e121a3c7e37f8daaddb6dafa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:07 -0500 Subject: [PATCH 10/27] cgroup: separate out cgroup1_kf_syscall_ops Currently, cgroup_kf_syscall_ops is shared by v1 and v2 and the specific methods test the version and take different actions. Split out v1 functions and put them in cgroup1_kf_syscall_ops and remove the now unnecessary explicit branches in specific methods. Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup/cgroup-internal.h | 4 +-- kernel/cgroup/cgroup-v1.c | 11 ++------- kernel/cgroup/cgroup.c | 44 +++++++++++++++++++++------------ 3 files changed, 32 insertions(+), 27 deletions(-) diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index dca3193bd9d2..5790e5ff9a0f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -95,8 +95,8 @@ extern const struct file_operations proc_cgroupstats_operations; bool cgroup_ssid_no_v1(int ssid); void cgroup_pidlist_destroy_all(struct cgroup *cgrp); -int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str); +int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str); void cgroup_release_agent(struct work_struct *work); void check_for_release(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 7af745a46f91..0b2c24f0b310 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -800,8 +800,8 @@ out_free: /* * cgroup_rename - Only allow simple rename of directories in place. */ -int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str) +int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) { struct cgroup *cgrp = kn->priv; int ret; @@ -811,13 +811,6 @@ int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, if (kn->parent != new_parent) return -EIO; - /* - * This isn't a proper migration and its usefulness is very - * limited. Disallow on the default hierarchy. - */ - if (cgroup_on_dfl(cgrp)) - return -EPERM; - /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. kernfs_rename() doesn't require active_ref diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 228dd9ae708b..de6a2ac41d0b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1232,6 +1232,7 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ +static struct kernfs_syscall_ops cgroup1_kf_syscall_ops; static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, @@ -1566,16 +1567,15 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) +static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; - if (root != &cgrp_dfl_root) - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_show_option(seq, ss->legacy_name, NULL); + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_show_option(seq, ss->legacy_name, NULL); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) @@ -1736,18 +1736,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return 0; } -static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) +static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) { int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; u16 added_mask, removed_mask; - if (root == &cgrp_dfl_root) { - pr_err("remount is not allowed\n"); - return -EINVAL; - } - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ @@ -1798,6 +1793,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) return ret; } +static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) +{ + pr_err("remount is not allowed\n"); + return -EINVAL; +} + /* * To reduce the fork() overhead for systems that are not actually using * their cgroups capability, we don't maintain the lists running through @@ -1900,6 +1901,7 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; + struct kernfs_syscall_ops *kf_sops; struct css_set *cset; int i, ret; @@ -1931,7 +1933,10 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto cancel_ref; - root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, + kf_sops = root == &cgrp_dfl_root ? + &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; + + root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED, root_cgrp); if (IS_ERR(root->kf_root)) { @@ -4813,12 +4818,19 @@ static int cgroup_rmdir(struct kernfs_node *kn) return ret; } -static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { - .remount_fs = cgroup_remount, - .show_options = cgroup_show_options, +static struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { + .remount_fs = cgroup1_remount, + .show_options = cgroup1_show_options, + .rename = cgroup1_rename, + .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .show_path = cgroup_show_path, +}; + +static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { + .remount_fs = cgroup_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, - .rename = cgroup_rename, .show_path = cgroup_show_path, }; From 1592c9b223749d59b933ebbfe37f1a8833d7a6cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:08 -0500 Subject: [PATCH 11/27] cgroup: move v1 mount functions to kernel/cgroup/cgroup-v1.c Now that the v1 mount code is split into separate functions, move them to kernel/cgroup/cgroup-v1.c along with the mount option handling code. As this puts all v1-only kernfs_syscall_ops in cgroup-v1.c, move cgroup1_kf_syscall_ops to cgroup-v1.c too. Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup/cgroup-internal.h | 28 ++- kernel/cgroup/cgroup-v1.c | 381 ++++++++++++++++++++++++++++- kernel/cgroup/cgroup.c | 410 +------------------------------- 3 files changed, 413 insertions(+), 406 deletions(-) diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 5790e5ff9a0f..710edeeb1f9f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -26,6 +26,16 @@ struct cgrp_cset_link { struct list_head cgrp_link; }; +struct cgroup_sb_opts { + u16 subsys_mask; + unsigned int flags; + char *release_agent; + bool cpuset_clone_children; + char *name; + /* User explicitly requested empty subsystem */ + bool none; +}; + extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; @@ -66,7 +76,13 @@ void cgroup_kn_unlock(struct kernfs_node *kn); int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); +void cgroup_free_root(struct cgroup_root *root); +void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, + struct cgroup_root *root, unsigned long magic, + struct cgroup_namespace *ns); bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); void cgroup_migrate_finish(struct list_head *preloaded_csets); @@ -86,18 +102,24 @@ ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes void cgroup_lock_and_drain_offline(struct cgroup *cgrp); +int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode); +int cgroup_rmdir(struct kernfs_node *kn); +int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root); + /* * cgroup-v1.c */ -extern spinlock_t release_agent_path_lock; extern struct cftype cgroup_legacy_base_files[]; extern const struct file_operations proc_cgroupstats_operations; +extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; bool cgroup_ssid_no_v1(int ssid); void cgroup_pidlist_destroy_all(struct cgroup *cgrp); -int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str); void cgroup_release_agent(struct work_struct *work); void check_for_release(struct cgroup *cgrp); +struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, + void *data, unsigned long magic, + struct cgroup_namespace *ns); #endif /* __CGROUP_INTERNAL_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 0b2c24f0b310..ae240c0d33cb 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1,7 +1,9 @@ #include "cgroup-internal.h" +#include #include #include +#include #include #include #include @@ -32,7 +34,7 @@ static struct workqueue_struct *cgroup_pidlist_destroy_wq; * Protects cgroup_subsys->release_agent_path. Modifying it also requires * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. */ -DEFINE_SPINLOCK(release_agent_path_lock); +static DEFINE_SPINLOCK(release_agent_path_lock); bool cgroup_ssid_no_v1(int ssid) { @@ -800,8 +802,8 @@ out_free: /* * cgroup_rename - Only allow simple rename of directories in place. */ -int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str) +static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) { struct cgroup *cgrp = kn->priv; int ret; @@ -832,6 +834,379 @@ int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, return ret; } +static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) +{ + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_subsys *ss; + int ssid; + + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_show_option(seq, ss->legacy_name, NULL); + if (root->flags & CGRP_ROOT_NOPREFIX) + seq_puts(seq, ",noprefix"); + if (root->flags & CGRP_ROOT_XATTR) + seq_puts(seq, ",xattr"); + + spin_lock(&release_agent_path_lock); + if (strlen(root->release_agent_path)) + seq_show_option(seq, "release_agent", + root->release_agent_path); + spin_unlock(&release_agent_path_lock); + + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) + seq_puts(seq, ",clone_children"); + if (strlen(root->name)) + seq_show_option(seq, "name", root->name); + return 0; +} + +static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) +{ + char *token, *o = data; + bool all_ss = false, one_ss = false; + u16 mask = U16_MAX; + struct cgroup_subsys *ss; + int nr_opts = 0; + int i; + +#ifdef CONFIG_CPUSETS + mask = ~((u16)1 << cpuset_cgrp_id); +#endif + + memset(opts, 0, sizeof(*opts)); + + while ((token = strsep(&o, ",")) != NULL) { + nr_opts++; + + if (!*token) + return -EINVAL; + if (!strcmp(token, "none")) { + /* Explicitly have no subsystems */ + opts->none = true; + continue; + } + if (!strcmp(token, "all")) { + /* Mutually exclusive option 'all' + subsystem name */ + if (one_ss) + return -EINVAL; + all_ss = true; + continue; + } + if (!strcmp(token, "noprefix")) { + opts->flags |= CGRP_ROOT_NOPREFIX; + continue; + } + if (!strcmp(token, "clone_children")) { + opts->cpuset_clone_children = true; + continue; + } + if (!strcmp(token, "xattr")) { + opts->flags |= CGRP_ROOT_XATTR; + continue; + } + if (!strncmp(token, "release_agent=", 14)) { + /* Specifying two release agents is forbidden */ + if (opts->release_agent) + return -EINVAL; + opts->release_agent = + kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); + if (!opts->release_agent) + return -ENOMEM; + continue; + } + if (!strncmp(token, "name=", 5)) { + const char *name = token + 5; + /* Can't specify an empty name */ + if (!strlen(name)) + return -EINVAL; + /* Must match [\w.-]+ */ + for (i = 0; i < strlen(name); i++) { + char c = name[i]; + if (isalnum(c)) + continue; + if ((c == '.') || (c == '-') || (c == '_')) + continue; + return -EINVAL; + } + /* Specifying two names is forbidden */ + if (opts->name) + return -EINVAL; + opts->name = kstrndup(name, + MAX_CGROUP_ROOT_NAMELEN - 1, + GFP_KERNEL); + if (!opts->name) + return -ENOMEM; + + continue; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->legacy_name)) + continue; + if (!cgroup_ssid_enabled(i)) + continue; + if (cgroup_ssid_no_v1(i)) + continue; + + /* Mutually exclusive option 'all' + subsystem name */ + if (all_ss) + return -EINVAL; + opts->subsys_mask |= (1 << i); + one_ss = true; + + break; + } + if (i == CGROUP_SUBSYS_COUNT) + return -ENOENT; + } + + /* + * If the 'all' option was specified select all the subsystems, + * otherwise if 'none', 'name=' and a subsystem name options were + * not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) + opts->subsys_mask |= (1 << i); + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; + + /* + * Option noprefix was introduced just for backward compatibility + * with the old cpuset, so we allow noprefix only if mounting just + * the cpuset subsystem. + */ + if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) + return -EINVAL; + + /* Can't specify "none" and some subsystems */ + if (opts->subsys_mask && opts->none) + return -EINVAL; + + return 0; +} + +static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) +{ + int ret = 0; + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_sb_opts opts; + u16 added_mask, removed_mask; + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* See what subsystems are wanted */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", + task_tgid_nr(current), current->comm); + + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; + + /* Don't allow flags or name to change at remount */ + if ((opts.flags ^ root->flags) || + (opts.name && strcmp(opts.name, root->name))) { + pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", + opts.flags, opts.name ?: "", root->flags, root->name); + ret = -EINVAL; + goto out_unlock; + } + + /* remounting is not allowed for populated hierarchies */ + if (!list_empty(&root->cgrp.self.children)) { + ret = -EBUSY; + goto out_unlock; + } + + ret = rebind_subsystems(root, added_mask); + if (ret) + goto out_unlock; + + WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); + + if (opts.release_agent) { + spin_lock(&release_agent_path_lock); + strcpy(root->release_agent_path, opts.release_agent); + spin_unlock(&release_agent_path_lock); + } + + trace_cgroup_remount(root); + + out_unlock: + kfree(opts.release_agent); + kfree(opts.name); + mutex_unlock(&cgroup_mutex); + return ret; +} + +struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { + .rename = cgroup1_rename, + .show_options = cgroup1_show_options, + .remount_fs = cgroup1_remount, + .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .show_path = cgroup_show_path, +}; + +struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, + void *data, unsigned long magic, + struct cgroup_namespace *ns) +{ + struct super_block *pinned_sb = NULL; + struct cgroup_sb_opts opts; + struct cgroup_root *root; + struct cgroup_subsys *ss; + struct dentry *dentry; + int i, ret; + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* First find the desired set of subsystems */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + /* + * Destruction of cgroup root is asynchronous, so subsystems may + * still be dying after the previous unmount. Let's drain the + * dying subsystems. We just need to ensure that the ones + * unmounted previously finish dying and don't care about new ones + * starting. Testing ref liveliness is good enough. + */ + for_each_subsys(ss, i) { + if (!(opts.subsys_mask & (1 << i)) || + ss->root == &cgrp_dfl_root) + continue; + + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + cgroup_put(&ss->root->cgrp); + } + + for_each_root(root) { + bool name_match = false; + + if (root == &cgrp_dfl_root) + continue; + + /* + * If we asked for a name then it must match. Also, if + * name matches but sybsys_mask doesn't, we should fail. + * Remember whether name matched. + */ + if (opts.name) { + if (strcmp(opts.name, root->name)) + continue; + name_match = true; + } + + /* + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match. + */ + if ((opts.subsys_mask || opts.none) && + (opts.subsys_mask != root->subsys_mask)) { + if (!name_match) + continue; + ret = -EBUSY; + goto out_unlock; + } + + if (root->flags ^ opts.flags) + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); + + /* + * We want to reuse @root whose lifetime is governed by its + * ->cgrp. Let's check whether @root is alive and keep it + * that way. As cgroup_kill_sb() can happen anytime, we + * want to block it by pinning the sb so that @root doesn't + * get killed before mount is complete. + * + * With the sb pinned, tryget_live can reliably indicate + * whether @root can be reused. If it's being killed, + * drain it. We can use wait_queue for the wait but this + * path is super cold. Let's just sleep a bit and retry. + */ + pinned_sb = kernfs_pin_sb(root->kf_root, NULL); + if (IS_ERR(pinned_sb) || + !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + if (!IS_ERR_OR_NULL(pinned_sb)) + deactivate_super(pinned_sb); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + + ret = 0; + goto out_unlock; + } + + /* + * No such thing, create a new one. name= matching without subsys + * specification is allowed for already existing hierarchies but we + * can't create new one without subsys specification. + */ + if (!opts.subsys_mask && !opts.none) { + ret = -EINVAL; + goto out_unlock; + } + + /* Hierarchies may only be created in the initial cgroup namespace. */ + if (ns != &init_cgroup_ns) { + ret = -EPERM; + goto out_unlock; + } + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) { + ret = -ENOMEM; + goto out_unlock; + } + + init_cgroup_root(root, &opts); + + ret = cgroup_setup_root(root, opts.subsys_mask); + if (ret) + cgroup_free_root(root); + +out_unlock: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(opts.release_agent); + kfree(opts.name); + + if (ret) + return ERR_PTR(ret); + + dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, + CGROUP_SUPER_MAGIC, ns); + + /* + * If @pinned_sb, we're reusing an existing root and holding an + * extra ref on its sb. Mount is complete. Put the extra ref. + */ + if (pinned_sb) + deactivate_super(pinned_sb); + + return dentry; +} + static int __init cgroup1_wq_init(void) { /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index de6a2ac41d0b..4be306510aff 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -31,7 +31,6 @@ #include "cgroup-internal.h" #include -#include #include #include #include @@ -49,7 +48,6 @@ #include #include #include -#include #include #include #include @@ -1078,7 +1076,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } -static void cgroup_free_root(struct cgroup_root *root) +void cgroup_free_root(struct cgroup_root *root) { if (root) { idr_destroy(&root->cgroup_idr); @@ -1232,7 +1230,6 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -static struct kernfs_syscall_ops cgroup1_kf_syscall_ops; static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, @@ -1540,8 +1537,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) return 0; } -static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, - struct kernfs_root *kf_root) +int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root) { int len = 0; char *buf = NULL; @@ -1567,232 +1564,6 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) -{ - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_subsys *ss; - int ssid; - - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_show_option(seq, ss->legacy_name, NULL); - if (root->flags & CGRP_ROOT_NOPREFIX) - seq_puts(seq, ",noprefix"); - if (root->flags & CGRP_ROOT_XATTR) - seq_puts(seq, ",xattr"); - - spin_lock(&release_agent_path_lock); - if (strlen(root->release_agent_path)) - seq_show_option(seq, "release_agent", - root->release_agent_path); - spin_unlock(&release_agent_path_lock); - - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) - seq_puts(seq, ",clone_children"); - if (strlen(root->name)) - seq_show_option(seq, "name", root->name); - return 0; -} - -struct cgroup_sb_opts { - u16 subsys_mask; - unsigned int flags; - char *release_agent; - bool cpuset_clone_children; - char *name; - /* User explicitly requested empty subsystem */ - bool none; -}; - -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) -{ - char *token, *o = data; - bool all_ss = false, one_ss = false; - u16 mask = U16_MAX; - struct cgroup_subsys *ss; - int nr_opts = 0; - int i; - -#ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); -#endif - - memset(opts, 0, sizeof(*opts)); - - while ((token = strsep(&o, ",")) != NULL) { - nr_opts++; - - if (!*token) - return -EINVAL; - if (!strcmp(token, "none")) { - /* Explicitly have no subsystems */ - opts->none = true; - continue; - } - if (!strcmp(token, "all")) { - /* Mutually exclusive option 'all' + subsystem name */ - if (one_ss) - return -EINVAL; - all_ss = true; - continue; - } - if (!strcmp(token, "noprefix")) { - opts->flags |= CGRP_ROOT_NOPREFIX; - continue; - } - if (!strcmp(token, "clone_children")) { - opts->cpuset_clone_children = true; - continue; - } - if (!strcmp(token, "xattr")) { - opts->flags |= CGRP_ROOT_XATTR; - continue; - } - if (!strncmp(token, "release_agent=", 14)) { - /* Specifying two release agents is forbidden */ - if (opts->release_agent) - return -EINVAL; - opts->release_agent = - kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); - if (!opts->release_agent) - return -ENOMEM; - continue; - } - if (!strncmp(token, "name=", 5)) { - const char *name = token + 5; - /* Can't specify an empty name */ - if (!strlen(name)) - return -EINVAL; - /* Must match [\w.-]+ */ - for (i = 0; i < strlen(name); i++) { - char c = name[i]; - if (isalnum(c)) - continue; - if ((c == '.') || (c == '-') || (c == '_')) - continue; - return -EINVAL; - } - /* Specifying two names is forbidden */ - if (opts->name) - return -EINVAL; - opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN - 1, - GFP_KERNEL); - if (!opts->name) - return -ENOMEM; - - continue; - } - - for_each_subsys(ss, i) { - if (strcmp(token, ss->legacy_name)) - continue; - if (!cgroup_ssid_enabled(i)) - continue; - if (cgroup_ssid_no_v1(i)) - continue; - - /* Mutually exclusive option 'all' + subsystem name */ - if (all_ss) - return -EINVAL; - opts->subsys_mask |= (1 << i); - one_ss = true; - - break; - } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; - } - - /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options were - * not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) - opts->subsys_mask |= (1 << i); - - /* - * We either have to specify by name or by subsystems. (So all - * empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; - - /* - * Option noprefix was introduced just for backward compatibility - * with the old cpuset, so we allow noprefix only if mounting just - * the cpuset subsystem. - */ - if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) - return -EINVAL; - - /* Can't specify "none" and some subsystems */ - if (opts->subsys_mask && opts->none) - return -EINVAL; - - return 0; -} - -static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) -{ - int ret = 0; - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_sb_opts opts; - u16 added_mask, removed_mask; - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* See what subsystems are wanted */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) - pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", - task_tgid_nr(current), current->comm); - - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; - - /* Don't allow flags or name to change at remount */ - if ((opts.flags ^ root->flags) || - (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags, opts.name ?: "", root->flags, root->name); - ret = -EINVAL; - goto out_unlock; - } - - /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->cgrp.self.children)) { - ret = -EBUSY; - goto out_unlock; - } - - ret = rebind_subsystems(root, added_mask); - if (ret) - goto out_unlock; - - WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); - - if (opts.release_agent) { - spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, opts.release_agent); - spin_unlock(&release_agent_path_lock); - } - - trace_cgroup_remount(root); - - out_unlock: - kfree(opts.release_agent); - kfree(opts.name); - mutex_unlock(&cgroup_mutex); - return ret; -} - static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { pr_err("remount is not allowed\n"); @@ -1877,8 +1648,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); } -static void init_cgroup_root(struct cgroup_root *root, - struct cgroup_sb_opts *opts) +void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) { struct cgroup *cgrp = &root->cgrp; @@ -1897,7 +1667,7 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -1994,10 +1764,9 @@ out: return ret; } -static struct dentry *cgroup_do_mount(struct file_system_type *fs_type, - int flags, struct cgroup_root *root, - unsigned long magic, - struct cgroup_namespace *ns) +struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, + struct cgroup_root *root, unsigned long magic, + struct cgroup_namespace *ns) { struct dentry *dentry; bool new_sb; @@ -2031,155 +1800,6 @@ static struct dentry *cgroup_do_mount(struct file_system_type *fs_type, return dentry; } -static struct dentry *cgroup1_mount(struct file_system_type *fs_type, - int flags, void *data, - unsigned long magic, - struct cgroup_namespace *ns) -{ - struct super_block *pinned_sb = NULL; - struct cgroup_sb_opts opts; - struct cgroup_root *root; - struct cgroup_subsys *ss; - struct dentry *dentry; - int i, ret; - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* First find the desired set of subsystems */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - /* - * Destruction of cgroup root is asynchronous, so subsystems may - * still be dying after the previous unmount. Let's drain the - * dying subsystems. We just need to ensure that the ones - * unmounted previously finish dying and don't care about new ones - * starting. Testing ref liveliness is good enough. - */ - for_each_subsys(ss, i) { - if (!(opts.subsys_mask & (1 << i)) || - ss->root == &cgrp_dfl_root) - continue; - - if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - cgroup_put(&ss->root->cgrp); - } - - for_each_root(root) { - bool name_match = false; - - if (root == &cgrp_dfl_root) - continue; - - /* - * If we asked for a name then it must match. Also, if - * name matches but sybsys_mask doesn't, we should fail. - * Remember whether name matched. - */ - if (opts.name) { - if (strcmp(opts.name, root->name)) - continue; - name_match = true; - } - - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match. - */ - if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { - if (!name_match) - continue; - ret = -EBUSY; - goto out_unlock; - } - - if (root->flags ^ opts.flags) - pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - - /* - * We want to reuse @root whose lifetime is governed by its - * ->cgrp. Let's check whether @root is alive and keep it - * that way. As cgroup_kill_sb() can happen anytime, we - * want to block it by pinning the sb so that @root doesn't - * get killed before mount is complete. - * - * With the sb pinned, tryget_live can reliably indicate - * whether @root can be reused. If it's being killed, - * drain it. We can use wait_queue for the wait but this - * path is super cold. Let's just sleep a bit and retry. - */ - pinned_sb = kernfs_pin_sb(root->kf_root, NULL); - if (IS_ERR(pinned_sb) || - !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - if (!IS_ERR_OR_NULL(pinned_sb)) - deactivate_super(pinned_sb); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - - ret = 0; - goto out_unlock; - } - - /* - * No such thing, create a new one. name= matching without subsys - * specification is allowed for already existing hierarchies but we - * can't create new one without subsys specification. - */ - if (!opts.subsys_mask && !opts.none) { - ret = -EINVAL; - goto out_unlock; - } - - /* Hierarchies may only be created in the initial cgroup namespace. */ - if (ns != &init_cgroup_ns) { - ret = -EPERM; - goto out_unlock; - } - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - ret = -ENOMEM; - goto out_unlock; - } - - init_cgroup_root(root, &opts); - - ret = cgroup_setup_root(root, opts.subsys_mask); - if (ret) - cgroup_free_root(root); - -out_unlock: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(opts.release_agent); - kfree(opts.name); - - if (ret) - return ERR_PTR(ret); - - dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, - CGROUP_SUPER_MAGIC, ns); - - /* - * If @pinned_sb, we're reusing an existing root and holding an - * extra ref on its sb. Mount is complete. Put the extra ref. - */ - if (pinned_sb) - deactivate_super(pinned_sb); - - return dentry; -} - static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) @@ -4587,8 +4207,7 @@ out_destroy: return ERR_PTR(ret); } -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; struct kernfs_node *kn; @@ -4800,7 +4419,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return 0; }; -static int cgroup_rmdir(struct kernfs_node *kn) +int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; int ret = 0; @@ -4818,15 +4437,6 @@ static int cgroup_rmdir(struct kernfs_node *kn) return ret; } -static struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { - .remount_fs = cgroup1_remount, - .show_options = cgroup1_show_options, - .rename = cgroup1_rename, - .mkdir = cgroup_mkdir, - .rmdir = cgroup_rmdir, - .show_path = cgroup_show_path, -}; - static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .remount_fs = cgroup_remount, .mkdir = cgroup_mkdir, From d62beb7f3dc6b45f9b9d381897e05fe8ba286d8a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:08 -0500 Subject: [PATCH 12/27] cgroup: rename functions for consistency Now that v1 functions are separated out, rename some functions for consistency. cgroup_dfl_base_files -> cgroup_base_files cgroup_legacy_base_files -> cgroup1_base_files cgroup_ssid_no_v1() -> cgroup1_ssid_disabled() cgroup_pidlist_destroy_all -> cgroup1_pidlist_destroy_all() cgroup_release_agent() -> cgroup1_release_agent() check_for_release() -> cgroup1_check_for_release() Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup/cgroup-internal.h | 10 +++++----- kernel/cgroup/cgroup-v1.c | 14 +++++++------- kernel/cgroup/cgroup.c | 22 +++++++++++----------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 710edeeb1f9f..a890c92cb688 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -110,14 +110,14 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, /* * cgroup-v1.c */ -extern struct cftype cgroup_legacy_base_files[]; +extern struct cftype cgroup1_base_files[]; extern const struct file_operations proc_cgroupstats_operations; extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; -bool cgroup_ssid_no_v1(int ssid); -void cgroup_pidlist_destroy_all(struct cgroup *cgrp); -void cgroup_release_agent(struct work_struct *work); -void check_for_release(struct cgroup *cgrp); +bool cgroup1_ssid_disabled(int ssid); +void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); +void cgroup1_release_agent(struct work_struct *work); +void cgroup1_check_for_release(struct cgroup *cgrp); struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, void *data, unsigned long magic, struct cgroup_namespace *ns); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index ae240c0d33cb..37be09e09740 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -36,7 +36,7 @@ static struct workqueue_struct *cgroup_pidlist_destroy_wq; */ static DEFINE_SPINLOCK(release_agent_path_lock); -bool cgroup_ssid_no_v1(int ssid) +bool cgroup1_ssid_disabled(int ssid) { return cgroup_no_v1_mask & (1 << ssid); } @@ -201,7 +201,7 @@ static void pidlist_free(void *p) * Used to destroy all pidlists lingering waiting for destroy timer. None * should be left afterwards. */ -void cgroup_pidlist_destroy_all(struct cgroup *cgrp) +void cgroup1_pidlist_destroy_all(struct cgroup *cgrp) { struct cgroup_pidlist *l, *tmp_l; @@ -585,7 +585,7 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, } /* cgroup core interface files for the legacy hierarchies */ -struct cftype cgroup_legacy_base_files[] = { +struct cftype cgroup1_base_files[] = { { .name = "cgroup.procs", .seq_start = cgroup_pidlist_start, @@ -729,7 +729,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) return 0; } -void check_for_release(struct cgroup *cgrp) +void cgroup1_check_for_release(struct cgroup *cgrp) { if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) @@ -759,7 +759,7 @@ void check_for_release(struct cgroup *cgrp) * this routine has no use for the exit status of the release agent * task, so no sense holding our caller up for that. */ -void cgroup_release_agent(struct work_struct *work) +void cgroup1_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); @@ -946,7 +946,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; if (!cgroup_ssid_enabled(i)) continue; - if (cgroup_ssid_no_v1(i)) + if (cgroup1_ssid_disabled(i)) continue; /* Mutually exclusive option 'all' + subsystem name */ @@ -968,7 +968,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) */ if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) opts->subsys_mask |= (1 << i); /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4be306510aff..a05a2dacf5dc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -199,7 +199,7 @@ struct cgroup_namespace init_cgroup_ns = { static u16 have_canfork_callback __read_mostly; static struct file_system_type cgroup2_fs_type; -static struct cftype cgroup_dfl_base_files[]; +static struct cftype cgroup_base_files[]; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); @@ -609,7 +609,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (!trigger) break; - check_for_release(cgrp); + cgroup1_check_for_release(cgrp); cgroup_file_notify(&cgrp->events_file); cgrp = cgroup_parent(cgrp); @@ -1440,9 +1440,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (!css->ss) { if (cgroup_on_dfl(cgrp)) - cfts = cgroup_dfl_base_files; + cfts = cgroup_base_files; else - cfts = cgroup_legacy_base_files; + cfts = cgroup1_base_files; return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); } @@ -1645,7 +1645,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); init_waitqueue_head(&cgrp->offline_waitq); - INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); + INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) @@ -3836,7 +3836,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v) } /* cgroup core interface files for the default hierarchy */ -static struct cftype cgroup_dfl_base_files[] = { +static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", .file_offset = offsetof(struct cgroup, procs_file), @@ -3909,7 +3909,7 @@ static void css_free_work_fn(struct work_struct *work) } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); - cgroup_pidlist_destroy_all(cgrp); + cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); if (cgroup_parent(cgrp)) { @@ -4411,7 +4411,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); - check_for_release(cgroup_parent(cgrp)); + cgroup1_check_for_release(cgroup_parent(cgrp)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -4548,8 +4548,8 @@ int __init cgroup_init(void) BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); /* * The latency of the synchronize_sched() is too high for cgroups, @@ -4599,7 +4599,7 @@ int __init cgroup_init(void) continue; } - if (cgroup_ssid_no_v1(ssid)) + if (cgroup1_ssid_disabled(ssid)) printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", ss->name); From dcfe149b9f45aaf89bb95e8b314210da626417d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:09 -0500 Subject: [PATCH 13/27] cgroup: move namespace code to kernel/cgroup/namespace.c get/put_css_set() get exposed in cgroup-internal.h in the process. Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup/Makefile | 2 +- kernel/cgroup/cgroup-internal.h | 32 ++++++ kernel/cgroup/cgroup.c | 175 +------------------------------- kernel/cgroup/namespace.c | 155 ++++++++++++++++++++++++++++ 4 files changed, 189 insertions(+), 175 deletions(-) create mode 100644 kernel/cgroup/namespace.c diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 719588cb18cd..6d42a3211164 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,4 +1,4 @@ -obj-y := cgroup.o cgroup-v1.o +obj-y := cgroup.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index a890c92cb688..589b0e7013ec 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -65,6 +65,33 @@ static inline bool notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } +void put_css_set_locked(struct css_set *cset); + +static inline void put_css_set(struct css_set *cset) +{ + unsigned long flags; + + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cset->refcount, -1, 1)) + return; + + spin_lock_irqsave(&css_set_lock, flags); + put_css_set_locked(cset); + spin_unlock_irqrestore(&css_set_lock, flags); +} + +/* + * refcounted get/put for css_set objects + */ +static inline void get_css_set(struct css_set *cset) +{ + atomic_inc(&cset->refcount); +} + bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); @@ -107,6 +134,11 @@ int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); +/* + * namespace.c + */ +extern const struct proc_ns_operations cgroupns_operations; + /* * cgroup-v1.c */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a05a2dacf5dc..b6b9068ef468 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -718,7 +718,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -static void put_css_set_locked(struct css_set *cset) +void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; @@ -748,31 +748,6 @@ static void put_css_set_locked(struct css_set *cset) kfree_rcu(cset, rcu_head); } -static void put_css_set(struct css_set *cset) -{ - unsigned long flags; - - /* - * Ensure that the refcount doesn't hit zero while any readers - * can see it. Similar to atomic_dec_and_lock(), but for an - * rwlock - */ - if (atomic_add_unless(&cset->refcount, -1, 1)) - return; - - spin_lock_irqsave(&css_set_lock, flags); - put_css_set_locked(cset); - spin_unlock_irqrestore(&css_set_lock, flags); -} - -/* - * refcounted get/put for css_set objects - */ -static inline void get_css_set(struct css_set *cset) -{ - atomic_inc(&cset->refcount); -} - /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested @@ -5109,154 +5084,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ -/* cgroup namespaces */ - -static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) -{ - return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); -} - -static void dec_cgroup_namespaces(struct ucounts *ucounts) -{ - dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); -} - -static struct cgroup_namespace *alloc_cgroup_ns(void) -{ - struct cgroup_namespace *new_ns; - int ret; - - new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); - if (!new_ns) - return ERR_PTR(-ENOMEM); - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - return ERR_PTR(ret); - } - atomic_set(&new_ns->count, 1); - new_ns->ns.ops = &cgroupns_operations; - return new_ns; -} - -void free_cgroup_ns(struct cgroup_namespace *ns) -{ - put_css_set(ns->root_cset); - dec_cgroup_namespaces(ns->ucounts); - put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kfree(ns); -} -EXPORT_SYMBOL(free_cgroup_ns); - -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, - struct user_namespace *user_ns, - struct cgroup_namespace *old_ns) -{ - struct cgroup_namespace *new_ns; - struct ucounts *ucounts; - struct css_set *cset; - - BUG_ON(!old_ns); - - if (!(flags & CLONE_NEWCGROUP)) { - get_cgroup_ns(old_ns); - return old_ns; - } - - /* Allow only sysadmin to create cgroup namespace. */ - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - ucounts = inc_cgroup_namespaces(user_ns); - if (!ucounts) - return ERR_PTR(-ENOSPC); - - /* It is not safe to take cgroup_mutex here */ - spin_lock_irq(&css_set_lock); - cset = task_css_set(current); - get_css_set(cset); - spin_unlock_irq(&css_set_lock); - - new_ns = alloc_cgroup_ns(); - if (IS_ERR(new_ns)) { - put_css_set(cset); - dec_cgroup_namespaces(ucounts); - return new_ns; - } - - new_ns->user_ns = get_user_ns(user_ns); - new_ns->ucounts = ucounts; - new_ns->root_cset = cset; - - return new_ns; -} - -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - -static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) -{ - struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); - - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || - !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; - - /* Don't need to do anything if we are attaching to our own cgroupns. */ - if (cgroup_ns == nsproxy->cgroup_ns) - return 0; - - get_cgroup_ns(cgroup_ns); - put_cgroup_ns(nsproxy->cgroup_ns); - nsproxy->cgroup_ns = cgroup_ns; - - return 0; -} - -static struct ns_common *cgroupns_get(struct task_struct *task) -{ - struct cgroup_namespace *ns = NULL; - struct nsproxy *nsproxy; - - task_lock(task); - nsproxy = task->nsproxy; - if (nsproxy) { - ns = nsproxy->cgroup_ns; - get_cgroup_ns(ns); - } - task_unlock(task); - - return ns ? &ns->ns : NULL; -} - -static void cgroupns_put(struct ns_common *ns) -{ - put_cgroup_ns(to_cg_ns(ns)); -} - -static struct user_namespace *cgroupns_owner(struct ns_common *ns) -{ - return to_cg_ns(ns)->user_ns; -} - -const struct proc_ns_operations cgroupns_operations = { - .name = "cgroup", - .type = CLONE_NEWCGROUP, - .get = cgroupns_get, - .put = cgroupns_put, - .install = cgroupns_install, - .owner = cgroupns_owner, -}; - -static __init int cgroup_namespaces_init(void) -{ - return 0; -} -subsys_initcall(cgroup_namespaces_init); - #ifdef CONFIG_CGROUP_BPF void cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c new file mode 100644 index 000000000000..cff7ea62c38f --- /dev/null +++ b/kernel/cgroup/namespace.c @@ -0,0 +1,155 @@ +#include "cgroup-internal.h" + +#include +#include +#include +#include + + +/* cgroup namespaces */ + +static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); +} + +static void dec_cgroup_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); +} + +static struct cgroup_namespace *alloc_cgroup_ns(void) +{ + struct cgroup_namespace *new_ns; + int ret; + + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + atomic_set(&new_ns->count, 1); + new_ns->ns.ops = &cgroupns_operations; + return new_ns; +} + +void free_cgroup_ns(struct cgroup_namespace *ns) +{ + put_css_set(ns->root_cset); + dec_cgroup_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} +EXPORT_SYMBOL(free_cgroup_ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + struct cgroup_namespace *new_ns; + struct ucounts *ucounts; + struct css_set *cset; + + BUG_ON(!old_ns); + + if (!(flags & CLONE_NEWCGROUP)) { + get_cgroup_ns(old_ns); + return old_ns; + } + + /* Allow only sysadmin to create cgroup namespace. */ + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + ucounts = inc_cgroup_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + + /* It is not safe to take cgroup_mutex here */ + spin_lock_irq(&css_set_lock); + cset = task_css_set(current); + get_css_set(cset); + spin_unlock_irq(&css_set_lock); + + new_ns = alloc_cgroup_ns(); + if (IS_ERR(new_ns)) { + put_css_set(cset); + dec_cgroup_namespaces(ucounts); + return new_ns; + } + + new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; + new_ns->root_cset = cset; + + return new_ns; +} + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +{ + struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); + + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Don't need to do anything if we are attaching to our own cgroupns. */ + if (cgroup_ns == nsproxy->cgroup_ns) + return 0; + + get_cgroup_ns(cgroup_ns); + put_cgroup_ns(nsproxy->cgroup_ns); + nsproxy->cgroup_ns = cgroup_ns; + + return 0; +} + +static struct ns_common *cgroupns_get(struct task_struct *task) +{ + struct cgroup_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->cgroup_ns; + get_cgroup_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void cgroupns_put(struct ns_common *ns) +{ + put_cgroup_ns(to_cg_ns(ns)); +} + +static struct user_namespace *cgroupns_owner(struct ns_common *ns) +{ + return to_cg_ns(ns)->user_ns; +} + +const struct proc_ns_operations cgroupns_operations = { + .name = "cgroup", + .type = CLONE_NEWCGROUP, + .get = cgroupns_get, + .put = cgroupns_put, + .install = cgroupns_install, + .owner = cgroupns_owner, +}; + +static __init int cgroup_namespaces_init(void) +{ + return 0; +} +subsys_initcall(cgroup_namespaces_init); From e0aed7c74f0bf6c643103f3a10ff988b7ee6545a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:09 -0500 Subject: [PATCH 14/27] cgroup: fix RCU related sparse warnings kn->priv which is a void * is used as a RCU pointer by cgroup. When dereferencing it, it was passing kn->priv to rcu_derefreence() without casting it into a RCU pointer triggering address space mismatch warning from sparse. Fix them. Signed-off-by: Tejun Heo Reported-by: Fengguang Wu Acked-by: Acked-by: Zefan Li --- kernel/cgroup/cgroup-v1.c | 2 +- kernel/cgroup/cgroup.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 37be09e09740..465b10111eaf 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -694,7 +694,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) * @kn->priv is RCU safe. Let's do the RCU dancing. */ rcu_read_lock(); - cgrp = rcu_dereference(kn->priv); + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (!cgrp || cgroup_is_dead(cgrp)) { rcu_read_unlock(); mutex_unlock(&cgroup_mutex); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b6b9068ef468..d9d82e96d67f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4932,7 +4932,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, * have been or be removed at any point. @kn->priv is RCU * protected for this access. See css_release_work_fn() for details. */ - cgrp = rcu_dereference(kn->priv); + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp) css = cgroup_css(cgrp, ss); From 7b4632f048415263669676dda20fd5d811c3d3e4 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 24 Dec 2016 23:28:35 +0800 Subject: [PATCH 15/27] cgroup: fix a comment typo Fix a comment typo in cgroup.h. Signed-off-by: Geliang Tang Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c83c23f0577b..f6b43fbb141c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -266,7 +266,7 @@ void css_task_iter_end(struct css_task_iter *it); * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor * @dst_css: the destination css - * @tset: takset to iterate + * @tset: taskset to iterate * * Iterate threadgroup leaders of @tset. For single-task migrations, @tset * may not contain any. From 39d3e7584a686541a3295ff1624d341e669e1afc Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 10 Jan 2017 00:02:13 +0000 Subject: [PATCH 16/27] rdmacg: Added rdma cgroup controller Added rdma cgroup controller that does accounting, limit enforcement on rdma/IB resources. Added rdma cgroup header file which defines its APIs to perform charging/uncharging functionality. It also defined APIs for RDMA/IB stack for device registration. Devices which are registered will participate in controller functions of accounting and limit enforcements. It define rdmacg_device structure to bind IB stack and RDMA cgroup controller. RDMA resources are tracked using resource pool. Resource pool is per device, per cgroup entity which allows setting up accounting limits on per device basis. Currently resources are defined by the RDMA cgroup. Resource pool is created/destroyed dynamically whenever charging/uncharging occurs respectively and whenever user configuration is done. Its a tradeoff of memory vs little more code space that creates resource pool object whenever necessary, instead of creating them during cgroup creation and device registration time. Signed-off-by: Parav Pandit Signed-off-by: Tejun Heo --- include/linux/cgroup_rdma.h | 53 +++ include/linux/cgroup_subsys.h | 4 + init/Kconfig | 10 + kernel/cgroup/Makefile | 1 + kernel/cgroup/rdma.c | 617 ++++++++++++++++++++++++++++++++++ 5 files changed, 685 insertions(+) create mode 100644 include/linux/cgroup_rdma.h create mode 100644 kernel/cgroup/rdma.c diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h new file mode 100644 index 000000000000..e94290b29e99 --- /dev/null +++ b/include/linux/cgroup_rdma.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2016 Parav Pandit + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#ifndef _CGROUP_RDMA_H +#define _CGROUP_RDMA_H + +#include + +enum rdmacg_resource_type { + RDMACG_RESOURCE_HCA_HANDLE, + RDMACG_RESOURCE_HCA_OBJECT, + RDMACG_RESOURCE_MAX, +}; + +#ifdef CONFIG_CGROUP_RDMA + +struct rdma_cgroup { + struct cgroup_subsys_state css; + + /* + * head to keep track of all resource pools + * that belongs to this cgroup. + */ + struct list_head rpools; +}; + +struct rdmacg_device { + struct list_head dev_node; + struct list_head rpools; + char *name; +}; + +/* + * APIs for RDMA/IB stack to publish when a device wants to + * participate in resource accounting + */ +int rdmacg_register_device(struct rdmacg_device *device); +void rdmacg_unregister_device(struct rdmacg_device *device); + +/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */ +int rdmacg_try_charge(struct rdma_cgroup **rdmacg, + struct rdmacg_device *device, + enum rdmacg_resource_type index); +void rdmacg_uncharge(struct rdma_cgroup *cg, + struct rdmacg_device *device, + enum rdmacg_resource_type index); +#endif /* CONFIG_CGROUP_RDMA */ +#endif /* _CGROUP_RDMA_H */ diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 0df0336acee9..d0e597c44585 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -56,6 +56,10 @@ SUBSYS(hugetlb) SUBSYS(pids) #endif +#if IS_ENABLED(CONFIG_CGROUP_RDMA) +SUBSYS(rdma) +#endif + /* * The following subsystems are not supported on the default hierarchy. */ diff --git a/init/Kconfig b/init/Kconfig index 223b734abccd..ef80d46a32b6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1090,6 +1090,16 @@ config CGROUP_PIDS since the PIDs limit only affects a process's ability to fork, not to attach to a cgroup. +config CGROUP_RDMA + bool "RDMA controller" + help + Provides enforcement of RDMA resources defined by IB stack. + It is fairly easy for consumers to exhaust RDMA resources, which + can result into resource unavailability to other consumers. + RDMA controller is designed to stop this from happening. + Attaching processes with active RDMA resources to the cgroup + hierarchy is allowed even if can cross the hierarchy's limit. + config CGROUP_FREEZER bool "Freezer controller" help diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 6d42a3211164..387348a40c64 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -2,4 +2,5 @@ obj-y := cgroup.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o +obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c new file mode 100644 index 000000000000..021bee7a9692 --- /dev/null +++ b/kernel/cgroup/rdma.c @@ -0,0 +1,617 @@ +/* + * RDMA resource limiting controller for cgroups. + * + * Used to allow a cgroup hierarchy to stop processes from consuming + * additional RDMA resources after a certain limit is reached. + * + * Copyright (C) 2016 Parav Pandit + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include +#include +#include +#include +#include +#include + +#define RDMACG_MAX_STR "max" + +/* + * Protects list of resource pools maintained on per cgroup basis + * and rdma device list. + */ +static DEFINE_MUTEX(rdmacg_mutex); +static LIST_HEAD(rdmacg_devices); + +enum rdmacg_file_type { + RDMACG_RESOURCE_TYPE_MAX, + RDMACG_RESOURCE_TYPE_STAT, +}; + +/* + * resource table definition as to be seen by the user. + * Need to add entries to it when more resources are + * added/defined at IB verb/core layer. + */ +static char const *rdmacg_resource_names[] = { + [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle", + [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object", +}; + +/* resource tracker for each resource of rdma cgroup */ +struct rdmacg_resource { + int max; + int usage; +}; + +/* + * resource pool object which represents per cgroup, per device + * resources. There are multiple instances of this object per cgroup, + * therefore it cannot be embedded within rdma_cgroup structure. It + * is maintained as list. + */ +struct rdmacg_resource_pool { + struct rdmacg_device *device; + struct rdmacg_resource resources[RDMACG_RESOURCE_MAX]; + + struct list_head cg_node; + struct list_head dev_node; + + /* count active user tasks of this pool */ + u64 usage_sum; + /* total number counts which are set to max */ + int num_max_cnt; +}; + +static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) +{ + return container_of(css, struct rdma_cgroup, css); +} + +static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg) +{ + return css_rdmacg(cg->css.parent); +} + +static inline struct rdma_cgroup *get_current_rdmacg(void) +{ + return css_rdmacg(task_get_css(current, rdma_cgrp_id)); +} + +static void set_resource_limit(struct rdmacg_resource_pool *rpool, + int index, int new_max) +{ + if (new_max == S32_MAX) { + if (rpool->resources[index].max != S32_MAX) + rpool->num_max_cnt++; + } else { + if (rpool->resources[index].max == S32_MAX) + rpool->num_max_cnt--; + } + rpool->resources[index].max = new_max; +} + +static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool) +{ + int i; + + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) + set_resource_limit(rpool, i, S32_MAX); +} + +static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) +{ + lockdep_assert_held(&rdmacg_mutex); + + list_del(&rpool->cg_node); + list_del(&rpool->dev_node); + kfree(rpool); +} + +static struct rdmacg_resource_pool * +find_cg_rpool_locked(struct rdma_cgroup *cg, + struct rdmacg_device *device) + +{ + struct rdmacg_resource_pool *pool; + + lockdep_assert_held(&rdmacg_mutex); + + list_for_each_entry(pool, &cg->rpools, cg_node) + if (pool->device == device) + return pool; + + return NULL; +} + +static struct rdmacg_resource_pool * +get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) +{ + struct rdmacg_resource_pool *rpool; + + rpool = find_cg_rpool_locked(cg, device); + if (rpool) + return rpool; + + rpool = kzalloc(sizeof(*rpool), GFP_KERNEL); + if (!rpool) + return ERR_PTR(-ENOMEM); + + rpool->device = device; + set_all_resource_max_limit(rpool); + + INIT_LIST_HEAD(&rpool->cg_node); + INIT_LIST_HEAD(&rpool->dev_node); + list_add_tail(&rpool->cg_node, &cg->rpools); + list_add_tail(&rpool->dev_node, &device->rpools); + return rpool; +} + +/** + * uncharge_cg_locked - uncharge resource for rdma cgroup + * @cg: pointer to cg to uncharge and all parents in hierarchy + * @device: pointer to rdmacg device + * @index: index of the resource to uncharge in cg (resource pool) + * + * It also frees the resource pool which was created as part of + * charging operation when there are no resources attached to + * resource pool. + */ +static void +uncharge_cg_locked(struct rdma_cgroup *cg, + struct rdmacg_device *device, + enum rdmacg_resource_type index) +{ + struct rdmacg_resource_pool *rpool; + + rpool = find_cg_rpool_locked(cg, device); + + /* + * rpool cannot be null at this stage. Let kernel operate in case + * if there a bug in IB stack or rdma controller, instead of crashing + * the system. + */ + if (unlikely(!rpool)) { + pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device); + return; + } + + rpool->resources[index].usage--; + + /* + * A negative count (or overflow) is invalid, + * it indicates a bug in the rdma controller. + */ + WARN_ON_ONCE(rpool->resources[index].usage < 0); + rpool->usage_sum--; + if (rpool->usage_sum == 0 && + rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { + /* + * No user of the rpool and all entries are set to max, so + * safe to delete this rpool. + */ + free_cg_rpool_locked(rpool); + } +} + +/** + * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count + * @device: pointer to rdmacg device + * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup + * stop uncharging + * @index: index of the resource to uncharge in cg in given resource pool + */ +static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, + struct rdmacg_device *device, + struct rdma_cgroup *stop_cg, + enum rdmacg_resource_type index) +{ + struct rdma_cgroup *p; + + mutex_lock(&rdmacg_mutex); + + for (p = cg; p != stop_cg; p = parent_rdmacg(p)) + uncharge_cg_locked(p, device, index); + + mutex_unlock(&rdmacg_mutex); + + css_put(&cg->css); +} + +/** + * rdmacg_uncharge - hierarchically uncharge rdma resource count + * @device: pointer to rdmacg device + * @index: index of the resource to uncharge in cgroup in given resource pool + */ +void rdmacg_uncharge(struct rdma_cgroup *cg, + struct rdmacg_device *device, + enum rdmacg_resource_type index) +{ + if (index >= RDMACG_RESOURCE_MAX) + return; + + rdmacg_uncharge_hierarchy(cg, device, NULL, index); +} +EXPORT_SYMBOL(rdmacg_uncharge); + +/** + * rdmacg_try_charge - hierarchically try to charge the rdma resource + * @rdmacg: pointer to rdma cgroup which will own this resource + * @device: pointer to rdmacg device + * @index: index of the resource to charge in cgroup (resource pool) + * + * This function follows charging resource in hierarchical way. + * It will fail if the charge would cause the new value to exceed the + * hierarchical limit. + * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. + * Returns pointer to rdmacg for this resource when charging is successful. + * + * Charger needs to account resources on two criteria. + * (a) per cgroup & (b) per device resource usage. + * Per cgroup resource usage ensures that tasks of cgroup doesn't cross + * the configured limits. Per device provides granular configuration + * in multi device usage. It allocates resource pool in the hierarchy + * for each parent it come across for first resource. Later on resource + * pool will be available. Therefore it will be much faster thereon + * to charge/uncharge. + */ +int rdmacg_try_charge(struct rdma_cgroup **rdmacg, + struct rdmacg_device *device, + enum rdmacg_resource_type index) +{ + struct rdma_cgroup *cg, *p; + struct rdmacg_resource_pool *rpool; + s64 new; + int ret = 0; + + if (index >= RDMACG_RESOURCE_MAX) + return -EINVAL; + + /* + * hold on to css, as cgroup can be removed but resource + * accounting happens on css. + */ + cg = get_current_rdmacg(); + + mutex_lock(&rdmacg_mutex); + for (p = cg; p; p = parent_rdmacg(p)) { + rpool = get_cg_rpool_locked(p, device); + if (IS_ERR(rpool)) { + ret = PTR_ERR(rpool); + goto err; + } else { + new = rpool->resources[index].usage + 1; + if (new > rpool->resources[index].max) { + ret = -EAGAIN; + goto err; + } else { + rpool->resources[index].usage = new; + rpool->usage_sum++; + } + } + } + mutex_unlock(&rdmacg_mutex); + + *rdmacg = cg; + return 0; + +err: + mutex_unlock(&rdmacg_mutex); + rdmacg_uncharge_hierarchy(cg, device, p, index); + return ret; +} +EXPORT_SYMBOL(rdmacg_try_charge); + +/** + * rdmacg_register_device - register rdmacg device to rdma controller. + * @device: pointer to rdmacg device whose resources need to be accounted. + * + * If IB stack wish a device to participate in rdma cgroup resource + * tracking, it must invoke this API to register with rdma cgroup before + * any user space application can start using the RDMA resources. + * Returns 0 on success or EINVAL when table length given is beyond + * supported size. + */ +int rdmacg_register_device(struct rdmacg_device *device) +{ + INIT_LIST_HEAD(&device->dev_node); + INIT_LIST_HEAD(&device->rpools); + + mutex_lock(&rdmacg_mutex); + list_add_tail(&device->dev_node, &rdmacg_devices); + mutex_unlock(&rdmacg_mutex); + return 0; +} +EXPORT_SYMBOL(rdmacg_register_device); + +/** + * rdmacg_unregister_device - unregister rdmacg device from rdma controller. + * @device: pointer to rdmacg device which was previously registered with rdma + * controller using rdmacg_register_device(). + * + * IB stack must invoke this after all the resources of the IB device + * are destroyed and after ensuring that no more resources will be created + * when this API is invoked. + */ +void rdmacg_unregister_device(struct rdmacg_device *device) +{ + struct rdmacg_resource_pool *rpool, *tmp; + + /* + * Synchronize with any active resource settings, + * usage query happening via configfs. + */ + mutex_lock(&rdmacg_mutex); + list_del_init(&device->dev_node); + + /* + * Now that this device is off the cgroup list, its safe to free + * all the rpool resources. + */ + list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node) + free_cg_rpool_locked(rpool); + + mutex_unlock(&rdmacg_mutex); +} +EXPORT_SYMBOL(rdmacg_unregister_device); + +static int parse_resource(char *c, int *intval) +{ + substring_t argstr; + const char **table = &rdmacg_resource_names[0]; + char *name, *value = c; + size_t len; + int ret, i = 0; + + name = strsep(&value, "="); + if (!name || !value) + return -EINVAL; + + len = strlen(value); + + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { + if (strcmp(table[i], name)) + continue; + + argstr.from = value; + argstr.to = value + len; + + ret = match_int(&argstr, intval); + if (ret >= 0) { + if (*intval < 0) + break; + return i; + } + if (strncmp(value, RDMACG_MAX_STR, len) == 0) { + *intval = S32_MAX; + return i; + } + break; + } + return -EINVAL; +} + +static int rdmacg_parse_limits(char *options, + int *new_limits, unsigned long *enables) +{ + char *c; + int err = -EINVAL; + + /* parse resource options */ + while ((c = strsep(&options, " ")) != NULL) { + int index, intval; + + index = parse_resource(c, &intval); + if (index < 0) + goto err; + + new_limits[index] = intval; + *enables |= BIT(index); + } + return 0; + +err: + return err; +} + +static struct rdmacg_device *rdmacg_get_device_locked(const char *name) +{ + struct rdmacg_device *device; + + lockdep_assert_held(&rdmacg_mutex); + + list_for_each_entry(device, &rdmacg_devices, dev_node) + if (!strcmp(name, device->name)) + return device; + + return NULL; +} + +static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdma_cgroup *cg = css_rdmacg(of_css(of)); + const char *dev_name; + struct rdmacg_resource_pool *rpool; + struct rdmacg_device *device; + char *options = strstrip(buf); + int *new_limits; + unsigned long enables = 0; + int i = 0, ret = 0; + + /* extract the device name first */ + dev_name = strsep(&options, " "); + if (!dev_name) { + ret = -EINVAL; + goto err; + } + + new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL); + if (!new_limits) { + ret = -ENOMEM; + goto err; + } + + ret = rdmacg_parse_limits(options, new_limits, &enables); + if (ret) + goto parse_err; + + /* acquire lock to synchronize with hot plug devices */ + mutex_lock(&rdmacg_mutex); + + device = rdmacg_get_device_locked(dev_name); + if (!device) { + ret = -ENODEV; + goto dev_err; + } + + rpool = get_cg_rpool_locked(cg, device); + if (IS_ERR(rpool)) { + ret = PTR_ERR(rpool); + goto dev_err; + } + + /* now set the new limits of the rpool */ + for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX) + set_resource_limit(rpool, i, new_limits[i]); + + if (rpool->usage_sum == 0 && + rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { + /* + * No user of the rpool and all entries are set to max, so + * safe to delete this rpool. + */ + free_cg_rpool_locked(rpool); + } + +dev_err: + mutex_unlock(&rdmacg_mutex); + +parse_err: + kfree(new_limits); + +err: + return ret ?: nbytes; +} + +static void print_rpool_values(struct seq_file *sf, + struct rdmacg_resource_pool *rpool) +{ + enum rdmacg_file_type sf_type; + int i; + u32 value; + + sf_type = seq_cft(sf)->private; + + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { + seq_puts(sf, rdmacg_resource_names[i]); + seq_putc(sf, '='); + if (sf_type == RDMACG_RESOURCE_TYPE_MAX) { + if (rpool) + value = rpool->resources[i].max; + else + value = S32_MAX; + } else { + if (rpool) + value = rpool->resources[i].usage; + } + + if (value == S32_MAX) + seq_puts(sf, RDMACG_MAX_STR); + else + seq_printf(sf, "%d", value); + seq_putc(sf, ' '); + } +} + +static int rdmacg_resource_read(struct seq_file *sf, void *v) +{ + struct rdmacg_device *device; + struct rdmacg_resource_pool *rpool; + struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); + + mutex_lock(&rdmacg_mutex); + + list_for_each_entry(device, &rdmacg_devices, dev_node) { + seq_printf(sf, "%s ", device->name); + + rpool = find_cg_rpool_locked(cg, device); + print_rpool_values(sf, rpool); + + seq_putc(sf, '\n'); + } + + mutex_unlock(&rdmacg_mutex); + return 0; +} + +static struct cftype rdmacg_files[] = { + { + .name = "max", + .write = rdmacg_resource_set_max, + .seq_show = rdmacg_resource_read, + .private = RDMACG_RESOURCE_TYPE_MAX, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "current", + .seq_show = rdmacg_resource_read, + .private = RDMACG_RESOURCE_TYPE_STAT, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* terminate */ +}; + +static struct cgroup_subsys_state * +rdmacg_css_alloc(struct cgroup_subsys_state *parent) +{ + struct rdma_cgroup *cg; + + cg = kzalloc(sizeof(*cg), GFP_KERNEL); + if (!cg) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&cg->rpools); + return &cg->css; +} + +static void rdmacg_css_free(struct cgroup_subsys_state *css) +{ + struct rdma_cgroup *cg = css_rdmacg(css); + + kfree(cg); +} + +/** + * rdmacg_css_offline - cgroup css_offline callback + * @css: css of interest + * + * This function is called when @css is about to go away and responsible + * for shooting down all rdmacg associated with @css. As part of that it + * marks all the resource pool entries to max value, so that when resources are + * uncharged, associated resource pool can be freed as well. + */ +static void rdmacg_css_offline(struct cgroup_subsys_state *css) +{ + struct rdma_cgroup *cg = css_rdmacg(css); + struct rdmacg_resource_pool *rpool; + + mutex_lock(&rdmacg_mutex); + + list_for_each_entry(rpool, &cg->rpools, cg_node) + set_all_resource_max_limit(rpool); + + mutex_unlock(&rdmacg_mutex); +} + +struct cgroup_subsys rdma_cgrp_subsys = { + .css_alloc = rdmacg_css_alloc, + .css_free = rdmacg_css_free, + .css_offline = rdmacg_css_offline, + .legacy_cftypes = rdmacg_files, + .dfl_cftypes = rdmacg_files, +}; From 43579b5f2c79d747d8294bd233db41c954e2dc4a Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 10 Jan 2017 00:02:14 +0000 Subject: [PATCH 17/27] IB/core: added support to use rdma cgroup controller Added support APIs for IB core to register/unregister every IB/RDMA device with rdma cgroup for tracking rdma resources. IB core registers with rdma cgroup controller. Added support APIs for uverbs layer to make use of rdma controller. Added uverbs layer to perform resource charge/uncharge functionality. Added support during query_device uverb operation to ensure it returns resource limits by honoring rdma cgroup configured limits. Signed-off-by: Parav Pandit Signed-off-by: Tejun Heo --- drivers/infiniband/core/Makefile | 1 + drivers/infiniband/core/cgroup.c | 62 ++++++++++++++++ drivers/infiniband/core/core_priv.h | 30 ++++++++ drivers/infiniband/core/device.c | 10 +++ drivers/infiniband/core/uverbs_cmd.c | 102 ++++++++++++++++++++++++-- drivers/infiniband/core/uverbs_main.c | 20 +++++ include/rdma/ib_verbs.h | 14 ++++ 7 files changed, 233 insertions(+), 6 deletions(-) create mode 100644 drivers/infiniband/core/cgroup.c diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index edaae9f9853c..e426ac877d19 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -13,6 +13,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o +ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o ib_cm-y := cm.o diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c new file mode 100644 index 000000000000..126ac5f99db7 --- /dev/null +++ b/drivers/infiniband/core/cgroup.c @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2016 Parav Pandit + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include "core_priv.h" + +/** + * ib_device_register_rdmacg - register with rdma cgroup. + * @device: device to register to participate in resource + * accounting by rdma cgroup. + * + * Register with the rdma cgroup. Should be called before + * exposing rdma device to user space applications to avoid + * resource accounting leak. + * Returns 0 on success or otherwise failure code. + */ +int ib_device_register_rdmacg(struct ib_device *device) +{ + device->cg_device.name = device->name; + return rdmacg_register_device(&device->cg_device); +} + +/** + * ib_device_unregister_rdmacg - unregister with rdma cgroup. + * @device: device to unregister. + * + * Unregister with the rdma cgroup. Should be called after + * all the resources are deallocated, and after a stage when any + * other resource allocation by user application cannot be done + * for this device to avoid any leak in accounting. + */ +void ib_device_unregister_rdmacg(struct ib_device *device) +{ + rdmacg_unregister_device(&device->cg_device); +} + +int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ + return rdmacg_try_charge(&cg_obj->cg, &device->cg_device, + resource_index); +} +EXPORT_SYMBOL(ib_rdmacg_try_charge); + +void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ + rdmacg_uncharge(cg_obj->cg, &device->cg_device, + resource_index); +} +EXPORT_SYMBOL(ib_rdmacg_uncharge); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index d29372624f3a..389f6192bddc 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -35,6 +35,7 @@ #include #include +#include #include @@ -121,6 +122,35 @@ int ib_cache_setup_one(struct ib_device *device); void ib_cache_cleanup_one(struct ib_device *device); void ib_cache_release_one(struct ib_device *device); +#ifdef CONFIG_CGROUP_RDMA +int ib_device_register_rdmacg(struct ib_device *device); +void ib_device_unregister_rdmacg(struct ib_device *device); + +int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index); + +void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index); +#else +static inline int ib_device_register_rdmacg(struct ib_device *device) +{ return 0; } + +static inline void ib_device_unregister_rdmacg(struct ib_device *device) +{ } + +static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ return 0; } + +static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ } +#endif + static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, struct net_device *upper) { diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 571974cd3919..70065386acbc 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -360,10 +360,18 @@ int ib_register_device(struct ib_device *device, goto out; } + ret = ib_device_register_rdmacg(device); + if (ret) { + pr_warn("Couldn't register device with rdma cgroup\n"); + ib_cache_cleanup_one(device); + goto out; + } + memset(&device->attrs, 0, sizeof(device->attrs)); ret = device->query_device(device, &device->attrs, &uhw); if (ret) { pr_warn("Couldn't query the device attributes\n"); + ib_device_unregister_rdmacg(device); ib_cache_cleanup_one(device); goto out; } @@ -372,6 +380,7 @@ int ib_register_device(struct ib_device *device, if (ret) { pr_warn("Couldn't register device %s with driver model\n", device->name); + ib_device_unregister_rdmacg(device); ib_cache_cleanup_one(device); goto out; } @@ -421,6 +430,7 @@ void ib_unregister_device(struct ib_device *device) mutex_unlock(&device_mutex); + ib_device_unregister_rdmacg(device); ib_device_unregister_sysfs(device); ib_cache_cleanup_one(device); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 700782203483..33bc88a38574 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -316,6 +316,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, struct ib_udata udata; struct ib_ucontext *ucontext; struct file *filp; + struct ib_rdmacg_object cg_obj; int ret; if (out_len < sizeof resp) @@ -335,13 +336,18 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); + ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); + if (ret) + goto err; + ucontext = ib_dev->alloc_ucontext(ib_dev, &udata); if (IS_ERR(ucontext)) { ret = PTR_ERR(ucontext); - goto err; + goto err_alloc; } ucontext->device = ib_dev; + ucontext->cg_obj = cg_obj; INIT_LIST_HEAD(&ucontext->pd_list); INIT_LIST_HEAD(&ucontext->mr_list); INIT_LIST_HEAD(&ucontext->mw_list); @@ -407,6 +413,9 @@ err_free: put_pid(ucontext->tgid); ib_dev->dealloc_ucontext(ucontext); +err_alloc: + ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); + err: mutex_unlock(&file->mutex); return ret; @@ -561,6 +570,13 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, return -ENOMEM; init_uobj(uobj, 0, file->ucontext, &pd_lock_class); + ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev, + RDMACG_RESOURCE_HCA_OBJECT); + if (ret) { + kfree(uobj); + return ret; + } + down_write(&uobj->mutex); pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata); @@ -605,6 +621,7 @@ err_idr: ib_dealloc_pd(pd); err: + ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); put_uobj_write(uobj); return ret; } @@ -637,6 +654,8 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, if (ret) goto err_put; + ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); + uobj->live = 0; put_uobj_write(uobj); @@ -1006,6 +1025,10 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, goto err_put; } } + ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev, + RDMACG_RESOURCE_HCA_OBJECT); + if (ret) + goto err_charge; mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, cmd.access_flags, &udata); @@ -1054,6 +1077,9 @@ err_unreg: ib_dereg_mr(mr); err_put: + ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); + +err_charge: put_pd_read(pd); err_free: @@ -1178,6 +1204,8 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, if (ret) return ret; + ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); + idr_remove_uobj(&ib_uverbs_mr_idr, uobj); mutex_lock(&file->mutex); @@ -1226,6 +1254,11 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), out_len - sizeof(resp)); + ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev, + RDMACG_RESOURCE_HCA_OBJECT); + if (ret) + goto err_charge; + mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata); if (IS_ERR(mw)) { ret = PTR_ERR(mw); @@ -1271,6 +1304,9 @@ err_unalloc: uverbs_dealloc_mw(mw); err_put: + ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); + +err_charge: put_pd_read(pd); err_free: @@ -1306,6 +1342,8 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, if (ret) return ret; + ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); + idr_remove_uobj(&ib_uverbs_mw_idr, uobj); mutex_lock(&file->mutex); @@ -1405,6 +1443,11 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags)) attr.flags = cmd->flags; + ret = ib_rdmacg_try_charge(&obj->uobject.cg_obj, ib_dev, + RDMACG_RESOURCE_HCA_OBJECT); + if (ret) + goto err_charge; + cq = ib_dev->create_cq(ib_dev, &attr, file->ucontext, uhw); if (IS_ERR(cq)) { @@ -1452,6 +1495,10 @@ err_free: ib_destroy_cq(cq); err_file: + ib_rdmacg_uncharge(&obj->uobject.cg_obj, ib_dev, + RDMACG_RESOURCE_HCA_OBJECT); + +err_charge: if (ev_file) ib_uverbs_release_ucq(file, ev_file, obj); @@ -1732,6 +1779,8 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, if (ret) return ret; + ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); + idr_remove_uobj(&ib_uverbs_cq_idr, uobj); mutex_lock(&file->mutex); @@ -1904,6 +1953,11 @@ static int create_qp(struct ib_uverbs_file *file, goto err_put; } + ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, device, + RDMACG_RESOURCE_HCA_OBJECT); + if (ret) + goto err_put; + if (cmd->qp_type == IB_QPT_XRC_TGT) qp = ib_create_qp(pd, &attr); else @@ -1911,7 +1965,7 @@ static int create_qp(struct ib_uverbs_file *file, if (IS_ERR(qp)) { ret = PTR_ERR(qp); - goto err_put; + goto err_create; } if (cmd->qp_type != IB_QPT_XRC_TGT) { @@ -1992,6 +2046,10 @@ err_cb: err_destroy: ib_destroy_qp(qp); +err_create: + ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, device, + RDMACG_RESOURCE_HCA_OBJECT); + err_put: if (xrcd) put_xrcd_read(xrcd_uobj); @@ -2518,6 +2576,8 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, if (ret) return ret; + ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); + if (obj->uxrcd) atomic_dec(&obj->uxrcd->refcnt); @@ -2969,11 +3029,16 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, memset(&attr.dmac, 0, sizeof(attr.dmac)); memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16); + ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev, + RDMACG_RESOURCE_HCA_OBJECT); + if (ret) + goto err_charge; + ah = pd->device->create_ah(pd, &attr, &udata); if (IS_ERR(ah)) { ret = PTR_ERR(ah); - goto err_put; + goto err_create; } ah->device = pd->device; @@ -3012,7 +3077,10 @@ err_copy: err_destroy: ib_destroy_ah(ah); -err_put: +err_create: + ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); + +err_charge: put_pd_read(pd); err: @@ -3046,6 +3114,8 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, if (ret) return ret; + ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); + idr_remove_uobj(&ib_uverbs_ah_idr, uobj); mutex_lock(&file->mutex); @@ -3822,10 +3892,16 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, err = -EINVAL; goto err_free; } + + err = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev, + RDMACG_RESOURCE_HCA_OBJECT); + if (err) + goto err_free; + flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); if (IS_ERR(flow_id)) { err = PTR_ERR(flow_id); - goto err_free; + goto err_create; } flow_id->uobject = uobj; uobj->object = flow_id; @@ -3858,6 +3934,8 @@ err_copy: idr_remove_uobj(&ib_uverbs_rule_idr, uobj); destroy_flow: ib_destroy_flow(flow_id); +err_create: + ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); err_free: kfree(flow_attr); err_put: @@ -3897,8 +3975,11 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, flow_id = uobj->object; ret = ib_destroy_flow(flow_id); - if (!ret) + if (!ret) { + ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, + RDMACG_RESOURCE_HCA_OBJECT); uobj->live = 0; + } put_uobj_write(uobj); @@ -3966,6 +4047,11 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); + ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, ib_dev, + RDMACG_RESOURCE_HCA_OBJECT); + if (ret) + goto err_put_cq; + srq = pd->device->create_srq(pd, &attr, udata); if (IS_ERR(srq)) { ret = PTR_ERR(srq); @@ -4030,6 +4116,8 @@ err_destroy: ib_destroy_srq(srq); err_put: + ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, ib_dev, + RDMACG_RESOURCE_HCA_OBJECT); put_pd_read(pd); err_put_cq: @@ -4216,6 +4304,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, if (ret) return ret; + ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); + if (srq_type == IB_SRQT_XRC) { us = container_of(obj, struct ib_usrq_object, uevent); atomic_dec(&us->uxrcd->refcnt); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index b3f95d453fba..cdbd26d6574b 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -51,6 +51,7 @@ #include #include "uverbs.h" +#include "core_priv.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand userspace verbs access"); @@ -237,6 +238,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, idr_remove_uobj(&ib_uverbs_ah_idr, uobj); ib_destroy_ah(ah); + ib_rdmacg_uncharge(&uobj->cg_obj, context->device, + RDMACG_RESOURCE_HCA_OBJECT); kfree(uobj); } @@ -246,6 +249,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, idr_remove_uobj(&ib_uverbs_mw_idr, uobj); uverbs_dealloc_mw(mw); + ib_rdmacg_uncharge(&uobj->cg_obj, context->device, + RDMACG_RESOURCE_HCA_OBJECT); kfree(uobj); } @@ -254,6 +259,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, idr_remove_uobj(&ib_uverbs_rule_idr, uobj); ib_destroy_flow(flow_id); + ib_rdmacg_uncharge(&uobj->cg_obj, context->device, + RDMACG_RESOURCE_HCA_OBJECT); kfree(uobj); } @@ -266,6 +273,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, if (qp == qp->real_qp) ib_uverbs_detach_umcast(qp, uqp); ib_destroy_qp(qp); + ib_rdmacg_uncharge(&uobj->cg_obj, context->device, + RDMACG_RESOURCE_HCA_OBJECT); ib_uverbs_release_uevent(file, &uqp->uevent); kfree(uqp); } @@ -298,6 +307,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, idr_remove_uobj(&ib_uverbs_srq_idr, uobj); ib_destroy_srq(srq); + ib_rdmacg_uncharge(&uobj->cg_obj, context->device, + RDMACG_RESOURCE_HCA_OBJECT); ib_uverbs_release_uevent(file, uevent); kfree(uevent); } @@ -310,6 +321,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, idr_remove_uobj(&ib_uverbs_cq_idr, uobj); ib_destroy_cq(cq); + ib_rdmacg_uncharge(&uobj->cg_obj, context->device, + RDMACG_RESOURCE_HCA_OBJECT); ib_uverbs_release_ucq(file, ev_file, ucq); kfree(ucq); } @@ -319,6 +332,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, idr_remove_uobj(&ib_uverbs_mr_idr, uobj); ib_dereg_mr(mr); + ib_rdmacg_uncharge(&uobj->cg_obj, context->device, + RDMACG_RESOURCE_HCA_OBJECT); kfree(uobj); } @@ -339,11 +354,16 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, idr_remove_uobj(&ib_uverbs_pd_idr, uobj); ib_dealloc_pd(pd); + ib_rdmacg_uncharge(&uobj->cg_obj, context->device, + RDMACG_RESOURCE_HCA_OBJECT); kfree(uobj); } put_pid(context->tgid); + ib_rdmacg_uncharge(&context->cg_obj, context->device, + RDMACG_RESOURCE_HCA_HANDLE); + return context->device->dealloc_ucontext(context); } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 958a24d8fae7..63896a477896 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -60,6 +60,7 @@ #include #include #include +#include extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; @@ -1331,6 +1332,12 @@ struct ib_fmr_attr { struct ib_umem; +struct ib_rdmacg_object { +#ifdef CONFIG_CGROUP_RDMA + struct rdma_cgroup *cg; /* owner rdma cgroup */ +#endif +}; + struct ib_ucontext { struct ib_device *device; struct list_head pd_list; @@ -1363,6 +1370,8 @@ struct ib_ucontext { struct list_head no_private_counters; int odp_mrs_count; #endif + + struct ib_rdmacg_object cg_obj; }; struct ib_uobject { @@ -1370,6 +1379,7 @@ struct ib_uobject { struct ib_ucontext *context; /* associated user context */ void *object; /* containing object */ struct list_head list; /* link to context's list */ + struct ib_rdmacg_object cg_obj; /* rdmacg object */ int id; /* index into kernel idr */ struct kref ref; struct rw_semaphore mutex; /* protects .live */ @@ -2118,6 +2128,10 @@ struct ib_device { struct attribute_group *hw_stats_ag; struct rdma_hw_stats *hw_stats; +#ifdef CONFIG_CGROUP_RDMA + struct rdmacg_device cg_device; +#endif + /** * The following mandatory functions are used only at device * registration. Keep functions such as these at the end of this From 9c1e67f941019907034d7e5584c891603cce2d8e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 10 Jan 2017 00:02:15 +0000 Subject: [PATCH 18/27] rdmacg: Added documentation for rdmacg Added documentation for v1 and v2 version describing high level design and usage examples on using rdma controller. Signed-off-by: Parav Pandit Signed-off-by: Tejun Heo --- Documentation/cgroup-v1/rdma.txt | 109 +++++++++++++++++++++++++++++++ Documentation/cgroup-v2.txt | 38 +++++++++++ 2 files changed, 147 insertions(+) create mode 100644 Documentation/cgroup-v1/rdma.txt diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt new file mode 100644 index 000000000000..af618171e0eb --- /dev/null +++ b/Documentation/cgroup-v1/rdma.txt @@ -0,0 +1,109 @@ + RDMA Controller + ---------------- + +Contents +-------- + +1. Overview + 1-1. What is RDMA controller? + 1-2. Why RDMA controller needed? + 1-3. How is RDMA controller implemented? +2. Usage Examples + +1. Overview + +1-1. What is RDMA controller? +----------------------------- + +RDMA controller allows user to limit RDMA/IB specific resources that a given +set of processes can use. These processes are grouped using RDMA controller. + +RDMA controller defines two resources which can be limited for processes of a +cgroup. + +1-2. Why RDMA controller needed? +-------------------------------- + +Currently user space applications can easily take away all the rdma verb +specific resources such as AH, CQ, QP, MR etc. Due to which other applications +in other cgroup or kernel space ULPs may not even get chance to allocate any +rdma resources. This can leads to service unavailability. + +Therefore RDMA controller is needed through which resource consumption +of processes can be limited. Through this controller different rdma +resources can be accounted. + +1-3. How is RDMA controller implemented? +---------------------------------------- + +RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains +resource accounting per cgroup, per device using resource pool structure. +Each such resource pool is limited up to 64 resources in given resource pool +by rdma cgroup, which can be extended later if required. + +This resource pool object is linked to the cgroup css. Typically there +are 0 to 4 resource pool instances per cgroup, per device in most use cases. +But nothing limits to have it more. At present hundreds of RDMA devices per +single cgroup may not be handled optimally, however there is no +known use case or requirement for such configuration either. + +Since RDMA resources can be allocated from any process and can be freed by any +of the child processes which shares the address space, rdma resources are +always owned by the creator cgroup css. This allows process migration from one +to other cgroup without major complexity of transferring resource ownership; +because such ownership is not really present due to shared nature of +rdma resources. Linking resources around css also ensures that cgroups can be +deleted after processes migrated. This allow progress migration as well with +active resources, even though that is not a primary use case. + +Whenever RDMA resource charging occurs, owner rdma cgroup is returned to +the caller. Same rdma cgroup should be passed while uncharging the resource. +This also allows process migrated with active RDMA resource to charge +to new owner cgroup for new resource. It also allows to uncharge resource of +a process from previously charged cgroup which is migrated to new cgroup, +even though that is not a primary use case. + +Resource pool object is created in following situations. +(a) User sets the limit and no previous resource pool exist for the device +of interest for the cgroup. +(b) No resource limits were configured, but IB/RDMA stack tries to +charge the resource. So that it correctly uncharge them when applications are +running without limits and later on when limits are enforced during uncharging, +otherwise usage count will drop to negative. + +Resource pool is destroyed if all the resource limits are set to max and +it is the last resource getting deallocated. + +User should set all the limit to max value if it intents to remove/unconfigure +the resource pool for a particular device. + +IB stack honors limits enforced by the rdma controller. When application +query about maximum resource limits of IB device, it returns minimum of +what is configured by user for a given cgroup and what is supported by +IB device. + +Following resources can be accounted by rdma controller. + hca_handle Maximum number of HCA Handles + hca_object Maximum number of HCA Objects + +2. Usage Examples +----------------- + +(a) Configure resource limit: +echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max +echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max + +(b) Query resource limit: +cat /sys/fs/cgroup/rdma/2/rdma.max +#Output: +mlx4_0 hca_handle=2 hca_object=2000 +ocrdma1 hca_handle=3 hca_object=max + +(c) Query current usage: +cat /sys/fs/cgroup/rdma/2/rdma.current +#Output: +mlx4_0 hca_handle=1 hca_object=20 +ocrdma1 hca_handle=1 hca_object=23 + +(d) Delete resource limit: +echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 4cc07ce3b8dd..94350d79e169 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -47,6 +47,8 @@ CONTENTS 5-3. IO 5-3-1. IO Interface Files 5-3-2. Writeback + 5-4. RDMA + 5-4-1. RDMA Interface Files 6. Namespace 6-1. Basics 6-2. The Root and Views @@ -1119,6 +1121,42 @@ writeback as follows. vm.dirty[_background]_ratio. +5-4. RDMA + +The "rdma" controller regulates the distribution and accounting of +of RDMA resources. + +5-4-1. RDMA Interface Files + + rdma.max + A readwrite nested-keyed file that exists for all the cgroups + except root that describes current configured resource limit + for a RDMA/IB device. + + Lines are keyed by device name and are not ordered. + Each line contains space separated resource name and its configured + limit that can be distributed. + + The following nested keys are defined. + + hca_handle Maximum number of HCA Handles + hca_object Maximum number of HCA Objects + + An example for mlx4 and ocrdma device follows. + + mlx4_0 hca_handle=2 hca_object=2000 + ocrdma1 hca_handle=3 hca_object=max + + rdma.current + A read-only file that describes current resource usage. + It exists for all the cgroup except root. + + An example for mlx4 and ocrdma device follows. + + mlx4_0 hca_handle=1 hca_object=20 + ocrdma1 hca_handle=1 hca_object=23 + + 6. Namespace 6-1. Basics From 20c56e595cd781b5305562c9bf8289d8d5e47c63 Mon Sep 17 00:00:00 2001 From: Hans Ragas Date: Tue, 10 Jan 2017 17:42:34 +0000 Subject: [PATCH 19/27] cgroup: Add missing cgroup-v2 PID controller documentation. Signed-off-by: Hans Ragas Signed-off-by: Tejun Heo --- Documentation/cgroup-v2.txt | 41 +++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 4cc07ce3b8dd..f6f54107fb4d 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -47,6 +47,8 @@ CONTENTS 5-3. IO 5-3-1. IO Interface Files 5-3-2. Writeback + 5-4. PID + 5-4-1. PID Interface Files 6. Namespace 6-1. Basics 6-2. The Root and Views @@ -1119,6 +1121,45 @@ writeback as follows. vm.dirty[_background]_ratio. +5-4. PID + +The process number controller is used to allow a cgroup to stop any +new tasks from being fork()'d or clone()'d after a specified limit is +reached. + +The number of tasks in a cgroup can be exhausted in ways which other +controllers cannot prevent, thus warranting its own controller. For +example, a fork bomb is likely to exhaust the number of tasks before +hitting memory restrictions. + +Note that PIDs used in this controller refer to TIDs, process IDs as +used by the kernel. + + +5-4-1. PID Interface Files + + pids.max + + A read-write single value file which exists on non-root cgroups. The + default is "max". + + Hard limit of number of processes. + + pids.current + + A read-only single value file which exists on all cgroups. + + The number of processes currently in the cgroup and its descendants. + +Organisational operations are not blocked by cgroup policies, so it is +possible to have pids.current > pids.max. This can be done by either +setting the limit to be smaller than pids.current, or attaching enough +processes to the cgroup such that pids.current is larger than +pids.max. However, it is not possible to violate a cgroup PID policy +through fork() or clone(). These will return -EAGAIN if the creation +of a new process would cause a cgroup policy to be violated. + + 6. Namespace 6-1. Basics From 7896dfb0a6180beb51bf34b32c9427984cb052c4 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 10 Jan 2017 17:51:48 +0000 Subject: [PATCH 20/27] rdmacg: Fixed uninitialized current resource usage Fixed warning reported by kbuild test robot. When reading current resource usage value, when no resources are allocated, its possible that it can report a uninitialized value for current resource usage. This fix avoids it by initializing it to zero as no resource is allocated. Signed-off-by: Parav Pandit Signed-off-by: Tejun Heo --- kernel/cgroup/rdma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 021bee7a9692..defad3c5e7dc 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -518,6 +518,8 @@ static void print_rpool_values(struct seq_file *sf, } else { if (rpool) value = rpool->resources[i].usage; + else + value = 0; } if (value == S32_MAX) From d8ebf5191d7fdf81ba34a7c3d726b99c34918030 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 15 Jan 2017 19:03:40 -0500 Subject: [PATCH 21/27] cgroup: cosmetic update to cgroup_taskset_add() cgroup_taskset_add() was using list_add_tail() when for source csets but list_move_tail() for destination. As the operations are gated by list_empty() test, list_move_tail() is equivalent to list_add_tail() here. Use list_add_tail() too for destination csets too. This doesn't cause any functional changes. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d9d82e96d67f..aed492e907c1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1980,8 +1980,8 @@ static void cgroup_taskset_add(struct task_struct *task, if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, &tset->src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) - list_move_tail(&cset->mg_dst_cset->mg_node, - &tset->dst_csets); + list_add_tail(&cset->mg_dst_cset->mg_node, + &tset->dst_csets); } /** From e595cd706982bff0211e6fafe5a108421e747fbc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 15 Jan 2017 19:03:41 -0500 Subject: [PATCH 22/27] cgroup: track migration context in cgroup_mgctx cgroup migration is performed in four steps - css_set preloading, addition of target tasks, actual migration, and clean up. A list named preloaded_csets is used to track the preloading. This is a bit too restricted and the code is already depending on the subtlety that all source css_sets appear before destination ones. Let's create struct cgroup_mgctx which keeps track of everything during migration. Currently, it has separate preload lists for source and destination csets and also embeds cgroup_taskset which is used during the actual migration. This moves struct cgroup_taskset definition to cgroup-internal.h. This patch doesn't cause any functional changes. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup/cgroup-internal.h | 66 +++++++++++++-- kernel/cgroup/cgroup-v1.c | 10 +-- kernel/cgroup/cgroup.c | 144 +++++++++++++------------------- 3 files changed, 122 insertions(+), 98 deletions(-) diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 589b0e7013ec..5f8c8ac5ab88 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -26,6 +26,61 @@ struct cgrp_cset_link { struct list_head cgrp_link; }; +/* used to track tasks and csets during migration */ +struct cgroup_taskset { + /* the src and dst cset list running through cset->mg_node */ + struct list_head src_csets; + struct list_head dst_csets; + + /* the subsys currently being processed */ + int ssid; + + /* + * Fields for cgroup_taskset_*() iteration. + * + * Before migration is committed, the target migration tasks are on + * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of + * the csets on ->dst_csets. ->csets point to either ->src_csets + * or ->dst_csets depending on whether migration is committed. + * + * ->cur_csets and ->cur_task point to the current task position + * during iteration. + */ + struct list_head *csets; + struct css_set *cur_cset; + struct task_struct *cur_task; +}; + +/* migration context also tracks preloading */ +struct cgroup_mgctx { + /* + * Preloaded source and destination csets. Used to guarantee + * atomic success or failure on actual migration. + */ + struct list_head preloaded_src_csets; + struct list_head preloaded_dst_csets; + + /* tasks and csets to migrate */ + struct cgroup_taskset tset; +}; + +#define CGROUP_TASKSET_INIT(tset) \ +{ \ + .src_csets = LIST_HEAD_INIT(tset.src_csets), \ + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ + .csets = &tset.src_csets, \ +} + +#define CGROUP_MGCTX_INIT(name) \ +{ \ + LIST_HEAD_INIT(name.preloaded_src_csets), \ + LIST_HEAD_INIT(name.preloaded_dst_csets), \ + CGROUP_TASKSET_INIT(name.tset), \ +} + +#define DEFINE_CGROUP_MGCTX(name) \ + struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) + struct cgroup_sb_opts { u16 subsys_mask; unsigned int flags; @@ -112,13 +167,12 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct cgroup_namespace *ns); bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); -void cgroup_migrate_finish(struct list_head *preloaded_csets); -void cgroup_migrate_add_src(struct css_set *src_cset, - struct cgroup *dst_cgrp, - struct list_head *preloaded_csets); -int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets); +void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); +void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, + struct cgroup_mgctx *mgctx); +int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx); int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_root *root); + struct cgroup_mgctx *mgctx, struct cgroup_root *root); int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 465b10111eaf..7a965f460faf 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -87,7 +87,7 @@ EXPORT_SYMBOL_GPL(cgroup_attach_task_all); */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - LIST_HEAD(preloaded_csets); + DEFINE_CGROUP_MGCTX(mgctx); struct cgrp_cset_link *link; struct css_task_iter it; struct task_struct *task; @@ -106,10 +106,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, to, &preloaded_csets); + cgroup_migrate_add_src(link->cset, to, &mgctx); spin_unlock_irq(&css_set_lock); - ret = cgroup_migrate_prepare_dst(&preloaded_csets); + ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_err; @@ -125,14 +125,14 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(task, false, to->root); + ret = cgroup_migrate(task, false, &mgctx, to->root); if (!ret) trace_cgroup_transfer_tasks(to, task, false); put_task_struct(task); } } while (task && !ret); out_err: - cgroup_migrate_finish(&preloaded_csets); + cgroup_migrate_finish(&mgctx); percpu_up_write(&cgroup_threadgroup_rwsem); mutex_unlock(&cgroup_mutex); return ret; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index aed492e907c1..f90554d24e59 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1916,49 +1916,18 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) } EXPORT_SYMBOL_GPL(task_cgroup_path); -/* used to track tasks and other necessary states during migration */ -struct cgroup_taskset { - /* the src and dst cset list running through cset->mg_node */ - struct list_head src_csets; - struct list_head dst_csets; - - /* the subsys currently being processed */ - int ssid; - - /* - * Fields for cgroup_taskset_*() iteration. - * - * Before migration is committed, the target migration tasks are on - * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of - * the csets on ->dst_csets. ->csets point to either ->src_csets - * or ->dst_csets depending on whether migration is committed. - * - * ->cur_csets and ->cur_task point to the current task position - * during iteration. - */ - struct list_head *csets; - struct css_set *cur_cset; - struct task_struct *cur_task; -}; - -#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \ - .src_csets = LIST_HEAD_INIT(tset.src_csets), \ - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ - .csets = &tset.src_csets, \ -} - /** - * cgroup_taskset_add - try to add a migration target task to a taskset + * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task - * @tset: target taskset + * @mgctx: target migration context * - * Add @task, which is a migration target, to @tset. This function becomes - * noop if @task doesn't need to be migrated. @task's css_set should have - * been added as a migration source and @task->cg_list will be moved from - * the css_set's tasks list to mg_tasks one. + * Add @task, which is a migration target, to @mgctx->tset. This function + * becomes noop if @task doesn't need to be migrated. @task's css_set + * should have been added as a migration source and @task->cg_list will be + * moved from the css_set's tasks list to mg_tasks one. */ -static void cgroup_taskset_add(struct task_struct *task, - struct cgroup_taskset *tset) +static void cgroup_migrate_add_task(struct task_struct *task, + struct cgroup_mgctx *mgctx) { struct css_set *cset; @@ -1978,10 +1947,11 @@ static void cgroup_taskset_add(struct task_struct *task, list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) - list_add_tail(&cset->mg_node, &tset->src_csets); + list_add_tail(&cset->mg_node, + &mgctx->tset.src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) list_add_tail(&cset->mg_dst_cset->mg_node, - &tset->dst_csets); + &mgctx->tset.dst_csets); } /** @@ -2048,17 +2018,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, /** * cgroup_taskset_migrate - migrate a taskset - * @tset: taget taskset + * @mgctx: migration context * @root: cgroup root the migration is taking place on * - * Migrate tasks in @tset as setup by migration preparation functions. + * Migrate tasks in @mgctx as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and - * guarantees that either all or none of the tasks in @tset are migrated. - * @tset is consumed regardless of success. + * guarantees that either all or none of the tasks in @mgctx are migrated. + * @mgctx is consumed regardless of success. */ -static int cgroup_taskset_migrate(struct cgroup_taskset *tset, +static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx, struct cgroup_root *root) { + struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; @@ -2151,25 +2122,31 @@ bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) /** * cgroup_migrate_finish - cleanup after attach - * @preloaded_csets: list of preloaded css_sets + * @mgctx: migration context * * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. */ -void cgroup_migrate_finish(struct list_head *preloaded_csets) +void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { + LIST_HEAD(preloaded); struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { + + list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded); + list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded); + + list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); } + spin_unlock_irq(&css_set_lock); } @@ -2177,10 +2154,10 @@ void cgroup_migrate_finish(struct list_head *preloaded_csets) * cgroup_migrate_add_src - add a migration source css_set * @src_cset: the source css_set to add * @dst_cgrp: the destination cgroup - * @preloaded_csets: list of preloaded css_sets + * @mgctx: migration context * * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin - * @src_cset and add it to @preloaded_csets, which should later be cleaned + * @src_cset and add it to @mgctx->src_csets, which should later be cleaned * up by cgroup_migrate_finish(). * * This function may be called without holding cgroup_threadgroup_rwsem @@ -2191,7 +2168,7 @@ void cgroup_migrate_finish(struct list_head *preloaded_csets) */ void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, - struct list_head *preloaded_csets) + struct cgroup_mgctx *mgctx) { struct cgroup *src_cgrp; @@ -2219,32 +2196,32 @@ void cgroup_migrate_add_src(struct css_set *src_cset, src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); - list_add(&src_cset->mg_preload_node, preloaded_csets); + list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @preloaded_csets: list of preloaded source css_sets + * @mgctx: migration context * * Tasks are about to be moved and all the source css_sets have been - * preloaded to @preloaded_csets. This function looks up and pins all - * destination css_sets, links each to its source, and append them to - * @preloaded_csets. + * preloaded to @mgctx->preloaded_src_csets. This function looks up and + * pins all destination css_sets, links each to its source, and append them + * to @mgctx->preloaded_dst_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on - * @preloaded_csets. + * @mgctx. */ -int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) +int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) { - LIST_HEAD(csets); struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ - list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { + list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, + mg_preload_node) { struct css_set *dst_cset; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); @@ -2270,15 +2247,15 @@ int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_preload_node)) - list_add(&dst_cset->mg_preload_node, &csets); + list_add_tail(&dst_cset->mg_preload_node, + &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); } - list_splice_tail(&csets, preloaded_csets); return 0; err: - cgroup_migrate_finish(&csets); + cgroup_migrate_finish(mgctx); return -ENOMEM; } @@ -2287,6 +2264,7 @@ err: * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task * @root: cgroup root migration is taking place on + * @mgctx: migration context * * Migrate a process or task denoted by @leader. If migrating a process, * the caller must be holding cgroup_threadgroup_rwsem. The caller is also @@ -2301,9 +2279,8 @@ err: * actually starting migrating. */ int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_root *root) + struct cgroup_mgctx *mgctx, struct cgroup_root *root) { - struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct task_struct *task; /* @@ -2315,14 +2292,14 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_lock(); task = leader; do { - cgroup_taskset_add(task, &tset); + cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); spin_unlock_irq(&css_set_lock); - return cgroup_taskset_migrate(&tset, root); + return cgroup_migrate_execute(mgctx, root); } /** @@ -2336,7 +2313,7 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) { - LIST_HEAD(preloaded_csets); + DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; int ret; @@ -2348,8 +2325,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, rcu_read_lock(); task = leader; do { - cgroup_migrate_add_src(task_css_set(task), dst_cgrp, - &preloaded_csets); + cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); @@ -2357,11 +2333,11 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ - ret = cgroup_migrate_prepare_dst(&preloaded_csets); + ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret) - ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); + ret = cgroup_migrate(leader, threadgroup, &mgctx, dst_cgrp->root); - cgroup_migrate_finish(&preloaded_csets); + cgroup_migrate_finish(&mgctx); if (!ret) trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); @@ -2528,8 +2504,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { - LIST_HEAD(preloaded_csets); - struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); + DEFINE_CGROUP_MGCTX(mgctx); struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; @@ -2545,33 +2520,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgrp_cset_link *link; list_for_each_entry(link, &dsct->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, dsct, - &preloaded_csets); + cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); /* NULL dst indicates self on default hierarchy */ - ret = cgroup_migrate_prepare_dst(&preloaded_csets); + ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); - list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { + list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) { struct task_struct *task, *ntask; - /* src_csets precede dst_csets, break on the first dst_cset */ - if (!src_cset->mg_src_cgrp) - break; - /* all tasks in src_csets need to be migrated */ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) - cgroup_taskset_add(task, &tset); + cgroup_migrate_add_task(task, &mgctx); } spin_unlock_irq(&css_set_lock); - ret = cgroup_taskset_migrate(&tset, cgrp->root); + ret = cgroup_migrate_execute(&mgctx, cgrp->root); out_finish: - cgroup_migrate_finish(&preloaded_csets); + cgroup_migrate_finish(&mgctx); percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } From bfc2cf6f61fceac42235345081eb713329baa2a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 15 Jan 2017 19:03:41 -0500 Subject: [PATCH 23/27] cgroup: call subsys->*attach() only for subsystems which are actually affected by migration Currently, subsys->*attach() callbacks are called for all subsystems which are attached to the hierarchy on which the migration is taking place. With cgroup_migrate_prepare_dst() filtering out identity migrations, v1 hierarchies can avoid spurious ->*attach() callback invocations where the source and destination csses are identical; however, this isn't enough on v2 as only a subset of the attached controllers can be affected on controller enable/disable. While spurious ->*attach() invocations aren't critically broken, they're unnecessary overhead and can lead to temporary overcharges on certain controllers. Fix it by tracking which subsystems are affected by a migration and invoking ->*attach() callbacks only on those subsystems. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup/cgroup-internal.h | 5 ++++- kernel/cgroup/cgroup-v1.c | 2 +- kernel/cgroup/cgroup.c | 25 ++++++++++++++----------- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 5f8c8ac5ab88..9203bfb05603 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -62,6 +62,9 @@ struct cgroup_mgctx { /* tasks and csets to migrate */ struct cgroup_taskset tset; + + /* subsystems affected by migration */ + u16 ss_mask; }; #define CGROUP_TASKSET_INIT(tset) \ @@ -172,7 +175,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx); int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx); int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_mgctx *mgctx, struct cgroup_root *root); + struct cgroup_mgctx *mgctx); int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 7a965f460faf..fc34bcf2329f 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -125,7 +125,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(task, false, &mgctx, to->root); + ret = cgroup_migrate(task, false, &mgctx); if (!ret) trace_cgroup_transfer_tasks(to, task, false); put_task_struct(task); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f90554d24e59..69ad5b3de0c1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2019,15 +2019,13 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, /** * cgroup_taskset_migrate - migrate a taskset * @mgctx: migration context - * @root: cgroup root the migration is taking place on * * Migrate tasks in @mgctx as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and * guarantees that either all or none of the tasks in @mgctx are migrated. * @mgctx is consumed regardless of success. */ -static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx, - struct cgroup_root *root) +static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) { struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; @@ -2040,7 +2038,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx, return 0; /* check that we can legitimately attach to the cgroup */ - do_each_subsys_mask(ss, ssid, root->subsys_mask) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->can_attach) { tset->ssid = ssid; ret = ss->can_attach(tset); @@ -2076,7 +2074,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx, */ tset->csets = &tset->dst_csets; - do_each_subsys_mask(ss, ssid, root->subsys_mask) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->attach) { tset->ssid = ssid; ss->attach(tset); @@ -2087,7 +2085,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx, goto out_release_tset; out_cancel_attach: - do_each_subsys_mask(ss, ssid, root->subsys_mask) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ssid == failed_ssid) break; if (ss->cancel_attach) { @@ -2223,6 +2221,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, mg_preload_node) { struct css_set *dst_cset; + struct cgroup_subsys *ss; + int ssid; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) @@ -2251,6 +2251,10 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); + + for_each_subsys(ss, ssid) + if (src_cset->subsys[ssid] != dst_cset->subsys[ssid]) + mgctx->ss_mask |= 1 << ssid; } return 0; @@ -2263,7 +2267,6 @@ err: * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task - * @root: cgroup root migration is taking place on * @mgctx: migration context * * Migrate a process or task denoted by @leader. If migrating a process, @@ -2279,7 +2282,7 @@ err: * actually starting migrating. */ int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_mgctx *mgctx, struct cgroup_root *root) + struct cgroup_mgctx *mgctx) { struct task_struct *task; @@ -2299,7 +2302,7 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_unlock(); spin_unlock_irq(&css_set_lock); - return cgroup_migrate_execute(mgctx, root); + return cgroup_migrate_execute(mgctx); } /** @@ -2335,7 +2338,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret) - ret = cgroup_migrate(leader, threadgroup, &mgctx, dst_cgrp->root); + ret = cgroup_migrate(leader, threadgroup, &mgctx); cgroup_migrate_finish(&mgctx); @@ -2539,7 +2542,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) } spin_unlock_irq(&css_set_lock); - ret = cgroup_migrate_execute(&mgctx, cgrp->root); + ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); percpu_up_write(&cgroup_threadgroup_rwsem); From b807421a720ff00bc31774d254f51313f4a7656b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Jan 2017 12:06:08 -0500 Subject: [PATCH 24/27] cgroup: misc cleanups * cgrp_dfl_implicit_ss_mask is ulong instead of u16 unlike other ss_masks. Make it a u16. * Move have_canfork_callback together with other callback ss_masks. Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0dcc4c7e935e..a99b15f9b577 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -159,7 +159,7 @@ static bool cgrp_dfl_visible; static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ -static unsigned long cgrp_dfl_implicit_ss_mask; +static u16 cgrp_dfl_implicit_ss_mask; /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); @@ -178,13 +178,13 @@ static DEFINE_IDR(cgroup_hierarchy_idr); static u64 css_serial_nr_next = 1; /* - * These bitmask flags indicate whether tasks in the fork and exit paths have - * fork/exit handlers to call. This avoids us having to do extra work in the - * fork/exit path to check which subsystems have fork/exit callbacks. + * These bitmasks identify subsystems with specific features to avoid + * having to do iterative checks repeatedly. */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; static u16 have_free_callback __read_mostly; +static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { @@ -195,9 +195,6 @@ struct cgroup_namespace init_cgroup_ns = { .root_cset = &init_css_set, }; -/* Ditto for the can_fork callback. */ -static u16 have_canfork_callback __read_mostly; - static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_base_files[]; From 968ebff1efde6948564308836ecf1ef57de4e106 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 29 Jan 2017 14:35:20 -0500 Subject: [PATCH 25/27] cgroup, perf_event: make perf_event controller work on cgroup2 hierarchy perf_event is a utility controller whose primary role is identifying cgroup membership to filter perf events; however, because it also tracks some per-css state, it can't be replaced by pure cgroup membership test. Mark the controller as implicitly enabled on the default hierarchy so that perf events can always be filtered based on cgroup v2 path as long as the controller is not mounted on a legacy hierarchy. "perf record" is updated accordingly so that it searches for both v1 and v2 hierarchies. A v1 hierarchy is used if perf_event is mounted on it; otherwise, it uses the v2 hierarchy. v2: Doc updated to reflect more flexible rebinding behavior. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- Documentation/cgroup-v2.txt | 12 ++++++++++++ kernel/events/core.c | 6 ++++++ tools/perf/util/cgroup.c | 26 +++++++++++++++++++------- 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index f6f54107fb4d..227ce4883720 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -49,6 +49,8 @@ CONTENTS 5-3-2. Writeback 5-4. PID 5-4-1. PID Interface Files + 5-5. Misc + 5-5-1. perf_event 6. Namespace 6-1. Basics 6-2. The Root and Views @@ -1160,6 +1162,16 @@ through fork() or clone(). These will return -EAGAIN if the creation of a new process would cause a cgroup policy to be violated. +5-5. Misc + +5-5-1. perf_event + +perf_event controller, if not mounted on a legacy hierarchy, is +automatically enabled on the v2 hierarchy so that perf events can +always be filtered by cgroup v2 path. The controller can still be +moved to a legacy hierarchy after v2 hierarchy is populated. + + 6. Namespace 6-1. Basics diff --git a/kernel/events/core.c b/kernel/events/core.c index ab15509fab8c..d72128dce1e0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10792,5 +10792,11 @@ struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .attach = perf_cgroup_attach, + /* + * Implicitly enable on dfl hierarchy so that perf events can + * always be filtered by cgroup2 path as long as perf_event + * controller is not mounted on a legacy hierarchy. + */ + .implicit_on_dfl = true, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index 8fdee24725a7..eafbf11442b2 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -12,8 +12,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen) { FILE *fp; char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1]; + char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 2], *path; char *token, *saved_ptr = NULL; - int found = 0; fp = fopen("/proc/mounts", "r"); if (!fp) @@ -24,31 +24,43 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen) * and inspect every cgroupfs mount point to find one that has * perf_event subsystem */ + path_v1[0] = '\0'; + path_v2[0] = '\0'; + while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %" STR(PATH_MAX)"s %*d %*d\n", mountpoint, type, tokens) == 3) { - if (!strcmp(type, "cgroup")) { + if (!path_v1[0] && !strcmp(type, "cgroup")) { token = strtok_r(tokens, ",", &saved_ptr); while (token != NULL) { if (!strcmp(token, "perf_event")) { - found = 1; + strcpy(path_v1, mountpoint); break; } token = strtok_r(NULL, ",", &saved_ptr); } } - if (found) + + if (!path_v2[0] && !strcmp(type, "cgroup2")) + strcpy(path_v2, mountpoint); + + if (path_v1[0] && path_v2[0]) break; } fclose(fp); - if (!found) + + if (path_v1[0]) + path = path_v1; + else if (path_v2[0]) + path = path_v2; + else return -1; - if (strlen(mountpoint) < maxlen) { - strcpy(buf, mountpoint); + if (strlen(path) < maxlen) { + strcpy(buf, path); return 0; } return -1; From 576dd464505fc53d501bb94569db76f220104d28 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Jan 2017 11:29:54 -0500 Subject: [PATCH 26/27] cgroup: drop the matching uid requirement on migration for cgroup v2 Along with the write access to the cgroup.procs or tasks file, cgroup has required the writer's euid, unless root, to match [s]uid of the target process or task. On cgroup v1, this is necessary because there's nothing preventing a delegatee from pulling in tasks or processes from all over the system. If a user has a cgroup subdirectory delegated to it, the user would have write access to the cgroup.procs or tasks file. If there are no further checks than file write access check, the user would be able to pull processes from all over the system into its subhierarchy which is clearly not the intended behavior. The matching [s]uid requirement partially prevents this problem by allowing a delegatee to pull in the processes that belongs to it. This isn't a sufficient protection however, because a user would still be able to jump processes across two disjoint sub-hierarchies that has been delegated to them. cgroup v2 resolves the issue by requiring the writer to have access to the common ancestor of the cgroup.procs file of the source and target cgroups. This confines each delegatee to their own sub-hierarchy proper and bases all permission decisions on the cgroup filesystem rather than having to pull in explicit uid matching. cgroup v2 has still been applying the matching [s]uid requirement just for historical reasons. On cgroup2, the requirement doesn't serve any purpose while unnecessarily complicating the permission model. Let's drop it. Signed-off-by: Tejun Heo --- Documentation/cgroup-v2.txt | 12 +++++------- kernel/cgroup/cgroup.c | 27 ++++++++++++++------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 227ce4883720..1d101423ca92 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -332,14 +332,12 @@ a process with a non-root euid to migrate a target process into a cgroup by writing its PID to the "cgroup.procs" file, the following conditions must be met. -- The writer's euid must match either uid or suid of the target process. - - The writer must have write access to the "cgroup.procs" file. - The writer must have write access to the "cgroup.procs" file of the common ancestor of the source and destination cgroups. -The above three constraints ensure that while a delegatee may migrate +The above two constraints ensure that while a delegatee may migrate processes around freely in the delegated sub-hierarchy it can't pull in from or push out to outside the sub-hierarchy. @@ -354,10 +352,10 @@ all processes under C0 and C1 belong to U0. Let's also say U0 wants to write the PID of a process which is currently in C10 into "C00/cgroup.procs". U0 has write access to the -file and uid match on the process; however, the common ancestor of the -source cgroup C10 and the destination cgroup C00 is above the points -of delegation and U0 would not have write access to its "cgroup.procs" -files and thus the write will be denied with -EACCES. +file; however, the common ancestor of the source cgroup C10 and the +destination cgroup C00 is above the points of delegation and U0 would +not have write access to its "cgroup.procs" files and thus the write +will be denied with -EACCES. 2-6. Guidelines diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a99b15f9b577..fe374f803b20 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2349,20 +2349,9 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) { - const struct cred *cred = current_cred(); - const struct cred *tcred = get_task_cred(task); int ret = 0; - /* - * even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. - */ - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) - ret = -EACCES; - - if (!ret && cgroup_on_dfl(dst_cgrp)) { + if (cgroup_on_dfl(dst_cgrp)) { struct super_block *sb = of->file->f_path.dentry->d_sb; struct cgroup *cgrp; struct inode *inode; @@ -2380,9 +2369,21 @@ static int cgroup_procs_write_permission(struct task_struct *task, ret = inode_permission(inode, MAY_WRITE); iput(inode); } + } else { + const struct cred *cred = current_cred(); + const struct cred *tcred = get_task_cred(task); + + /* + * even if we're attaching all tasks in the thread group, + * we only need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + ret = -EACCES; + put_cred(tcred); } - put_cred(tcred); return ret; } From f83f3c515654474e19c7fc86e3b06564bb5cb4d4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 12 Feb 2017 05:33:02 +0900 Subject: [PATCH 27/27] kernfs: fix locking around kernfs_ops->release() callback The release callback may be called from two places - file release operation and kernfs open file draining. kernfs_open_file->mutex is used to synchronize the two callsites. This unfortunately leads to possible circular locking because of->mutex is used to protect the usual kernfs operations which may use locking constructs which are held while removing and thus draining kernfs files. @of->mutex is for synchronizing concurrent kernfs access operations and all we need here is synchronization between the releaes and drain paths. As the drain path has to grab kernfs_open_file_mutex anyway, let's use the mutex to synchronize the release operation instead. Signed-off-by: Tejun Heo Reported-and-tested-by: Tony Lindgren Fixes: 0e67db2f9fe9 ("kernfs: add kernfs_ops->open/release() callbacks") Acked-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 20c396291ac1..14da136028de 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -747,10 +747,15 @@ err_out: static void kernfs_release_file(struct kernfs_node *kn, struct kernfs_open_file *of) { - if (!(kn->flags & KERNFS_HAS_RELEASE)) - return; + /* + * @of is guaranteed to have no other file operations in flight and + * we just want to synchronize release and drain paths. + * @kernfs_open_file_mutex is enough. @of->mutex can't be used + * here because drain path may be called from places which can + * cause circular dependency. + */ + lockdep_assert_held(&kernfs_open_file_mutex); - mutex_lock(&of->mutex); if (!of->released) { /* * A file is never detached without being released and we @@ -760,7 +765,6 @@ static void kernfs_release_file(struct kernfs_node *kn, kn->attr.ops->release(of); of->released = true; } - mutex_unlock(&of->mutex); } static int kernfs_fop_release(struct inode *inode, struct file *filp) @@ -768,7 +772,12 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; struct kernfs_open_file *of = kernfs_of(filp); - kernfs_release_file(kn, of); + if (kn->flags & KERNFS_HAS_RELEASE) { + mutex_lock(&kernfs_open_file_mutex); + kernfs_release_file(kn, of); + mutex_unlock(&kernfs_open_file_mutex); + } + kernfs_put_open_node(kn, of); seq_release(inode, filp); kfree(of->prealloc_buf);