From 6b4e306aa3dc94a0545eb9279475b1ab6209a31f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 16:41:34 -0800 Subject: [PATCH 1/9] ns: proc files for namespace naming policy. Create files under /proc//ns/ to allow controlling the namespaces of a process. This addresses three specific problems that can make namespaces hard to work with. - Namespaces require a dedicated process to pin them in memory. - It is not possible to use a namespace unless you are the child of the original creator. - Namespaces don't have names that userspace can use to talk about them. The namespace files under /proc//ns/ can be opened and the file descriptor can be used to talk about a specific namespace, and to keep the specified namespace alive. A namespace can be kept alive by either holding the file descriptor open or bind mounting the file someplace else. aka: mount --bind /proc/self/ns/net /some/filesystem/path mount --bind /proc/self/fd/ /some/filesystem/path This allows namespaces to be named with userspace policy. It requires additional support to make use of these filedescriptors and that will be comming in the following patches. Acked-by: Daniel Lezcano Signed-off-by: Eric W. Biederman --- fs/proc/Makefile | 1 + fs/proc/base.c | 20 ++--- fs/proc/inode.c | 7 ++ fs/proc/internal.h | 18 ++++ fs/proc/namespaces.c | 188 ++++++++++++++++++++++++++++++++++++++++ include/linux/proc_fs.h | 18 ++++ 6 files changed, 241 insertions(+), 11 deletions(-) create mode 100644 fs/proc/namespaces.c diff --git a/fs/proc/Makefile b/fs/proc/Makefile index df434c5f28fb..c1c729335924 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -20,6 +20,7 @@ proc-y += stat.o proc-y += uptime.o proc-y += version.o proc-y += softirqs.o +proc-y += namespaces.o proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o diff --git a/fs/proc/base.c b/fs/proc/base.c index dfa532730e55..dc8bca72b002 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -600,7 +600,7 @@ static int proc_fd_access_allowed(struct inode *inode) return allowed; } -static int proc_setattr(struct dentry *dentry, struct iattr *attr) +int proc_setattr(struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = dentry->d_inode; @@ -1736,8 +1736,7 @@ static int task_dumpable(struct task_struct *task) return 0; } - -static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) +struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) { struct inode * inode; struct proc_inode *ei; @@ -1779,7 +1778,7 @@ out_unlock: return NULL; } -static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; struct task_struct *task; @@ -1820,7 +1819,7 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat * made this apply to all per process world readable and executable * directories. */ -static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) +int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode; struct task_struct *task; @@ -1862,7 +1861,7 @@ static int pid_delete_dentry(const struct dentry * dentry) return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; } -static const struct dentry_operations pid_dentry_operations = +const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, .d_delete = pid_delete_dentry, @@ -1870,9 +1869,6 @@ static const struct dentry_operations pid_dentry_operations = /* Lookups */ -typedef struct dentry *instantiate_t(struct inode *, struct dentry *, - struct task_struct *, const void *); - /* * Fill a directory entry. * @@ -1885,8 +1881,8 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *, * reported by readdir in sync with the inode numbers reported * by stat. */ -static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - char *name, int len, +int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + const char *name, int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { struct dentry *child, *dir = filp->f_path.dentry; @@ -2820,6 +2816,7 @@ static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif @@ -3168,6 +3165,7 @@ out_no_task: static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d15aa1b1cc8f..74b48cfa1bb2 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -28,6 +28,7 @@ static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; struct ctl_table_header *head; + const struct proc_ns_operations *ns_ops; truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); @@ -44,6 +45,10 @@ static void proc_evict_inode(struct inode *inode) rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); sysctl_head_put(head); } + /* Release any associated namespace */ + ns_ops = PROC_I(inode)->ns_ops; + if (ns_ops && ns_ops->put) + ns_ops->put(PROC_I(inode)->ns); } static struct kmem_cache * proc_inode_cachep; @@ -62,6 +67,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; + ei->ns = NULL; + ei->ns_ops = NULL; inode = &ei->vfs_inode; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; return inode; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index c03e8d3a3a5b..96245a1b1a7c 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -119,3 +119,21 @@ struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); */ int proc_readdir(struct file *, void *, filldir_t); struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); + + + +/* Lookups */ +typedef struct dentry *instantiate_t(struct inode *, struct dentry *, + struct task_struct *, const void *); +int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + const char *name, int len, + instantiate_t instantiate, struct task_struct *task, const void *ptr); +int pid_revalidate(struct dentry *dentry, struct nameidata *nd); +struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task); +extern const struct dentry_operations pid_dentry_operations; +int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +int proc_setattr(struct dentry *dentry, struct iattr *attr); + +extern const struct inode_operations proc_ns_dir_inode_operations; +extern const struct file_operations proc_ns_dir_operations; + diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c new file mode 100644 index 000000000000..6ae9f07d59ee --- /dev/null +++ b/fs/proc/namespaces.c @@ -0,0 +1,188 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + + +static const struct proc_ns_operations *ns_entries[] = { +}; + +static const struct file_operations ns_file_operations = { + .llseek = no_llseek, +}; + +static struct dentry *proc_ns_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + const struct proc_ns_operations *ns_ops = ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + inode->i_mode = S_IFREG|S_IRUSR; + inode->i_fop = &ns_file_operations; + ei->ns_ops = ns_ops; + ei->ns = ns_ops->get(task); + if (!ei->ns) + goto out_iput; + + dentry->d_op = &pid_dentry_operations; + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +out_iput: + iput(inode); + goto out; +} + +static int proc_ns_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, struct task_struct *task, + const struct proc_ns_operations *ops) +{ + return proc_fill_cache(filp, dirent, filldir, + ops->name, strlen(ops->name), + proc_ns_instantiate, task, ops); +} + +static int proc_ns_dir_readdir(struct file *filp, void *dirent, + filldir_t filldir) +{ + int i; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + const struct proc_ns_operations **entry, **last; + ino_t ino; + int ret; + + ret = -ENOENT; + if (!task) + goto out_no_task; + + ret = -EPERM; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + ret = 0; + i = filp->f_pos; + switch (i) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + default: + i -= 2; + if (i >= ARRAY_SIZE(ns_entries)) { + ret = 1; + goto out; + } + entry = ns_entries + i; + last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; + while (entry <= last) { + if (proc_ns_fill_cache(filp, dirent, filldir, + task, *entry) < 0) + goto out; + filp->f_pos++; + entry++; + } + } + + ret = 1; +out: + put_task_struct(task); +out_no_task: + return ret; +} + +const struct file_operations proc_ns_dir_operations = { + .read = generic_read_dir, + .readdir = proc_ns_dir_readdir, +}; + +static struct dentry *proc_ns_dir_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *error; + struct task_struct *task = get_proc_task(dir); + const struct proc_ns_operations **entry, **last; + unsigned int len = dentry->d_name.len; + + error = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + + error = ERR_PTR(-EPERM); + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; + for (entry = ns_entries; entry <= last; entry++) { + if (strlen((*entry)->name) != len) + continue; + if (!memcmp(dentry->d_name.name, (*entry)->name, len)) + break; + } + if (entry > last) + goto out; + + error = proc_ns_instantiate(dir, dentry, task, *entry); +out: + put_task_struct(task); +out_no_task: + return error; +} + +const struct inode_operations proc_ns_dir_inode_operations = { + .lookup = proc_ns_dir_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +struct file *proc_ns_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &ns_file_operations) + goto out_invalid; + + return file; + +out_invalid: + fput(file); + return ERR_PTR(-EINVAL); +} + diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 838c1149251a..a6d2c6da5e5a 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -179,6 +179,8 @@ extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm); +extern struct file *proc_ns_fget(int fd); + #else #define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; }) @@ -239,6 +241,11 @@ static inline void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) {} +static inline struct file *proc_ns_fget(int fd) +{ + return ERR_PTR(-EINVAL); +} + #endif /* CONFIG_PROC_FS */ #if !defined(CONFIG_PROC_KCORE) @@ -250,6 +257,15 @@ kclist_add(struct kcore_list *new, void *addr, size_t size, int type) extern void kclist_add(struct kcore_list *, void *, size_t, int type); #endif +struct nsproxy; +struct proc_ns_operations { + const char *name; + int type; + void *(*get)(struct task_struct *task); + void (*put)(void *ns); + int (*install)(struct nsproxy *nsproxy, void *ns); +}; + union proc_op { int (*proc_get_link)(struct inode *, struct path *); int (*proc_read)(struct task_struct *task, char *page); @@ -268,6 +284,8 @@ struct proc_inode { struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; + void *ns; + const struct proc_ns_operations *ns_ops; struct inode vfs_inode; }; From 0663c6f8fa37d777ede74ff991a0cba3a42fcbd7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 17:48:52 -0800 Subject: [PATCH 2/9] ns: Introduce the setns syscall With the networking stack today there is demand to handle multiple network stacks at a time. Not in the context of containers but in the context of people doing interesting things with routing. There is also demand in the context of containers to have an efficient way to execute some code in the container itself. If nothing else it is very useful ad a debugging technique. Both problems can be solved by starting some form of login daemon in the namespaces people want access to, or you can play games by ptracing a process and getting the traced process to do things you want it to do. However it turns out that a login daemon or a ptrace puppet controller are more code, they are more prone to failure, and generally they are less efficient than simply changing the namespace of a process to a specified one. Pieces of this puzzle can also be solved by instead of coming up with a general purpose system call coming up with targed system calls perhaps socketat that solve a subset of the larger problem. Overall that appears to be more work for less reward. int setns(int fd, int nstype); The fd argument is a file descriptor referring to a proc file of the namespace you want to switch the process to. In the setns system call the nstype is 0 or specifies an clone flag of the namespace you intend to change to prevent changing a namespace unintentionally. v2: Most of the architecture support added by Daniel Lezcano v3: ported to v2.6.36-rc4 by: Eric W. Biederman v4: Moved wiring up of the system call to another patch v5: Cleaned up the system call arguments - Changed the order. - Modified nstype to take the standard clone flags. v6: Added missing error handling as pointed out by Matt Helsley Acked-by: Daniel Lezcano Signed-off-by: Eric W. Biederman --- kernel/nsproxy.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index a05d191ffdd9..5424e37673ed 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -22,6 +22,9 @@ #include #include #include +#include +#include +#include static struct kmem_cache *nsproxy_cachep; @@ -233,6 +236,45 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } +SYSCALL_DEFINE2(setns, int, fd, int, nstype) +{ + const struct proc_ns_operations *ops; + struct task_struct *tsk = current; + struct nsproxy *new_nsproxy; + struct proc_inode *ei; + struct file *file; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + file = proc_ns_fget(fd); + if (IS_ERR(file)) + return PTR_ERR(file); + + err = -EINVAL; + ei = PROC_I(file->f_dentry->d_inode); + ops = ei->ns_ops; + if (nstype && (ops->type != nstype)) + goto out; + + new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); + if (IS_ERR(new_nsproxy)) { + err = PTR_ERR(new_nsproxy); + goto out; + } + + err = ops->install(new_nsproxy, ei->ns); + if (err) { + free_nsproxy(new_nsproxy); + goto out; + } + switch_task_namespaces(tsk, new_nsproxy); +out: + fput(file); + return err; +} + static int __init nsproxy_cache_init(void) { nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); From 13b6f57623bc485e116344fe91fbcb29f149242b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 18:14:23 -0800 Subject: [PATCH 3/9] ns proc: Add support for the network namespace. Implementing file descriptors for the network namespace is simple and straight forward. Acked-by: David S. Miller Acked-by: Daniel Lezcano Signed-off-by: Eric W. Biederman --- fs/proc/namespaces.c | 3 +++ include/linux/proc_fs.h | 1 + net/core/net_namespace.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 6ae9f07d59ee..dcbd483e9915 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -16,6 +16,9 @@ static const struct proc_ns_operations *ns_entries[] = { +#ifdef CONFIG_NET_NS + &netns_operations, +#endif }; static const struct file_operations ns_file_operations = { diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index a6d2c6da5e5a..62126ec6ede9 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -265,6 +265,7 @@ struct proc_ns_operations { void (*put)(void *ns); int (*install)(struct nsproxy *nsproxy, void *ns); }; +extern const struct proc_ns_operations netns_operations; union proc_op { int (*proc_get_link)(struct inode *, struct path *); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 3f860261c5ee..bf7707e09a80 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -573,3 +573,34 @@ void unregister_pernet_device(struct pernet_operations *ops) mutex_unlock(&net_mutex); } EXPORT_SYMBOL_GPL(unregister_pernet_device); + +#ifdef CONFIG_NET_NS +static void *netns_get(struct task_struct *task) +{ + struct net *net; + rcu_read_lock(); + net = get_net(task->nsproxy->net_ns); + rcu_read_unlock(); + return net; +} + +static void netns_put(void *ns) +{ + put_net(ns); +} + +static int netns_install(struct nsproxy *nsproxy, void *ns) +{ + put_net(nsproxy->net_ns); + nsproxy->net_ns = get_net(ns); + return 0; +} + +const struct proc_ns_operations netns_operations = { + .name = "net", + .type = CLONE_NEWNET, + .get = netns_get, + .put = netns_put, + .install = netns_install, +}; +#endif From 34482e89a5218f0f9317abf1cfba3bb38b5c29dd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 18:43:27 -0800 Subject: [PATCH 4/9] ns proc: Add support for the uts namespace Acked-by: Daniel Lezcano Signed-off-by: Eric W. Biederman --- fs/proc/namespaces.c | 3 +++ include/linux/proc_fs.h | 1 + kernel/utsname.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index dcbd483e9915..b017181f1273 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -19,6 +19,9 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_NET_NS &netns_operations, #endif +#ifdef CONFIG_UTS_NS + &utsns_operations, +#endif }; static const struct file_operations ns_file_operations = { diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 62126ec6ede9..52aa89df8a6d 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -266,6 +266,7 @@ struct proc_ns_operations { int (*install)(struct nsproxy *nsproxy, void *ns); }; extern const struct proc_ns_operations netns_operations; +extern const struct proc_ns_operations utsns_operations; union proc_op { int (*proc_get_link)(struct inode *, struct path *); diff --git a/kernel/utsname.c b/kernel/utsname.c index 44646179eaba..bff131b9510a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -15,6 +15,7 @@ #include #include #include +#include static struct uts_namespace *create_uts_ns(void) { @@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref) put_user_ns(ns->user_ns); kfree(ns); } + +static void *utsns_get(struct task_struct *task) +{ + struct uts_namespace *ns = NULL; + struct nsproxy *nsproxy; + + rcu_read_lock(); + nsproxy = task_nsproxy(task); + if (nsproxy) { + ns = nsproxy->uts_ns; + get_uts_ns(ns); + } + rcu_read_unlock(); + + return ns; +} + +static void utsns_put(void *ns) +{ + put_uts_ns(ns); +} + +static int utsns_install(struct nsproxy *nsproxy, void *ns) +{ + get_uts_ns(ns); + put_uts_ns(nsproxy->uts_ns); + nsproxy->uts_ns = ns; + return 0; +} + +const struct proc_ns_operations utsns_operations = { + .name = "uts", + .type = CLONE_NEWUTS, + .get = utsns_get, + .put = utsns_put, + .install = utsns_install, +}; + From a00eaf11a223c63fbb212369d6db69ce4c55a2d1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 18:48:39 -0800 Subject: [PATCH 5/9] ns proc: Add support for the ipc namespace Acked-by: Daniel Lezcano Signed-off-by: Eric W. Biederman --- fs/proc/namespaces.c | 3 +++ include/linux/proc_fs.h | 1 + ipc/namespace.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index b017181f1273..f18d6d58bf79 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -22,6 +22,9 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_UTS_NS &utsns_operations, #endif +#ifdef CONFIG_IPC_NS + &ipcns_operations, +#endif }; static const struct file_operations ns_file_operations = { diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 52aa89df8a6d..a23f0b72a023 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -267,6 +267,7 @@ struct proc_ns_operations { }; extern const struct proc_ns_operations netns_operations; extern const struct proc_ns_operations utsns_operations; +extern const struct proc_ns_operations ipcns_operations; union proc_op { int (*proc_get_link)(struct inode *, struct path *); diff --git a/ipc/namespace.c b/ipc/namespace.c index 8054c8e5faf1..ce0a647869b1 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "util.h" @@ -140,3 +141,39 @@ void put_ipc_ns(struct ipc_namespace *ns) free_ipc_ns(ns); } } + +static void *ipcns_get(struct task_struct *task) +{ + struct ipc_namespace *ns = NULL; + struct nsproxy *nsproxy; + + rcu_read_lock(); + nsproxy = task_nsproxy(task); + if (nsproxy) + ns = get_ipc_ns(nsproxy->ipc_ns); + rcu_read_unlock(); + + return ns; +} + +static void ipcns_put(void *ns) +{ + return put_ipc_ns(ns); +} + +static int ipcns_install(struct nsproxy *nsproxy, void *ns) +{ + /* Ditch state from the old ipc namespace */ + exit_sem(current); + put_ipc_ns(nsproxy->ipc_ns); + nsproxy->ipc_ns = get_ipc_ns(ns); + return 0; +} + +const struct proc_ns_operations ipcns_operations = { + .name = "ipc", + .type = CLONE_NEWIPC, + .get = ipcns_get, + .put = ipcns_put, + .install = ipcns_install, +}; From f063052947f770845a6252f7fa24f6f624592a24 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 4 May 2011 17:51:50 -0700 Subject: [PATCH 6/9] net: Allow setting the network namespace by fd Take advantage of the new abstraction and allow network devices to be placed in any network namespace that we have a fd to talk about. Acked-by: David S. Miller Acked-by: Daniel Lezcano Signed-off-by: Eric W. Biederman --- include/linux/if_link.h | 1 + include/net/net_namespace.h | 1 + net/core/net_namespace.c | 33 +++++++++++++++++++++++++++++++-- net/core/rtnetlink.c | 5 ++++- 4 files changed, 37 insertions(+), 3 deletions(-) diff --git a/include/linux/if_link.h b/include/linux/if_link.h index f4a2e6b1b864..0ee969a5593d 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -136,6 +136,7 @@ enum { IFLA_PORT_SELF, IFLA_AF_SPEC, IFLA_GROUP, /* Group the device belongs to */ + IFLA_NET_NS_FD, __IFLA_MAX }; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 3ae491932bc8..dcc8f5749d3f 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -119,6 +119,7 @@ static inline struct net *copy_net_ns(unsigned long flags, struct net *net_ns) extern struct list_head net_namespace_list; extern struct net *get_net_ns_by_pid(pid_t pid); +extern struct net *get_net_ns_by_fd(int pid); #ifdef CONFIG_NET_NS extern void __put_net(struct net *net); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index bf7707e09a80..b7403ff4d6c6 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include #include @@ -343,6 +345,28 @@ struct net *get_net_ns_by_pid(pid_t pid) } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); +struct net *get_net_ns_by_fd(int fd) +{ + struct proc_inode *ei; + struct file *file; + struct net *net; + + net = ERR_PTR(-EINVAL); + file = proc_ns_fget(fd); + if (!file) + goto out; + + ei = PROC_I(file->f_dentry->d_inode); + if (ei->ns_ops != &netns_operations) + goto out; + + net = get_net(ei->ns); +out: + if (file) + fput(file); + return net; +} + static int __init net_ns_init(void) { struct net_generic *ng; @@ -577,10 +601,15 @@ EXPORT_SYMBOL_GPL(unregister_pernet_device); #ifdef CONFIG_NET_NS static void *netns_get(struct task_struct *task) { - struct net *net; + struct net *net = NULL; + struct nsproxy *nsproxy; + rcu_read_lock(); - net = get_net(task->nsproxy->net_ns); + nsproxy = task_nsproxy(task); + if (nsproxy) + net = get_net(nsproxy->net_ns); rcu_read_unlock(); + return net; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d7c4bb4b1820..dca9602c62e4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1043,6 +1043,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINKMODE] = { .type = NLA_U8 }, [IFLA_LINKINFO] = { .type = NLA_NESTED }, [IFLA_NET_NS_PID] = { .type = NLA_U32 }, + [IFLA_NET_NS_FD] = { .type = NLA_U32 }, [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, [IFLA_VF_PORTS] = { .type = NLA_NESTED }, @@ -1091,6 +1092,8 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) */ if (tb[IFLA_NET_NS_PID]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); + else if (tb[IFLA_NET_NS_FD]) + net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); else net = get_net(src_net); return net; @@ -1221,7 +1224,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, int send_addr_notify = 0; int err; - if (tb[IFLA_NET_NS_PID]) { + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) { struct net *net = rtnl_link_get_net(dev_net(dev), tb); if (IS_ERR(net)) { err = PTR_ERR(net); From 618e724b8d79c6232daac49cb39b0723bf5c4b08 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 11 May 2011 14:06:58 -0700 Subject: [PATCH 7/9] ns: Declare sys_setns in syscalls.h Ooops I overlooked this one, and missing it causes compile errors of the powerpc syscall. Signed-off-by: Eric W. Biederman --- include/linux/syscalls.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 83ecc1749ef6..b8b380443d05 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -844,4 +844,5 @@ asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name, asmlinkage long sys_open_by_handle_at(int mountdirfd, struct file_handle __user *handle, int flags); +asmlinkage long sys_setns(int fd, int nstype); #endif From 62ca24baf1417e56fd2ae4ff07adfe7f6a2e42fc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 11 May 2011 15:42:08 -0700 Subject: [PATCH 8/9] ns proc: Return -ENOENT for a nonexistent /proc/self/ns/ entry. Spotted-by: Nathan Lynch Signed-off-by: Eric W. Biederman --- fs/proc/namespaces.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index f18d6d58bf79..781dec5bd682 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -161,6 +161,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!memcmp(dentry->d_name.name, (*entry)->name, len)) break; } + error = ERR_PTR(-ENOENT); if (entry > last) goto out; From 956c920786694f51601a0ef7ee12956fd6aa216e Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 12 May 2011 13:51:13 +1000 Subject: [PATCH 9/9] net: fix get_net_ns_by_fd for !CONFIG_NET_NS After merging the final tree, today's linux-next build (powerpc ppc44x_defconfig) failed like this: net/built-in.o: In function `get_net_ns_by_fd': (.text+0x11976): undefined reference to `netns_operations' net/built-in.o: In function `get_net_ns_by_fd': (.text+0x1197a): undefined reference to `netns_operations' netns_operations is only available if CONFIG_NET_NS is set ... Caused by commit f063052947f7 ("net: Allow setting the network namespace by fd"). Signed-off-by: Stephen Rothwell Signed-off-by: Eric W. Biederman --- net/core/net_namespace.c | 49 ++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b7403ff4d6c6..d4cf178bdfc7 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -316,6 +316,28 @@ void __put_net(struct net *net) } EXPORT_SYMBOL_GPL(__put_net); +struct net *get_net_ns_by_fd(int fd) +{ + struct proc_inode *ei; + struct file *file; + struct net *net; + + net = ERR_PTR(-EINVAL); + file = proc_ns_fget(fd); + if (!file) + goto out; + + ei = PROC_I(file->f_dentry->d_inode); + if (ei->ns_ops != &netns_operations) + goto out; + + net = get_net(ei->ns); +out: + if (file) + fput(file); + return net; +} + #else struct net *copy_net_ns(unsigned long flags, struct net *old_net) { @@ -323,6 +345,11 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net) return ERR_PTR(-EINVAL); return old_net; } + +struct net *get_net_ns_by_fd(int fd) +{ + return ERR_PTR(-EINVAL); +} #endif struct net *get_net_ns_by_pid(pid_t pid) @@ -345,28 +372,6 @@ struct net *get_net_ns_by_pid(pid_t pid) } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); -struct net *get_net_ns_by_fd(int fd) -{ - struct proc_inode *ei; - struct file *file; - struct net *net; - - net = ERR_PTR(-EINVAL); - file = proc_ns_fget(fd); - if (!file) - goto out; - - ei = PROC_I(file->f_dentry->d_inode); - if (ei->ns_ops != &netns_operations) - goto out; - - net = get_net(ei->ns); -out: - if (file) - fput(file); - return net; -} - static int __init net_ns_init(void) { struct net_generic *ng;