From 57dc9a5747942f131a1a8b36d661fec6e6a5e9f6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Jun 2008 15:35:32 -0400 Subject: [PATCH 01/82] NFS: Reduce the stack usage in NFSv4 create operations Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 204 ++++++++++++++++++++++------------------------ 1 file changed, 97 insertions(+), 107 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1293e0acd82b..26fbeb3467cb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2079,47 +2079,81 @@ static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *n return err; } +struct nfs4_createdata { + struct rpc_message msg; + struct nfs4_create_arg arg; + struct nfs4_create_res res; + struct nfs_fh fh; + struct nfs_fattr fattr; + struct nfs_fattr dir_fattr; +}; + +static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, + struct qstr *name, struct iattr *sattr, u32 ftype) +{ + struct nfs4_createdata *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data != NULL) { + struct nfs_server *server = NFS_SERVER(dir); + + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE]; + data->msg.rpc_argp = &data->arg; + data->msg.rpc_resp = &data->res; + data->arg.dir_fh = NFS_FH(dir); + data->arg.server = server; + data->arg.name = name; + data->arg.attrs = sattr; + data->arg.ftype = ftype; + data->arg.bitmask = server->attr_bitmask; + data->res.server = server; + data->res.fh = &data->fh; + data->res.fattr = &data->fattr; + data->res.dir_fattr = &data->dir_fattr; + nfs_fattr_init(data->res.fattr); + nfs_fattr_init(data->res.dir_fattr); + } + return data; +} + +static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) +{ + int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); + if (status == 0) { + update_changeattr(dir, &data->res.dir_cinfo); + nfs_post_op_update_inode(dir, data->res.dir_fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + } + return status; +} + +static void nfs4_free_createdata(struct nfs4_createdata *data) +{ + kfree(data); +} + static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { - struct nfs_server *server = NFS_SERVER(dir); - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_fattr; - struct nfs4_create_arg arg = { - .dir_fh = NFS_FH(dir), - .server = server, - .name = &dentry->d_name, - .attrs = sattr, - .ftype = NF4LNK, - .bitmask = server->attr_bitmask, - }; - struct nfs4_create_res res = { - .server = server, - .fh = &fhandle, - .fattr = &fattr, - .dir_fattr = &dir_fattr, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; + struct nfs4_createdata *data; + int status = -ENAMETOOLONG; if (len > NFS4_MAXPATHLEN) - return -ENAMETOOLONG; + goto out; - arg.u.symlink.pages = &page; - arg.u.symlink.len = len; - nfs_fattr_init(&fattr); - nfs_fattr_init(&dir_fattr); + status = -ENOMEM; + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK]; + data->arg.u.symlink.pages = &page; + data->arg.u.symlink.len = len; - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - if (!status) { - update_changeattr(dir, &res.dir_cinfo); - nfs_post_op_update_inode(dir, res.dir_fattr); - status = nfs_instantiate(dentry, &fhandle, &fattr); - } + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: return status; } @@ -2140,39 +2174,17 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { - struct nfs_server *server = NFS_SERVER(dir); - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_fattr; - struct nfs4_create_arg arg = { - .dir_fh = NFS_FH(dir), - .server = server, - .name = &dentry->d_name, - .attrs = sattr, - .ftype = NF4DIR, - .bitmask = server->attr_bitmask, - }; - struct nfs4_create_res res = { - .server = server, - .fh = &fhandle, - .fattr = &fattr, - .dir_fattr = &dir_fattr, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; + struct nfs4_createdata *data; + int status = -ENOMEM; - nfs_fattr_init(&fattr); - nfs_fattr_init(&dir_fattr); - - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - if (!status) { - update_changeattr(dir, &res.dir_cinfo); - nfs_post_op_update_inode(dir, res.dir_fattr); - status = nfs_instantiate(dentry, &fhandle, &fattr); - } + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); + if (data == NULL) + goto out; + + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: return status; } @@ -2242,56 +2254,34 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { - struct nfs_server *server = NFS_SERVER(dir); - struct nfs_fh fh; - struct nfs_fattr fattr, dir_fattr; - struct nfs4_create_arg arg = { - .dir_fh = NFS_FH(dir), - .server = server, - .name = &dentry->d_name, - .attrs = sattr, - .bitmask = server->attr_bitmask, - }; - struct nfs4_create_res res = { - .server = server, - .fh = &fh, - .fattr = &fattr, - .dir_fattr = &dir_fattr, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; - int mode = sattr->ia_mode; - - nfs_fattr_init(&fattr); - nfs_fattr_init(&dir_fattr); + struct nfs4_createdata *data; + int mode = sattr->ia_mode; + int status = -ENOMEM; BUG_ON(!(sattr->ia_valid & ATTR_MODE)); BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); + + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK); + if (data == NULL) + goto out; + if (S_ISFIFO(mode)) - arg.ftype = NF4FIFO; + data->arg.ftype = NF4FIFO; else if (S_ISBLK(mode)) { - arg.ftype = NF4BLK; - arg.u.device.specdata1 = MAJOR(rdev); - arg.u.device.specdata2 = MINOR(rdev); + data->arg.ftype = NF4BLK; + data->arg.u.device.specdata1 = MAJOR(rdev); + data->arg.u.device.specdata2 = MINOR(rdev); } else if (S_ISCHR(mode)) { - arg.ftype = NF4CHR; - arg.u.device.specdata1 = MAJOR(rdev); - arg.u.device.specdata2 = MINOR(rdev); + data->arg.ftype = NF4CHR; + data->arg.u.device.specdata1 = MAJOR(rdev); + data->arg.u.device.specdata2 = MINOR(rdev); } - else - arg.ftype = NF4SOCK; - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - if (status == 0) { - update_changeattr(dir, &res.dir_cinfo); - nfs_post_op_update_inode(dir, res.dir_fattr); - status = nfs_instantiate(dentry, &fh, &fattr); - } + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: return status; } From 0b4aae7aad162ad175ba8a65708332f888066b26 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Jun 2008 17:00:23 -0400 Subject: [PATCH 02/82] NFS: Reduce the stack usage in NFSv3 create operations Signed-off-by: Trond Myklebust --- fs/nfs/nfs3proc.c | 273 ++++++++++++++++++++++++---------------------- 1 file changed, 143 insertions(+), 130 deletions(-) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index c3523ad03ed1..cf7d4e5927d6 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -248,6 +248,53 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page, return status; } +struct nfs3_createdata { + struct rpc_message msg; + union { + struct nfs3_createargs create; + struct nfs3_mkdirargs mkdir; + struct nfs3_symlinkargs symlink; + struct nfs3_mknodargs mknod; + } arg; + struct nfs3_diropres res; + struct nfs_fh fh; + struct nfs_fattr fattr; + struct nfs_fattr dir_attr; +}; + +static struct nfs3_createdata *nfs3_alloc_createdata(void) +{ + struct nfs3_createdata *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data != NULL) { + data->msg.rpc_argp = &data->arg; + data->msg.rpc_resp = &data->res; + data->res.fh = &data->fh; + data->res.fattr = &data->fattr; + data->res.dir_attr = &data->dir_attr; + nfs_fattr_init(data->res.fattr); + nfs_fattr_init(data->res.dir_attr); + } + return data; +} + +static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) +{ + int status; + + status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); + nfs_post_op_update_inode(dir, data->res.dir_attr); + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + return status; +} + +static void nfs3_free_createdata(struct nfs3_createdata *data) +{ + kfree(data); +} + /* * Create a regular file. * For now, we don't implement O_EXCL. @@ -256,70 +303,60 @@ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags, struct nameidata *nd) { - struct nfs_fh fhandle; - struct nfs_fattr fattr; - struct nfs_fattr dir_attr; - struct nfs3_createargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr, - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fhandle, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, - }; + struct nfs3_createdata *data; mode_t mode = sattr->ia_mode; - int status; + int status = -ENOMEM; dprintk("NFS call create %s\n", dentry->d_name.name); - arg.createmode = NFS3_CREATE_UNCHECKED; + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE]; + data->arg.create.fh = NFS_FH(dir); + data->arg.create.name = dentry->d_name.name; + data->arg.create.len = dentry->d_name.len; + data->arg.create.sattr = sattr; + + data->arg.create.createmode = NFS3_CREATE_UNCHECKED; if (flags & O_EXCL) { - arg.createmode = NFS3_CREATE_EXCLUSIVE; - arg.verifier[0] = jiffies; - arg.verifier[1] = current->pid; + data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE; + data->arg.create.verifier[0] = jiffies; + data->arg.create.verifier[1] = current->pid; } sattr->ia_mode &= ~current->fs->umask; -again: - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_refresh_inode(dir, &dir_attr); + for (;;) { + status = nfs3_do_create(dir, dentry, data); - /* If the server doesn't support the exclusive creation semantics, - * try again with simple 'guarded' mode. */ - if (status == -ENOTSUPP) { - switch (arg.createmode) { + if (status != -ENOTSUPP) + break; + /* If the server doesn't support the exclusive creation + * semantics, try again with simple 'guarded' mode. */ + switch (data->arg.create.createmode) { case NFS3_CREATE_EXCLUSIVE: - arg.createmode = NFS3_CREATE_GUARDED; + data->arg.create.createmode = NFS3_CREATE_GUARDED; break; case NFS3_CREATE_GUARDED: - arg.createmode = NFS3_CREATE_UNCHECKED; + data->arg.create.createmode = NFS3_CREATE_UNCHECKED; break; case NFS3_CREATE_UNCHECKED: goto out; } - goto again; + nfs_fattr_init(data->res.dir_attr); + nfs_fattr_init(data->res.fattr); } - if (status == 0) - status = nfs_instantiate(dentry, &fhandle, &fattr); if (status != 0) goto out; /* When we created the file with exclusive semantics, make * sure we set the attributes afterwards. */ - if (arg.createmode == NFS3_CREATE_EXCLUSIVE) { + if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) { dprintk("NFS call setattr (post-create)\n"); if (!(sattr->ia_valid & ATTR_ATIME_SET)) @@ -330,14 +367,15 @@ again: /* Note: we could use a guarded setattr here, but I'm * not sure this buys us anything (and I'd have * to revamp the NFSv3 XDR code) */ - status = nfs3_proc_setattr(dentry, &fattr, sattr); - nfs_post_op_update_inode(dentry->d_inode, &fattr); + status = nfs3_proc_setattr(dentry, data->res.fattr, sattr); + nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", status); + if (status != 0) + goto out; } - if (status != 0) - goto out; status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); out: + nfs3_free_createdata(data); dprintk("NFS reply create: %d\n", status); return status; } @@ -452,40 +490,28 @@ static int nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_attr; - struct nfs3_symlinkargs arg = { - .fromfh = NFS_FH(dir), - .fromname = dentry->d_name.name, - .fromlen = dentry->d_name.len, - .pages = &page, - .pathlen = len, - .sattr = sattr - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fhandle, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; + struct nfs3_createdata *data; + int status = -ENOMEM; if (len > NFS3_MAXPATHLEN) return -ENAMETOOLONG; dprintk("NFS call symlink %s\n", dentry->d_name.name); - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); - if (status != 0) + data = nfs3_alloc_createdata(); + if (data == NULL) goto out; - status = nfs_instantiate(dentry, &fhandle, &fattr); + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK]; + data->arg.symlink.fromfh = NFS_FH(dir); + data->arg.symlink.fromname = dentry->d_name.name; + data->arg.symlink.fromlen = dentry->d_name.len; + data->arg.symlink.pages = &page; + data->arg.symlink.pathlen = len; + data->arg.symlink.sattr = sattr; + + status = nfs3_do_create(dir, dentry, data); + + nfs3_free_createdata(data); out: dprintk("NFS reply symlink: %d\n", status); return status; @@ -494,42 +520,31 @@ out: static int nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_attr; - struct nfs3_mkdirargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fhandle, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR], - .rpc_argp = &arg, - .rpc_resp = &res, - }; + struct nfs3_createdata *data; int mode = sattr->ia_mode; - int status; + int status = -ENOMEM; dprintk("NFS call mkdir %s\n", dentry->d_name.name); sattr->ia_mode &= ~current->fs->umask; - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); - if (status != 0) - goto out; - status = nfs_instantiate(dentry, &fhandle, &fattr); + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; + data->arg.mkdir.fh = NFS_FH(dir); + data->arg.mkdir.name = dentry->d_name.name; + data->arg.mkdir.len = dentry->d_name.len; + data->arg.mkdir.sattr = sattr; + + status = nfs3_do_create(dir, dentry, data); if (status != 0) goto out; + status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); out: + nfs3_free_createdata(data); dprintk("NFS reply mkdir: %d\n", status); return status; } @@ -615,52 +630,50 @@ static int nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { - struct nfs_fh fh; - struct nfs_fattr fattr, dir_attr; - struct nfs3_mknodargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr, - .rdev = rdev - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fh, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD], - .rpc_argp = &arg, - .rpc_resp = &res, - }; + struct nfs3_createdata *data; mode_t mode = sattr->ia_mode; - int status; - - switch (sattr->ia_mode & S_IFMT) { - case S_IFBLK: arg.type = NF3BLK; break; - case S_IFCHR: arg.type = NF3CHR; break; - case S_IFIFO: arg.type = NF3FIFO; break; - case S_IFSOCK: arg.type = NF3SOCK; break; - default: return -EINVAL; - } + int status = -ENOMEM; dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, MAJOR(rdev), MINOR(rdev)); sattr->ia_mode &= ~current->fs->umask; - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); - if (status != 0) + data = nfs3_alloc_createdata(); + if (data == NULL) goto out; - status = nfs_instantiate(dentry, &fh, &fattr); + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD]; + data->arg.mknod.fh = NFS_FH(dir); + data->arg.mknod.name = dentry->d_name.name; + data->arg.mknod.len = dentry->d_name.len; + data->arg.mknod.sattr = sattr; + data->arg.mknod.rdev = rdev; + + switch (sattr->ia_mode & S_IFMT) { + case S_IFBLK: + data->arg.mknod.type = NF3BLK; + break; + case S_IFCHR: + data->arg.mknod.type = NF3CHR; + break; + case S_IFIFO: + data->arg.mknod.type = NF3FIFO; + break; + case S_IFSOCK: + data->arg.mknod.type = NF3SOCK; + break; + default: + status = -EINVAL; + goto out; + } + + status = nfs3_do_create(dir, dentry, data); if (status != 0) goto out; status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); out: + nfs3_free_createdata(data); dprintk("NFS reply mknod: %d\n", status); return status; } From f3d47a3a6a1484a93c8cfe1e8c8d4399c95199c7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 5 Jun 2008 15:17:39 -0400 Subject: [PATCH 03/82] NFS: Fix a preemption count leak in nfs_update_request The commit 2785259631697ebb0749a3782cca206e2e542939 (nfs: use GFP_NOFS preloads for radix-tree insertion) appears to have introduced a bug: We only want to call radix_tree_preload() once after creating a request. Calling it every time we loop after we created the request, will cause preemption count leaks. Signed-off-by: Trond Myklebust Cc: Nick Piggin --- fs/nfs/write.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f333848fd3be..dc62bc504693 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -584,13 +584,6 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, /* Loop over all inode entries and see if we find * A request for the page we wish to update */ - if (new) { - if (radix_tree_preload(GFP_NOFS)) { - nfs_release_request(new); - return ERR_PTR(-ENOMEM); - } - } - spin_lock(&inode->i_lock); req = nfs_page_find_request_locked(page); if (req) { @@ -630,6 +623,10 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, new = nfs_create_request(ctx, inode, page, offset, bytes); if (IS_ERR(new)) return new; + if (radix_tree_preload(GFP_NOFS)) { + nfs_release_request(new); + return ERR_PTR(-ENOMEM); + } } /* We have a request for our page. From 2116271a347d1181b5497602c2bfada1de8fd53b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 May 2008 19:34:39 -0400 Subject: [PATCH 04/82] NFS: Add correct bounds checking to NFSv2 locks NFSv2 file locking currently fails the Connectathon tests, because the calls to the VFS locking code do not return an EINVAL error if the struct file_lock overflows the 32-bit boundaries. The problem is due to the fact that we occasionally call helpers from fs/locks.c in order to avoid RPC calls to the server when we know that a local process holds the lock. These helpers are, of course, always 64-bit enabled, so EINVAL is not returned in cases when it would if the call had gone to the NLM code. For consistency, we therefore add support for a bounds-checking helper. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 20 +++++++++++++++----- fs/nfs/proc.c | 24 ++++++++++++++++++++++++ include/linux/nfs_xdr.h | 1 + 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d84a3d8f32af..7c73f06692b6 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -593,6 +593,7 @@ out: static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode * inode = filp->f_mapping->host; + int ret = -ENOLCK; dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n", inode->i_sb->s_id, inode->i_ino, @@ -602,13 +603,22 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) /* No mandatory locks over NFS */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; + goto out_err; + + if (NFS_PROTO(inode)->lock_check_bounds != NULL) { + ret = NFS_PROTO(inode)->lock_check_bounds(fl); + if (ret < 0) + goto out_err; + } if (IS_GETLK(cmd)) - return do_getlk(filp, cmd, fl); - if (fl->fl_type == F_UNLCK) - return do_unlk(filp, cmd, fl); - return do_setlk(filp, cmd, fl); + ret = do_getlk(filp, cmd, fl); + else if (fl->fl_type == F_UNLCK) + ret = do_unlk(filp, cmd, fl); + else + ret = do_setlk(filp, cmd, fl); +out_err: + return ret; } /* diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 03599bfe81cf..5c35b02857f3 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -598,6 +598,29 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); } +/* Helper functions for NFS lock bounds checking */ +#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL) +static int nfs_lock_check_bounds(const struct file_lock *fl) +{ + __s32 start, end; + + start = (__s32)fl->fl_start; + if ((loff_t)start != fl->fl_start) + goto out_einval; + + if (fl->fl_end != OFFSET_MAX) { + end = (__s32)fl->fl_end; + if ((loff_t)end != fl->fl_end) + goto out_einval; + } else + end = NFS_LOCK32_OFFSET_MAX; + + if (start < 0 || start > end) + goto out_einval; + return 0; +out_einval: + return -EINVAL; +} const struct nfs_rpc_ops nfs_v2_clientops = { .version = 2, /* protocol version */ @@ -633,4 +656,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .file_open = nfs_open, .file_release = nfs_release, .lock = nfs_proc_lock, + .lock_check_bounds = nfs_lock_check_bounds, }; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 24263bb8e0be..8d780de371f0 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -832,6 +832,7 @@ struct nfs_rpc_ops { int (*file_open) (struct inode *, struct file *); int (*file_release) (struct inode *, struct file *); int (*lock)(struct file *, int, struct file_lock *); + int (*lock_check_bounds)(const struct file_lock *); void (*clear_acl_cache)(struct inode *); }; From 8b39f2b41033754e7ba669503d27268beb1b524a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 14 May 2008 19:48:25 -0700 Subject: [PATCH 05/82] SUNRPC: Ensure we exit early in case of an encode error All errors from call_encode(), with exception of EAGAIN are fatal, so we should immediately return instead of proceeding to xprt_transmit(). Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8945307556ec..9503b4c177d2 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -890,7 +890,6 @@ call_encode(struct rpc_task *task) task->tk_msg.rpc_argp); if (task->tk_status == -ENOMEM) { /* XXX: Is this sane? */ - rpc_delay(task, 3*HZ); task->tk_status = -EAGAIN; } } @@ -1048,8 +1047,14 @@ call_transmit(struct rpc_task *task) BUG_ON(task->tk_rqstp->rq_bytes_sent != 0); call_encode(task); /* Did the encode result in an error condition? */ - if (task->tk_status != 0) + if (task->tk_status != 0) { + /* Was the error nonfatal? */ + if (task->tk_status == -EAGAIN) + rpc_delay(task, HZ >> 4); + else + rpc_exit(task, task->tk_status); return; + } } xprt_transmit(task); if (task->tk_status < 0) From b390c2b55c830eb3b64633fa8d8b8837e073e458 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Jun 2008 18:30:11 -0400 Subject: [PATCH 06/82] SUNRPC: An ENOMEM error from call_encode is always fatal The special 'ENOMEM' case that was previously flagged as non-fatal is bogus: auth_gss always returns EAGAIN for non-fatal errors, and may in fact return ENOMEM in the special case where xdr_buf_read_netobj runs out of preallocated buffer space (invariably a _fatal_ error, since there is no provision for preallocating larger buffers). Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 9503b4c177d2..1af4f161cda8 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -888,10 +888,6 @@ call_encode(struct rpc_task *task) task->tk_status = rpcauth_wrap_req(task, encode, req, p, task->tk_msg.rpc_argp); - if (task->tk_status == -ENOMEM) { - /* XXX: Is this sane? */ - task->tk_status = -EAGAIN; - } } /* From efc91ed0191e3fc62bb1c556ac93fc4e661214d2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Jun 2008 18:31:00 -0400 Subject: [PATCH 07/82] NFS: Optimise append writes with holes If a file is being extended, and we're creating a hole, we might as well declare the entire page to be up to date. This patch significantly improves the write performance for sparse files in the case where lseek(SEEK_END) is used to append several non-contiguous writes at intervals of < PAGE_SIZE. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 20 ++++++++++++++++++++ fs/nfs/write.c | 12 +++--------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7c73f06692b6..7ac89a845a5e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -344,6 +344,26 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, unsigned offset = pos & (PAGE_CACHE_SIZE - 1); int status; + /* + * Zero any uninitialised parts of the page, and then mark the page + * as up to date if it turns out that we're extending the file. + */ + if (!PageUptodate(page)) { + unsigned pglen = nfs_page_length(page); + unsigned end = offset + len; + + if (pglen == 0) { + zero_user_segments(page, 0, offset, + end, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } else if (end >= pglen) { + zero_user_segment(page, end, PAGE_CACHE_SIZE); + if (offset == 0) + SetPageUptodate(page); + } else + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + lock_kernel(); status = nfs_updatepage(file, page, offset, copied); unlock_kernel(); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index dc62bc504693..eea2d2b5278c 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -616,7 +616,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, spin_unlock(&inode->i_lock); radix_tree_preload_end(); req = new; - goto zero_page; + goto out; } spin_unlock(&inode->i_lock); @@ -649,19 +649,13 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, req->wb_offset = offset; req->wb_pgbase = offset; req->wb_bytes = max(end, rqend) - req->wb_offset; - goto zero_page; + goto out; } if (end > rqend) req->wb_bytes = end - req->wb_offset; - return req; -zero_page: - /* If this page might potentially be marked as up to date, - * then we need to zero any uninitalised data. */ - if (req->wb_pgbase == 0 && req->wb_bytes != PAGE_CACHE_SIZE - && !PageUptodate(req->wb_page)) - zero_user_segment(req->wb_page, req->wb_bytes, PAGE_CACHE_SIZE); +out: return req; } From 7e5f6146609eb9134fac7d1b6bfee43df1732188 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 25 May 2008 12:59:19 -0400 Subject: [PATCH 08/82] NFS: Revert commit 44dd151d Revert commit 44dd151d "NFS: Don't mark a written page as uptodate until it is on disk". While it is true that the write may fail, that is always the case. There is no reason why we should treat data on pages that are not already marked as PG_uptodate as being special. The only thing we gain is a noticeable slowdown when re-reading these pages. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index eea2d2b5278c..ee6fcdecb871 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -188,6 +188,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, } /* Update file length */ nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); nfs_clear_page_tag_locked(req); return 0; } @@ -743,12 +744,7 @@ int nfs_updatepage(struct file *file, struct page *page, static void nfs_writepage_release(struct nfs_page *req) { - if (PageError(req->wb_page)) { - nfs_end_page_writeback(req->wb_page); - nfs_inode_remove_request(req); - } else if (!nfs_reschedule_unstable_write(req)) { - /* Set the PG_uptodate flag */ - nfs_mark_uptodate(req->wb_page, req->wb_pgbase, req->wb_bytes); + if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) { nfs_end_page_writeback(req->wb_page); nfs_inode_remove_request(req); } else @@ -1069,8 +1065,6 @@ static void nfs_writeback_release_full(void *calldata) dprintk(" marked for commit\n"); goto next; } - /* Set the PG_uptodate flag? */ - nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); dprintk(" OK\n"); remove_request: nfs_end_page_writeback(page); @@ -1309,9 +1303,6 @@ static void nfs_commit_release(void *calldata) * returned by the server against all stored verfs. */ if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { /* We have a match */ - /* Set the PG_uptodate flag */ - nfs_mark_uptodate(req->wb_page, req->wb_pgbase, - req->wb_bytes); nfs_inode_remove_request(req); dprintk(" OK\n"); goto next; From 0f38b873aeaae698c3693748438547c8493165fb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Jun 2008 18:31:01 -0400 Subject: [PATCH 09/82] SUNRPC: Use GFP_NOFS when allocating credentials Since the credentials may be allocated during the call to rpc_new_task(), which again may be called by a memory allocator... Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 10 +++++----- net/sunrpc/auth_gss/gss_krb5_mech.c | 4 ++-- net/sunrpc/auth_gss/gss_spkm3_mech.c | 4 ++-- net/sunrpc/auth_gss/gss_spkm3_token.c | 2 +- net/sunrpc/auth_unix.c | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index cc12d5f5d5da..bf7585b80543 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -146,7 +146,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) q = (const void *)((const char *)p + len); if (unlikely(q > end || q < p)) return ERR_PTR(-EFAULT); - dest->data = kmemdup(p, len, GFP_KERNEL); + dest->data = kmemdup(p, len, GFP_NOFS); if (unlikely(dest->data == NULL)) return ERR_PTR(-ENOMEM); dest->len = len; @@ -171,7 +171,7 @@ gss_alloc_context(void) { struct gss_cl_ctx *ctx; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kzalloc(sizeof(*ctx), GFP_NOFS); if (ctx != NULL) { ctx->gc_proc = RPC_GSS_PROC_DATA; ctx->gc_seq = 1; /* NetApp 6.4R1 doesn't accept seq. no. 0 */ @@ -341,7 +341,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, uid_t uid) { struct gss_upcall_msg *gss_msg; - gss_msg = kzalloc(sizeof(*gss_msg), GFP_KERNEL); + gss_msg = kzalloc(sizeof(*gss_msg), GFP_NOFS); if (gss_msg != NULL) { INIT_LIST_HEAD(&gss_msg->list); rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq"); @@ -503,7 +503,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (mlen > MSG_BUF_MAXSIZE) goto out; err = -ENOMEM; - buf = kmalloc(mlen, GFP_KERNEL); + buf = kmalloc(mlen, GFP_NOFS); if (!buf) goto out; @@ -806,7 +806,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) dprintk("RPC: gss_create_cred for uid %d, flavor %d\n", acred->uid, auth->au_flavor); - if (!(cred = kzalloc(sizeof(*cred), GFP_KERNEL))) + if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS))) goto out_err; rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops); diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 60c3dba545d7..ef45eba22485 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -70,7 +70,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res) q = (const void *)((const char *)p + len); if (unlikely(q > end || q < p)) return ERR_PTR(-EFAULT); - res->data = kmemdup(p, len, GFP_KERNEL); + res->data = kmemdup(p, len, GFP_NOFS); if (unlikely(res->data == NULL)) return ERR_PTR(-ENOMEM); res->len = len; @@ -131,7 +131,7 @@ gss_import_sec_context_kerberos(const void *p, struct krb5_ctx *ctx; int tmp; - if (!(ctx = kzalloc(sizeof(*ctx), GFP_KERNEL))) + if (!(ctx = kzalloc(sizeof(*ctx), GFP_NOFS))) goto out_err; p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate)); diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c index 5deb4b6e4514..035e1dd6af1b 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_mech.c +++ b/net/sunrpc/auth_gss/gss_spkm3_mech.c @@ -76,7 +76,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res) q = (const void *)((const char *)p + len); if (unlikely(q > end || q < p)) return ERR_PTR(-EFAULT); - res->data = kmemdup(p, len, GFP_KERNEL); + res->data = kmemdup(p, len, GFP_NOFS); if (unlikely(res->data == NULL)) return ERR_PTR(-ENOMEM); return q; @@ -90,7 +90,7 @@ gss_import_sec_context_spkm3(const void *p, size_t len, struct spkm3_ctx *ctx; int version; - if (!(ctx = kzalloc(sizeof(*ctx), GFP_KERNEL))) + if (!(ctx = kzalloc(sizeof(*ctx), GFP_NOFS))) goto out_err; p = simple_get_bytes(p, end, &version, sizeof(version)); diff --git a/net/sunrpc/auth_gss/gss_spkm3_token.c b/net/sunrpc/auth_gss/gss_spkm3_token.c index 6cdd241ad267..3308157436d2 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_token.c +++ b/net/sunrpc/auth_gss/gss_spkm3_token.c @@ -90,7 +90,7 @@ asn1_bitstring_len(struct xdr_netobj *in, int *enclen, int *zerobits) int decode_asn1_bitstring(struct xdr_netobj *out, char *in, int enclen, int explen) { - if (!(out->data = kzalloc(explen,GFP_KERNEL))) + if (!(out->data = kzalloc(explen,GFP_NOFS))) return 0; out->len = explen; memcpy(out->data, in, enclen); diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 44920b90bdc4..46b2647c5bd2 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -66,7 +66,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) dprintk("RPC: allocating UNIX cred for uid %d gid %d\n", acred->uid, acred->gid); - if (!(cred = kmalloc(sizeof(*cred), GFP_KERNEL))) + if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS))) return ERR_PTR(-ENOMEM); rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops); From b5418383ef13f70528281546d02c15edc03d8567 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Jun 2008 18:31:02 -0400 Subject: [PATCH 10/82] NFS: do_setlk(): don't flush caches when we have a delegation Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7ac89a845a5e..0213c21038fa 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -602,7 +602,8 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) * This makes locking act as a cache coherency point. */ nfs_sync_mapping(filp->f_mapping); - nfs_zap_caches(inode); + if (!nfs_have_delegation(inode, FMODE_READ)) + nfs_zap_caches(inode); out: return status; } From 6fb1bc10303c0d88f635d014324432ab6ee49d1b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 May 2008 17:09:04 -0400 Subject: [PATCH 11/82] NFS: Update help text for CONFIG_NFS_FS Clean up: refresh the help text for Kconfig items related to the NFS client. Remove obsolete URLs, and make the language consistent among the options. Also move the ROOT_NFS config option next to the options related to the NFS client. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/Kconfig | 115 ++++++++++++++++++++++++++--------------------------- 1 file changed, 57 insertions(+), 58 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index 2694648cbd1b..1c16de9611e9 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1544,10 +1544,6 @@ config UFS_FS The recently released UFS2 variant (used in FreeBSD 5.x) is READ-ONLY supported. - If you only intend to mount files from some other Unix over the - network using NFS, you don't need the UFS file system support (but - you need NFS file system support obviously). - Note that this option is generally not needed for floppies, since a good portable way to transport files and directories between unixes (and even other operating systems) is given by the tar program ("man @@ -1587,6 +1583,7 @@ menuconfig NETWORK_FILESYSTEMS Say Y here to get to see options for network filesystems and filesystem-related networking code, such as NFS daemon and RPCSEC security modules. + This option alone does not add any kernel code. If you say N, all options in this submenu will be skipped and @@ -1595,76 +1592,92 @@ menuconfig NETWORK_FILESYSTEMS if NETWORK_FILESYSTEMS config NFS_FS - tristate "NFS file system support" + tristate "NFS client support" depends on INET select LOCKD select SUNRPC select NFS_ACL_SUPPORT if NFS_V3_ACL help - If you are connected to some other (usually local) Unix computer - (using SLIP, PLIP, PPP or Ethernet) and want to mount files residing - on that computer (the NFS server) using the Network File Sharing - protocol, say Y. "Mounting files" means that the client can access - the files with usual UNIX commands as if they were sitting on the - client's hard disk. For this to work, the server must run the - programs nfsd and mountd (but does not need to have NFS file system - support enabled in its kernel). NFS is explained in the Network - Administrator's Guide, available from - , on its man page: "man - nfs", and in the NFS-HOWTO. + Choose Y here if you want to access files residing on other + computers using Sun's Network File System protocol. To compile + this file system support as a module, choose M here: the module + will be called nfs. - A superior but less widely used alternative to NFS is provided by - the Coda file system; see "Coda file system support" below. + To mount file systems exported by NFS servers, you also need to + install the user space mount.nfs command which can be found in + the Linux nfs-utils package, available from http://linux-nfs.org/. + Information about using the mount command is available in the + mount(8) man page. More detail about the Linux NFS client + implementation is available via the nfs(5) man page. - If you say Y here, you should have said Y to TCP/IP networking also. - This option would enlarge your kernel by about 27 KB. + Below you can choose which versions of the NFS protocol are + available in the kernel to mount NFS servers. Support for NFS + version 2 (RFC 1094) is always available when NFS_FS is selected. - To compile this file system support as a module, choose M here: the - module will be called nfs. + To configure a system which mounts its root file system via NFS + at boot time, say Y here, select "Kernel level IP + autoconfiguration" in the NETWORK menu, and select "Root file + system on NFS" below. You cannot compile this file system as a + module in this case. - If you are configuring a diskless machine which will mount its root - file system over NFS at boot time, say Y here and to "Kernel - level IP autoconfiguration" above and to "Root file system on NFS" - below. You cannot compile this driver as a module in this case. - There are two packages designed for booting diskless machines over - the net: netboot, available from - , and Etherboot, - available from . - - If you don't know what all this is about, say N. + If unsure, say N. config NFS_V3 - bool "Provide NFSv3 client support" + bool "NFS client support for NFS version 3" depends on NFS_FS help - Say Y here if you want your NFS client to be able to speak version - 3 of the NFS protocol. + This option enables support for version 3 of the NFS protocol + (RFC 1813) in the kernel's NFS client. If unsure, say Y. config NFS_V3_ACL - bool "Provide client support for the NFSv3 ACL protocol extension" + bool "NFS client support for the NFSv3 ACL protocol extension" depends on NFS_V3 help - Implement the NFSv3 ACL protocol extension for manipulating POSIX - Access Control Lists. The server should also be compiled with - the NFSv3 ACL protocol extension; see the CONFIG_NFSD_V3_ACL option. + Some NFS servers support an auxiliary NFSv3 ACL protocol that + Sun added to Solaris but never became an official part of the + NFS version 3 protocol. This protocol extension allows + applications on NFS clients to manipulate POSIX Access Control + Lists on files residing on NFS servers. NFS servers enforce + ACLs on local files whether this protocol is available or not. + + Choose Y here if your NFS server supports the Solaris NFSv3 ACL + protocol extension and you want your NFS client to allow + applications to access and modify ACLs on files on the server. + + Most NFS servers don't support the Solaris NFSv3 ACL protocol + extension. You can choose N here or specify the "noacl" mount + option to prevent your NFS client from trying to use the NFSv3 + ACL protocol. If unsure, say N. config NFS_V4 - bool "Provide NFSv4 client support (EXPERIMENTAL)" + bool "NFS client support for NFS version 4 (EXPERIMENTAL)" depends on NFS_FS && EXPERIMENTAL select RPCSEC_GSS_KRB5 help - Say Y here if you want your NFS client to be able to speak the newer - version 4 of the NFS protocol. + This option enables support for version 4 of the NFS protocol + (RFC 3530) in the kernel's NFS client. - Note: Requires auxiliary userspace daemons which may be found on - http://www.citi.umich.edu/projects/nfsv4/ + To mount NFS servers using NFSv4, you also need to install user + space programs which can be found in the Linux nfs-utils package, + available from http://linux-nfs.org/. If unsure, say N. +config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP + help + If you want your system to mount its root file system via NFS, + choose Y here. This is common practice for managing systems + without local permanent storage. For details, read + . + + Most people say N here. + config NFSD tristate "NFS server support" depends on INET @@ -1746,20 +1759,6 @@ config NFSD_V4 If unsure, say N. -config ROOT_NFS - bool "Root file system on NFS" - depends on NFS_FS=y && IP_PNP - help - If you want your Linux box to mount its whole root file system (the - one containing the directory /) from some other computer over the - net via NFS (presumably because your box doesn't have a hard disk), - say Y. Read for - details. It is likely that in this case, you also want to say Y to - "Kernel level IP autoconfiguration" so that your box can discover - its network address at boot time. - - Most people say N here. - config LOCKD tristate From 3748f1e447ac1dbf45f33ee7491a008a8bb5cdf0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 May 2008 17:09:12 -0400 Subject: [PATCH 12/82] SUNRPC: Add a function to display the name of an RPC procedure Improve debugging messages in call_start() and call_verify() by having them show the RPC procedure name instead of the procedure number. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 1af4f161cda8..68ea6dddcf1e 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -690,6 +690,21 @@ rpc_restart_call(struct rpc_task *task) } EXPORT_SYMBOL_GPL(rpc_restart_call); +#ifdef RPC_DEBUG +static const char *rpc_proc_name(const struct rpc_task *task) +{ + const struct rpc_procinfo *proc = task->tk_msg.rpc_proc; + + if (proc) { + if (proc->p_name) + return proc->p_name; + else + return "NULL"; + } else + return "no proc"; +} +#endif + /* * 0. Initial state * @@ -701,9 +716,9 @@ call_start(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - dprintk("RPC: %5u call_start %s%d proc %d (%s)\n", task->tk_pid, + dprintk("RPC: %5u call_start %s%d proc %s (%s)\n", task->tk_pid, clnt->cl_protname, clnt->cl_vers, - task->tk_msg.rpc_proc->p_proc, + rpc_proc_name(task), (RPC_IS_ASYNC(task) ? "async" : "sync")); /* Increment call count */ @@ -1432,10 +1447,10 @@ call_verify(struct rpc_task *task) error = -EPROTONOSUPPORT; goto out_err; case RPC_PROC_UNAVAIL: - dprintk("RPC: %5u %s: proc %p unsupported by program %u, " + dprintk("RPC: %5u %s: proc %s unsupported by program %u, " "version %u on server %s\n", task->tk_pid, __func__, - task->tk_msg.rpc_proc, + rpc_proc_name(task), task->tk_client->cl_prog, task->tk_client->cl_vers, task->tk_client->cl_server); From b0e1c57ea00302c3ac541ffd37e7db07d13cd674 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 May 2008 17:09:19 -0400 Subject: [PATCH 13/82] SUNRPC: Rename "call_" functions that are no longer FSM states The RPC client uses a finite state machine to move RPC tasks through each step of an RPC request. Each state is contained in a function in net/sunrpc/clnt.c, and named call_foo. Some of the functions named call_foo have changed over the past few years and are no longer states in the FSM. These include: call_encode, call_header, and call_verify. As a clean up, rename the functions that have changed. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 68ea6dddcf1e..ab8038db8ef8 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -58,7 +58,6 @@ static void call_start(struct rpc_task *task); static void call_reserve(struct rpc_task *task); static void call_reserveresult(struct rpc_task *task); static void call_allocate(struct rpc_task *task); -static void call_encode(struct rpc_task *task); static void call_decode(struct rpc_task *task); static void call_bind(struct rpc_task *task); static void call_bind_status(struct rpc_task *task); @@ -70,9 +69,9 @@ static void call_refreshresult(struct rpc_task *task); static void call_timeout(struct rpc_task *task); static void call_connect(struct rpc_task *task); static void call_connect_status(struct rpc_task *task); -static __be32 * call_header(struct rpc_task *task); -static __be32 * call_verify(struct rpc_task *task); +static __be32 *rpc_encode_header(struct rpc_task *task); +static __be32 *rpc_verify_header(struct rpc_task *task); static int rpc_ping(struct rpc_clnt *clnt, int flags); static void rpc_register_client(struct rpc_clnt *clnt) @@ -876,7 +875,7 @@ rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) * 3. Encode arguments of an RPC call */ static void -call_encode(struct rpc_task *task) +rpc_xdr_encode(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; kxdrproc_t encode; @@ -891,13 +890,14 @@ call_encode(struct rpc_task *task) (char *)req->rq_buffer + req->rq_callsize, req->rq_rcvsize); - /* Encode header and provided arguments */ - encode = task->tk_msg.rpc_proc->p_encode; - if (!(p = call_header(task))) { - printk(KERN_INFO "RPC: call_header failed, exit EIO\n"); + p = rpc_encode_header(task); + if (p == NULL) { + printk(KERN_INFO "RPC: couldn't encode RPC header, exit EIO\n"); rpc_exit(task, -EIO); return; } + + encode = task->tk_msg.rpc_proc->p_encode; if (encode == NULL) return; @@ -1056,7 +1056,7 @@ call_transmit(struct rpc_task *task) /* Encode here so that rpcsec_gss can use correct sequence number. */ if (rpc_task_need_encode(task)) { BUG_ON(task->tk_rqstp->rq_bytes_sent != 0); - call_encode(task); + rpc_xdr_encode(task); /* Did the encode result in an error condition? */ if (task->tk_status != 0) { /* Was the error nonfatal? */ @@ -1240,8 +1240,7 @@ call_decode(struct rpc_task *task) goto out_retry; } - /* Verify the RPC header */ - p = call_verify(task); + p = rpc_verify_header(task); if (IS_ERR(p)) { if (p == ERR_PTR(-EAGAIN)) goto out_retry; @@ -1259,7 +1258,7 @@ call_decode(struct rpc_task *task) return; out_retry: task->tk_status = 0; - /* Note: call_verify() may have freed the RPC slot */ + /* Note: rpc_verify_header() may have freed the RPC slot */ if (task->tk_rqstp == req) { req->rq_received = req->rq_rcv_buf.len = 0; if (task->tk_client->cl_discrtry) @@ -1306,11 +1305,8 @@ call_refreshresult(struct rpc_task *task) return; } -/* - * Call header serialization - */ static __be32 * -call_header(struct rpc_task *task) +rpc_encode_header(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; @@ -1330,11 +1326,8 @@ call_header(struct rpc_task *task) return p; } -/* - * Reply header verification - */ static __be32 * -call_verify(struct rpc_task *task) +rpc_verify_header(struct rpc_task *task) { struct kvec *iov = &task->tk_rqstp->rq_rcv_buf.head[0]; int len = task->tk_rqstp->rq_rcv_buf.len >> 2; @@ -1408,7 +1401,7 @@ call_verify(struct rpc_task *task) task->tk_action = call_bind; goto out_retry; case RPC_AUTH_TOOWEAK: - printk(KERN_NOTICE "call_verify: server %s requires stronger " + printk(KERN_NOTICE "RPC: server %s requires stronger " "authentication.\n", task->tk_client->cl_server); break; default: From 68a23ee94e3a06819f5a39d64f2e1f3131bab12d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 May 2008 17:09:26 -0400 Subject: [PATCH 14/82] SUNRPC: Don't display the rpc_show_tasks header if there are no tasks Clean up: don't display the rpc_show_tasks column header unless there is at least one task to display. As far as I can tell, it is safe to let the list_for_each_entry macro decide that each list is empty. scripts/checkpatch.pl also wants a KERN_FOO at the start of any newly added printk() calls, so this and subsequent patches will also add KERN_INFO. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index ab8038db8ef8..7dda32851adb 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1526,24 +1526,30 @@ struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int EXPORT_SYMBOL_GPL(rpc_call_null); #ifdef RPC_DEBUG +static void rpc_show_header(void) +{ + printk(KERN_INFO "-pid- proc flgs status -client- -prog- --rqstp- " + "-timeout -rpcwait -action- ---ops--\n"); +} + void rpc_show_tasks(void) { struct rpc_clnt *clnt; struct rpc_task *t; + int header = 0; spin_lock(&rpc_client_lock); - if (list_empty(&all_clients)) - goto out; - printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout " - "-rpcwait -action- ---ops--\n"); list_for_each_entry(clnt, &all_clients, cl_clients) { - if (list_empty(&clnt->cl_tasks)) - continue; spin_lock(&clnt->cl_lock); list_for_each_entry(t, &clnt->cl_tasks, tk_task) { const char *rpc_waitq = "none"; int proc; + if (!header) { + rpc_show_header(); + header++; + } + if (t->tk_msg.rpc_proc) proc = t->tk_msg.rpc_proc->p_proc; else @@ -1563,7 +1569,6 @@ void rpc_show_tasks(void) } spin_unlock(&clnt->cl_lock); } -out: spin_unlock(&rpc_client_lock); } #endif From 38e886e0c18975543938519254fc9bf0829c75a3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 May 2008 17:09:33 -0400 Subject: [PATCH 15/82] SUNRPC: Refactor rpc_show_tasks Clean up: move the logic that displays each task to its own function. This removes indentation and makes future changes easier. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7dda32851adb..7964a98c90e5 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1532,40 +1532,42 @@ static void rpc_show_header(void) "-timeout -rpcwait -action- ---ops--\n"); } +static void rpc_show_task(const struct rpc_clnt *clnt, + const struct rpc_task *task) +{ + const char *rpc_waitq = "none"; + int proc = -1; + + if (task->tk_msg.rpc_proc) + proc = task->tk_msg.rpc_proc->p_proc; + + if (RPC_IS_QUEUED(task)) + rpc_waitq = rpc_qname(task->tk_waitqueue); + + printk(KERN_INFO "%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n", + task->tk_pid, proc, + task->tk_flags, task->tk_status, + clnt, clnt->cl_prog, + task->tk_rqstp, task->tk_timeout, + rpc_waitq, + task->tk_action, task->tk_ops); +} + void rpc_show_tasks(void) { struct rpc_clnt *clnt; - struct rpc_task *t; + struct rpc_task *task; int header = 0; spin_lock(&rpc_client_lock); list_for_each_entry(clnt, &all_clients, cl_clients) { spin_lock(&clnt->cl_lock); - list_for_each_entry(t, &clnt->cl_tasks, tk_task) { - const char *rpc_waitq = "none"; - int proc; - + list_for_each_entry(task, &clnt->cl_tasks, tk_task) { if (!header) { rpc_show_header(); header++; } - - if (t->tk_msg.rpc_proc) - proc = t->tk_msg.rpc_proc->p_proc; - else - proc = -1; - - if (RPC_IS_QUEUED(t)) - rpc_waitq = rpc_qname(t->tk_waitqueue); - - printk("%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n", - t->tk_pid, proc, - t->tk_flags, t->tk_status, - t->tk_client, - (t->tk_client ? t->tk_client->cl_prog : 0), - t->tk_rqstp, t->tk_timeout, - rpc_waitq, - t->tk_action, t->tk_ops); + rpc_show_task(clnt, task); } spin_unlock(&clnt->cl_lock); } From cb3997b5a0b21864368bd1bd1d0929f9304fb6d9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 May 2008 17:09:41 -0400 Subject: [PATCH 16/82] SUNRPC: Display some debugging information as text rather than numbers In rpc_show_tasks(), display the program name, version number, procedure name and tk_action as human-readable variable-length text fields rather than columnar numbers. Doing the symbol lookup here helps in cases where we have actual debugging output from a kernel log, but don't have access to the kernel image or RPC module that generated the output. Sample output: -pid- flgs status -client- --rqstp- -timeout ---ops-- 5608 0001 -11 eeb42690 f6d93710 0 f8fa1764 nfsv3 WRITE a:call_transmit_status q:none 5609 0001 -11 eeb42690 f6d937e0 0 f8fa1764 nfsv3 WRITE a:call_status q:xprt_sending 5610 0001 -11 eeb42690 f6d93230 0 f8fa1764 nfsv3 WRITE a:call_status q:xprt_sending 5611 0001 -11 eeb42690 f6d93300 0 f8fa1764 nfsv3 WRITE a:call_status q:xprt_sending 5612 0001 -11 eeb42690 f6d93090 0 f8fa1764 nfsv3 WRITE a:call_status q:xprt_sending 5613 0001 -11 eeb42690 f6d933d0 0 f8fa1764 nfsv3 WRITE a:call_status q:xprt_sending 5614 0001 -11 eeb42690 f6d93cc0 0 f8fa1764 nfsv3 WRITE a:call_status q:xprt_sending 5615 0001 -11 eeb42690 f6d93a50 0 f8fa1764 nfsv3 WRITE a:call_status q:xprt_sending 5616 0001 -11 eeb42690 f6d93640 0 f8fa1764 nfsv3 WRITE a:call_status q:xprt_sending 5617 0001 -11 eeb42690 f6d93b20 0 f8fa1764 nfsv3 WRITE a:call_status q:xprt_sending 5618 0001 -11 eeb42690 f6d93160 0 f8fa1764 nfsv3 WRITE a:call_status q:xprt_sending Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7964a98c90e5..0530eea37b59 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -1528,29 +1529,31 @@ EXPORT_SYMBOL_GPL(rpc_call_null); #ifdef RPC_DEBUG static void rpc_show_header(void) { - printk(KERN_INFO "-pid- proc flgs status -client- -prog- --rqstp- " - "-timeout -rpcwait -action- ---ops--\n"); + printk(KERN_INFO "-pid- flgs status -client- --rqstp- " + "-timeout ---ops--\n"); } static void rpc_show_task(const struct rpc_clnt *clnt, const struct rpc_task *task) { const char *rpc_waitq = "none"; - int proc = -1; - - if (task->tk_msg.rpc_proc) - proc = task->tk_msg.rpc_proc->p_proc; + char *p, action[KSYM_SYMBOL_LEN]; if (RPC_IS_QUEUED(task)) rpc_waitq = rpc_qname(task->tk_waitqueue); - printk(KERN_INFO "%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n", - task->tk_pid, proc, - task->tk_flags, task->tk_status, - clnt, clnt->cl_prog, - task->tk_rqstp, task->tk_timeout, - rpc_waitq, - task->tk_action, task->tk_ops); + /* map tk_action pointer to a function name; then trim off + * the "+0x0 [sunrpc]" */ + sprint_symbol(action, (unsigned long)task->tk_action); + p = strchr(action, '+'); + if (p) + *p = '\0'; + + printk(KERN_INFO "%5u %04x %6d %8p %8p %8ld %8p %sv%u %s a:%s q:%s\n", + task->tk_pid, task->tk_flags, task->tk_status, + clnt, task->tk_rqstp, task->tk_timeout, task->tk_ops, + clnt->cl_protname, clnt->cl_vers, rpc_proc_name(task), + action, rpc_waitq); } void rpc_show_tasks(void) From 549177863bac22f23663ee9f70c4e3b9fb269f2c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 May 2008 16:29:07 -0400 Subject: [PATCH 17/82] NFS: Make nfs_fsync methods consistent Clean up: Report the same debugging info, count function calls the same, and use similar function naming in nfs_fsync_dir() and nfs_fsync(). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 3 ++- fs/nfs/file.c | 12 +++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 982a2064fe4c..5d73fbd67070 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -629,10 +629,11 @@ out: */ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) { - dfprintk(VFS, "NFS: fsync_dir(%s/%s) datasync %d\n", + dfprintk(VFS, "NFS: fsync dir(%s/%s) datasync %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, datasync); + nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC); return 0; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0213c21038fa..1789de218cca 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -50,7 +50,7 @@ static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, unsigned long nr_segs, loff_t pos); static int nfs_file_flush(struct file *, fl_owner_t id); -static int nfs_fsync(struct file *, struct dentry *dentry, int datasync); +static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync); static int nfs_check_flags(int flags); static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); @@ -72,7 +72,7 @@ const struct file_operations nfs_file_operations = { .open = nfs_file_open, .flush = nfs_file_flush, .release = nfs_file_release, - .fsync = nfs_fsync, + .fsync = nfs_file_fsync, .lock = nfs_lock, .flock = nfs_flock, .splice_read = nfs_file_splice_read, @@ -181,7 +181,7 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) } /* - * Helper for nfs_file_flush() and nfs_fsync() + * Helper for nfs_file_flush() and nfs_file_fsync() * * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to * disk, but it retrieves and clears ctx->error after synching, despite @@ -296,12 +296,14 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) * whether any write errors occurred for this process. */ static int -nfs_fsync(struct file *file, struct dentry *dentry, int datasync) +nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = dentry->d_inode; - dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); + dfprintk(VFS, "NFS: fsync file(%s/%s) datasync %d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); return nfs_do_fsync(ctx, inode); From b84e06c58fdefdc42931f771dc295e63f4b27365 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Jun 2008 17:55:34 -0400 Subject: [PATCH 18/82] NFS: Make nfs_llseek methods consistent Clean up: Report the same debugging info in nfs_llseek_dir() and nfs_llseek_file(). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 12 ++++++++++-- fs/nfs/file.c | 5 +++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5d73fbd67070..24571067bf72 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -603,7 +603,15 @@ out: static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) { - mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + + dfprintk(VFS, "NFS: llseek dir(%s/%s, %lld, %d)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name, + offset, origin); + + mutex_lock(&inode->i_mutex); switch (origin) { case 1: offset += filp->f_pos; @@ -619,7 +627,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) nfs_file_open_context(filp)->dir_cookie = 0; } out: - mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); + mutex_unlock(&inode->i_mutex); return offset; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 1789de218cca..cef36362c9eb 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -170,6 +170,11 @@ force_reval: static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) { + dfprintk(VFS, "NFS: llseek file(%s/%s, %lld, %d)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, + offset, origin); + /* origin == SEEK_END => we must revalidate the cached file length */ if (origin == SEEK_END) { struct inode *inode = filp->f_mapping->host; From cc0dd2d1052aede2946ad1338a8f6f5d5c604740 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Jun 2008 17:55:42 -0400 Subject: [PATCH 19/82] NFS: Make nfs_open methods consistent Clean up: Report the same debugging info and count function calls the same for files and directories in nfs_opendir() and nfs_file_open(). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 7 +++++-- fs/nfs/file.c | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 24571067bf72..c962233c094a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -133,8 +133,11 @@ nfs_opendir(struct inode *inode, struct file *filp) { int res; - dfprintk(VFS, "NFS: opendir(%s/%ld)\n", - inode->i_sb->s_id, inode->i_ino); + dfprintk(VFS, "NFS: open dir(%s/%s)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); + + nfs_inc_stats(inode, NFSIOS_VFSOPEN); lock_kernel(); /* Call generic open code in order to cache credentials */ diff --git a/fs/nfs/file.c b/fs/nfs/file.c index cef36362c9eb..202408d96ee8 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -119,6 +119,10 @@ nfs_file_open(struct inode *inode, struct file *filp) { int res; + dfprintk(VFS, "NFS: open file(%s/%s)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); + res = nfs_check_flags(filp->f_flags); if (res) return res; From b7eaefaa8722fd98e5c2632640d1abd2b0c83e84 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Jun 2008 17:55:50 -0400 Subject: [PATCH 20/82] NFS: Add debugging facility for NFS aops Recent work in fs/nfs/file.c neglected to add appropriate trace debugging for the NFS client's address space operations. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 202408d96ee8..6c447a37edef 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -335,6 +335,11 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, struct page *page; index = pos >> PAGE_CACHE_SHIFT; + dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + page = __grab_cache_page(mapping, index); if (!page) return -ENOMEM; @@ -355,6 +360,11 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, unsigned offset = pos & (PAGE_CACHE_SIZE - 1); int status; + dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + /* * Zero any uninitialised parts of the page, and then mark the page * as up to date if it turns out that we're extending the file. @@ -389,6 +399,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, static void nfs_invalidate_page(struct page *page, unsigned long offset) { + dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); + if (offset != 0) return; /* Cancel any unstarted writes on this page */ @@ -397,13 +409,20 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) static int nfs_release_page(struct page *page, gfp_t gfp) { + dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); + /* If PagePrivate() is set, then the page is not freeable */ return 0; } static int nfs_launder_page(struct page *page) { - return nfs_wb_page(page->mapping->host, page); + struct inode *inode = page->mapping->host; + + dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", + inode->i_ino, (long long)page_offset(page)); + + return nfs_wb_page(inode, page); } const struct address_space_operations nfs_file_aops = { @@ -423,13 +442,19 @@ const struct address_space_operations nfs_file_aops = { static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) { struct file *filp = vma->vm_file; + struct dentry *dentry = filp->f_path.dentry; unsigned pagelen; int ret = -EINVAL; struct address_space *mapping; + dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + filp->f_mapping->host->i_ino, + (long long)page_offset(page)); + lock_page(page); mapping = page->mapping; - if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping) + if (mapping != dentry->d_inode->i_mapping) goto out_unlock; ret = 0; From 6da24bc9cfc645c619992e39aab09747164c9f14 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Jun 2008 17:55:58 -0400 Subject: [PATCH 21/82] NFS: Use NFSDBG_FILE for all fops Clean up: some fops use NFSDBG_FILE, some use NFSDBG_VFS. Let's use NFSDBG_FILE for all fops, and consistently report file names instead of inode numbers. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 10 ++++----- fs/nfs/direct.c | 4 ++-- fs/nfs/file.c | 59 ++++++++++++++++++++++++++++++------------------- 3 files changed, 43 insertions(+), 30 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c962233c094a..b1940660502f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -133,7 +133,7 @@ nfs_opendir(struct inode *inode, struct file *filp) { int res; - dfprintk(VFS, "NFS: open dir(%s/%s)\n", + dfprintk(FILE, "NFS: open dir(%s/%s)\n", filp->f_path.dentry->d_parent->d_name.name, filp->f_path.dentry->d_name.name); @@ -531,7 +531,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct nfs_fattr fattr; long res; - dfprintk(VFS, "NFS: readdir(%s/%s) starting at cookie %Lu\n", + dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", dentry->d_parent->d_name.name, dentry->d_name.name, (long long)filp->f_pos); nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); @@ -598,7 +598,7 @@ out: unlock_kernel(); if (res > 0) res = 0; - dfprintk(VFS, "NFS: readdir(%s/%s) returns %ld\n", + dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", dentry->d_parent->d_name.name, dentry->d_name.name, res); return res; @@ -609,7 +609,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; - dfprintk(VFS, "NFS: llseek dir(%s/%s, %lld, %d)\n", + dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", dentry->d_parent->d_name.name, dentry->d_name.name, offset, origin); @@ -640,7 +640,7 @@ out: */ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) { - dfprintk(VFS, "NFS: fsync dir(%s/%s) datasync %d\n", + dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, datasync); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 4757a2b326a1..08f6b040d289 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -890,7 +890,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, count = iov_length(iov, nr_segs); nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); - dprintk("nfs: direct read(%s/%s, %zd@%Ld)\n", + dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, count, (long long) pos); @@ -947,7 +947,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, count = iov_length(iov, nr_segs); nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); - dfprintk(VFS, "nfs: direct write(%s/%s, %zd@%Ld)\n", + dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, count, (long long) pos); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6c447a37edef..78b26f875eee 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -119,7 +119,7 @@ nfs_file_open(struct inode *inode, struct file *filp) { int res; - dfprintk(VFS, "NFS: open file(%s/%s)\n", + dprintk("NFS: open file(%s/%s)\n", filp->f_path.dentry->d_parent->d_name.name, filp->f_path.dentry->d_name.name); @@ -137,9 +137,15 @@ nfs_file_open(struct inode *inode, struct file *filp) static int nfs_file_release(struct inode *inode, struct file *filp) { + struct dentry *dentry = filp->f_path.dentry; + + dprintk("NFS: release(%s/%s)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); + /* Ensure that dirty pages are flushed out with the right creds */ if (filp->f_mode & FMODE_WRITE) - nfs_wb_all(filp->f_path.dentry->d_inode); + nfs_wb_all(dentry->d_inode); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return NFS_PROTO(inode)->file_release(inode, filp); } @@ -174,7 +180,7 @@ force_reval: static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) { - dfprintk(VFS, "NFS: llseek file(%s/%s, %lld, %d)\n", + dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", filp->f_path.dentry->d_parent->d_name.name, filp->f_path.dentry->d_name.name, offset, origin); @@ -216,16 +222,18 @@ static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode) /* * Flush all dirty pages, and check for write errors. - * */ static int nfs_file_flush(struct file *file, fl_owner_t id) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct inode *inode = file->f_path.dentry->d_inode; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; int status; - dfprintk(VFS, "nfs: flush(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); + dprintk("NFS: flush(%s/%s)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); if ((file->f_mode & FMODE_WRITE) == 0) return 0; @@ -250,7 +258,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_read(iocb, iov, nr_segs, pos); - dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n", + dprintk("NFS: read(%s/%s, %lu@%lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long) count, (unsigned long) pos); @@ -270,7 +278,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, struct inode *inode = dentry->d_inode; ssize_t res; - dfprintk(VFS, "nfs: splice_read(%s/%s, %lu@%Lu)\n", + dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long) count, (unsigned long long) *ppos); @@ -287,7 +295,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) struct inode *inode = dentry->d_inode; int status; - dfprintk(VFS, "nfs: mmap(%s/%s)\n", + dprintk("NFS: mmap(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); status = nfs_revalidate_mapping(inode, file->f_mapping); @@ -310,7 +318,7 @@ nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = dentry->d_inode; - dfprintk(VFS, "NFS: fsync file(%s/%s) datasync %d\n", + dprintk("NFS: fsync file(%s/%s) datasync %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, datasync); @@ -502,9 +510,9 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_write(iocb, iov, nr_segs, pos); - dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%Ld)\n", + dprintk("NFS: write(%s/%s, %lu@%Ld)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - inode->i_ino, (unsigned long) count, (long long) pos); + (unsigned long) count, (long long) pos); result = -EBUSY; if (IS_SWAPFILE(inode)) @@ -649,13 +657,15 @@ out: */ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) { - struct inode * inode = filp->f_mapping->host; + struct inode *inode = filp->f_mapping->host; int ret = -ENOLCK; - dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n", - inode->i_sb->s_id, inode->i_ino, + dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, fl->fl_type, fl->fl_flags, (long long)fl->fl_start, (long long)fl->fl_end); + nfs_inc_stats(inode, NFSIOS_VFSLOCK); /* No mandatory locks over NFS */ @@ -683,9 +693,9 @@ out_err: */ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) { - dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n", - filp->f_path.dentry->d_inode->i_sb->s_id, - filp->f_path.dentry->d_inode->i_ino, + dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, fl->fl_type, fl->fl_flags); /* @@ -708,12 +718,15 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) return do_setlk(filp, cmd, fl); } +/* + * There is no protocol support for leases, so we have no way to implement + * them correctly in the face of opens by other clients. + */ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) { - /* - * There is no protocol support for leases, so we have no way - * to implement them correctly in the face of opens by other - * clients. - */ + dprintk("NFS: setlease(%s/%s, arg=%ld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, arg); + return -EINVAL; } From 48186c7d5734a6b137f9186b37f6dc98097d0429 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Jun 2008 17:56:05 -0400 Subject: [PATCH 22/82] NFS: Fix trace debugging nits in write.c Clean up: fix a few dprintk messages that still need to show the RPC task ID correctly, and be sure we use the preferred %lld or %llu instead of %Ld or %Lu. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ee6fcdecb871..21d8a48b624b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -713,10 +713,10 @@ int nfs_updatepage(struct file *file, struct page *page, nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); - dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n", + dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, count, - (long long)(page_offset(page) +offset)); + (long long)(page_offset(page) + offset)); /* If we're not using byte range locks, and we know the page * is up to date, it may be more efficient to extend the write @@ -736,7 +736,7 @@ int nfs_updatepage(struct file *file, struct page *page, else __set_page_dirty_nobuffers(page); - dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", + dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; } @@ -821,7 +821,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, NFS_PROTO(inode)->write_setup(data, &msg); dprintk("NFS: %5u initiated write call " - "(req %s/%Ld, %u bytes @ offset %Lu)\n", + "(req %s/%lld, %u bytes @ offset %llu)\n", data->task.tk_pid, inode->i_sb->s_id, (long long)NFS_FILEID(inode), @@ -965,13 +965,13 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - struct nfs_page *req = data->req; - dprintk("NFS: write (%s/%Ld %d@%Ld)", - req->wb_context->path.dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), - req->wb_bytes, - (long long)req_offset(req)); + dprintk("NFS: %5u write(%s/%lld %d@%lld)", + task->tk_pid, + data->req->wb_context->path.dentry->d_inode->i_sb->s_id, + (long long) + NFS_FILEID(data->req->wb_context->path.dentry->d_inode), + data->req->wb_bytes, (long long)req_offset(data->req)); nfs_writeback_done(task, data); } @@ -1045,7 +1045,8 @@ static void nfs_writeback_release_full(void *calldata) nfs_list_remove_request(req); - dprintk("NFS: write (%s/%Ld %d@%Ld)", + dprintk("NFS: %5u write (%s/%lld %d@%lld)", + data->task.tk_pid, req->wb_context->path.dentry->d_inode->i_sb->s_id, (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), req->wb_bytes, @@ -1118,7 +1119,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) static unsigned long complain; if (time_before(complain, jiffies)) { - dprintk("NFS: faulty NFS server %s:" + dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", NFS_SERVER(data->inode)->nfs_client->cl_hostname, resp->verf->committed, argp->stable); @@ -1287,7 +1288,7 @@ static void nfs_commit_release(void *calldata) dec_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); - dprintk("NFS: commit (%s/%Ld %d@%Ld)", + dprintk("NFS: commit (%s/%lld %d@%lld)", req->wb_context->path.dentry->d_inode->i_sb->s_id, (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), req->wb_bytes, From cd983ef81b9d79573848dabf81277c7314220257 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Jun 2008 17:56:13 -0400 Subject: [PATCH 23/82] SUNRPC: Remove obsolete messages during transport connect Recent changes to the RPC client's transport connect logic make connect status values ECONNREFUSED and ECONNRESET impossible. Clean up xprt_connect_status() to account for these changes. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e1770f7ba0b3..67996bd7fbf9 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -690,7 +690,7 @@ static void xprt_connect_status(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - if (task->tk_status >= 0) { + if (task->tk_status == 0) { xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; dprintk("RPC: %5u xprt_connect_status: connection established\n", @@ -699,12 +699,6 @@ static void xprt_connect_status(struct rpc_task *task) } switch (task->tk_status) { - case -ECONNREFUSED: - case -ECONNRESET: - dprintk("RPC: %5u xprt_connect_status: server %s refused " - "connection\n", task->tk_pid, - task->tk_client->cl_server); - break; case -ENOTCONN: dprintk("RPC: %5u xprt_connect_status: connection broken\n", task->tk_pid); From c2d946e55e1be20a7443918e3b7497b905e6b3f7 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 20 May 2008 01:08:00 +0300 Subject: [PATCH 24/82] fs/nfs/nfsroot.c: remove CVS keyword This patch removes a CVS keyword that wasn't updated for a long time from a comment. Signed-off-by: Adrian Bunk Signed-off-by: Trond Myklebust --- fs/nfs/nfsroot.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 531379d36823..15afe3a6c5be 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -1,6 +1,4 @@ /* - * $Id: nfsroot.c,v 1.45 1998/03/07 10:44:46 mj Exp $ - * * Copyright (C) 1995, 1996 Gero Kuhlmann * * Allow an NFS filesystem to be mounted as root. The way this works is: From 48b605f83c920d8daa50e43fc2c7f718e04c7bfa Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 10 Jun 2008 15:38:39 -0400 Subject: [PATCH 25/82] NFS: implement option checking when remounting NFS filesystems (resend) When remounting an NFS or NFS4 filesystem, the new NFS options are not respected, yet the remount will still return success. This patch adds a remount_fs sb op for NFS that checks any new nfs mount options against the existing ones and fails the mount if any have changed. This is only implemented for string-based mount options since doing this with binary options isn't really feasible. This is essentially the same as the original patch I sent out, but adds a check to see if the addr= option has changed. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 614efeed5437..b550e921f661 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -207,6 +207,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); static void nfs_kill_super(struct super_block *); static void nfs_put_super(struct super_block *); +static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); static struct file_system_type nfs_fs_type = { .owner = THIS_MODULE, @@ -234,6 +235,7 @@ static const struct super_operations nfs_sops = { .umount_begin = nfs_umount_begin, .show_options = nfs_show_options, .show_stats = nfs_show_stats, + .remount_fs = nfs_remount, }; #ifdef CONFIG_NFS_V4 @@ -278,6 +280,7 @@ static const struct super_operations nfs4_sops = { .umount_begin = nfs_umount_begin, .show_options = nfs_show_options, .show_stats = nfs_show_stats, + .remount_fs = nfs_remount, }; #endif @@ -1396,6 +1399,79 @@ out_invalid_fh: return -EINVAL; } +static int +nfs_compare_remount_data(struct nfs_server *nfss, + struct nfs_parsed_mount_data *data) +{ + if (data->flags != nfss->flags || + data->rsize != nfss->rsize || + data->wsize != nfss->wsize || + data->retrans != nfss->client->cl_timeout->to_retries || + data->auth_flavors[0] != nfss->client->cl_auth->au_flavor || + data->acregmin != nfss->acregmin / HZ || + data->acregmax != nfss->acregmax / HZ || + data->acdirmin != nfss->acdirmin / HZ || + data->acdirmax != nfss->acdirmax / HZ || + data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || + data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || + memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr, + data->nfs_server.addrlen) != 0) + return -EINVAL; + + return 0; +} + +static int +nfs_remount(struct super_block *sb, int *flags, char *raw_data) +{ + int error; + struct nfs_server *nfss = sb->s_fs_info; + struct nfs_parsed_mount_data *data; + struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data; + struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; + + /* + * Userspace mount programs that send binary options generally send + * them populated with default values. We have no way to know which + * ones were explicitly specified. Fall back to legacy behavior and + * just return success. + */ + if ((sb->s_type == &nfs4_fs_type && options4->version == 1) || + (sb->s_type == &nfs_fs_type && options->version >= 1 && + options->version <= 6)) + return 0; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + + /* fill out struct with values from existing mount */ + data->flags = nfss->flags; + data->rsize = nfss->rsize; + data->wsize = nfss->wsize; + data->retrans = nfss->client->cl_timeout->to_retries; + data->auth_flavors[0] = nfss->client->cl_auth->au_flavor; + data->acregmin = nfss->acregmin / HZ; + data->acregmax = nfss->acregmax / HZ; + data->acdirmin = nfss->acdirmin / HZ; + data->acdirmax = nfss->acdirmax / HZ; + data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; + data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; + memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, + data->nfs_server.addrlen); + + /* overwrite those values with any that were specified */ + error = nfs_parse_mount_options((char *)options, data); + if (error < 0) + goto out; + + /* compare new mount options with old ones */ + error = nfs_compare_remount_data(nfss, data); +out: + kfree(data); + return error; +} + /* * Initialise the common bits of the superblock */ From b6b6152c46861dd914d0e6cea9c27df057d6e235 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 9 Jun 2008 16:51:31 -0400 Subject: [PATCH 26/82] rpc: bring back cl_chatty The cl_chatty flag alows us to control whether a given rpc client leaves "server X not responding, timed out" messages in the syslog. Such messages make sense for ordinary nfs clients (where an unresponsive server means applications on the mountpoint are probably hanging), but not for the callback client (which can fail more commonly, with the only result just of disabling some optimizations). Previously cl_chatty was removed, do to lack of users; reinstate it, and use it for the nfsd's callback client. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfsd/nfs4callback.c | 2 +- include/linux/sunrpc/clnt.h | 4 +++- net/sunrpc/clnt.c | 16 +++++++++++----- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 4d4760e687c3..702fa577aa6e 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -381,7 +381,7 @@ static int do_probe_callback(void *data) .program = &cb_program, .version = nfs_cb_version[1]->number, .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ - .flags = (RPC_CLNT_CREATE_NOPING), + .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), }; struct rpc_message msg = { .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 6fff7f82ef12..764fd4c286e0 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -42,7 +42,8 @@ struct rpc_clnt { unsigned int cl_softrtry : 1,/* soft timeouts */ cl_discrtry : 1,/* disconnect before retry */ - cl_autobind : 1;/* use getport() */ + cl_autobind : 1,/* use getport() */ + cl_chatty : 1;/* be verbose */ struct rpc_rtt * cl_rtt; /* RTO estimator data */ const struct rpc_timeout *cl_timeout; /* Timeout strategy */ @@ -114,6 +115,7 @@ struct rpc_create_args { #define RPC_CLNT_CREATE_NONPRIVPORT (1UL << 3) #define RPC_CLNT_CREATE_NOPING (1UL << 4) #define RPC_CLNT_CREATE_DISCRTRY (1UL << 5) +#define RPC_CLNT_CREATE_QUIET (1UL << 6) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 0530eea37b59..09631f6e30e9 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -324,6 +324,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) clnt->cl_autobind = 1; if (args->flags & RPC_CLNT_CREATE_DISCRTRY) clnt->cl_discrtry = 1; + if (!(args->flags & RPC_CLNT_CREATE_QUIET)) + clnt->cl_chatty = 1; return clnt; } @@ -1149,7 +1151,8 @@ call_status(struct rpc_task *task) rpc_exit(task, status); break; default: - printk("%s: RPC call returned error %d\n", + if (clnt->cl_chatty) + printk("%s: RPC call returned error %d\n", clnt->cl_protname, -status); rpc_exit(task, status); } @@ -1174,7 +1177,8 @@ call_timeout(struct rpc_task *task) task->tk_timeouts++; if (RPC_IS_SOFT(task)) { - printk(KERN_NOTICE "%s: server %s not responding, timed out\n", + if (clnt->cl_chatty) + printk(KERN_NOTICE "%s: server %s not responding, timed out\n", clnt->cl_protname, clnt->cl_server); rpc_exit(task, -EIO); return; @@ -1182,7 +1186,8 @@ call_timeout(struct rpc_task *task) if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) { task->tk_flags |= RPC_CALL_MAJORSEEN; - printk(KERN_NOTICE "%s: server %s not responding, still trying\n", + if (clnt->cl_chatty) + printk(KERN_NOTICE "%s: server %s not responding, still trying\n", clnt->cl_protname, clnt->cl_server); } rpc_force_rebind(clnt); @@ -1213,8 +1218,9 @@ call_decode(struct rpc_task *task) task->tk_pid, task->tk_status); if (task->tk_flags & RPC_CALL_MAJORSEEN) { - printk(KERN_NOTICE "%s: server %s OK\n", - clnt->cl_protname, clnt->cl_server); + if (clnt->cl_chatty) + printk(KERN_NOTICE "%s: server %s OK\n", + clnt->cl_protname, clnt->cl_server); task->tk_flags &= ~RPC_CALL_MAJORSEEN; } From 720b8f2d6f7de9e16f1217448cc7396e1604e175 Mon Sep 17 00:00:00 2001 From: "\\\\\\\"J. Bruce Fields\\\\\\" Date: Mon, 9 Jun 2008 16:51:33 -0400 Subject: [PATCH 27/82] rpc: eliminate unused variable in auth_gss upcall code Also, a minor comment grammar fix in the same file. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index bf7585b80543..ebfd630541fa 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -272,7 +272,7 @@ __gss_find_upcall(struct rpc_inode *rpci, uid_t uid) return NULL; } -/* Try to add a upcall to the pipefs queue. +/* Try to add an upcall to the pipefs queue. * If an upcall owned by our uid already exists, then we return a reference * to that upcall instead of adding the new upcall. */ @@ -493,7 +493,6 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { const void *p, *end; void *buf; - struct rpc_clnt *clnt; struct gss_upcall_msg *gss_msg; struct inode *inode = filp->f_path.dentry->d_inode; struct gss_cl_ctx *ctx; @@ -507,7 +506,6 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (!buf) goto out; - clnt = RPC_I(inode)->private; err = -EFAULT; if (copy_from_user(buf, src, mlen)) goto err; From d25a03cf966f2cf9990dc0bf2a921a554919ea34 Mon Sep 17 00:00:00 2001 From: "\\\\\\\"J. Bruce Fields\\\\\\" Date: Mon, 9 Jun 2008 16:51:34 -0400 Subject: [PATCH 28/82] rpc: remove some unused macros There used to be a print_hexl() function that used isprint(), now gone. I don't know why NFS_NGROUPS and CA_RUN_AS_MACHINE were here. I also don't know why another #define that's actually used was marked "unused". Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index ebfd630541fa..834a83199bdf 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -63,22 +63,11 @@ static const struct rpc_credops gss_nullops; # define RPCDBG_FACILITY RPCDBG_AUTH #endif -#define NFS_NGROUPS 16 - -#define GSS_CRED_SLACK 1024 /* XXX: unused */ +#define GSS_CRED_SLACK 1024 /* length of a krb5 verifier (48), plus data added before arguments when * using integrity (two 4-byte integers): */ #define GSS_VERF_SLACK 100 -/* XXX this define must match the gssd define -* as it is passed to gssd to signal the use of -* machine creds should be part of the shared rpc interface */ - -#define CA_RUN_AS_MACHINE 0x00000200 - -/* dump the buffer in `emacs-hexl' style */ -#define isprint(c) ((c > 0x1f) && (c < 0x7f)) - struct gss_auth { struct kref kref; struct rpc_auth rpc_auth; From a486aeda9b2b0d944aecce7871b3186379b898de Mon Sep 17 00:00:00 2001 From: "\\\\\\\"J. Bruce Fields\\\\\\" Date: Mon, 9 Jun 2008 16:51:35 -0400 Subject: [PATCH 29/82] rpc: minor cleanup of scheduler callback code Try to make the comment here a little more clear and concise. Also, this macro definition seems unnecessary. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 1 - net/sunrpc/sched.c | 14 +++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index d1a5c8c1a0f1..64981a2f1cae 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -135,7 +135,6 @@ struct rpc_task_setup { #define RPC_IS_SWAPPER(t) ((t)->tk_flags & RPC_TASK_SWAPPER) #define RPC_DO_ROOTOVERRIDE(t) ((t)->tk_flags & RPC_TASK_ROOTCREDS) #define RPC_ASSASSINATED(t) ((t)->tk_flags & RPC_TASK_KILLED) -#define RPC_DO_CALLBACK(t) ((t)->tk_callback != NULL) #define RPC_IS_SOFT(t) ((t)->tk_flags & RPC_TASK_SOFT) #define RPC_TASK_RUNNING 0 diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 6eab9bf94baf..6288af05c20f 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -626,19 +626,15 @@ static void __rpc_execute(struct rpc_task *task) /* * Execute any pending callback. */ - if (RPC_DO_CALLBACK(task)) { - /* Define a callback save pointer */ + if (task->tk_callback) { void (*save_callback)(struct rpc_task *); /* - * If a callback exists, save it, reset it, - * call it. - * The save is needed to stop from resetting - * another callback set within the callback handler - * - Dave + * We set tk_callback to NULL before calling it, + * in case it sets the tk_callback field itself: */ - save_callback=task->tk_callback; - task->tk_callback=NULL; + save_callback = task->tk_callback; + task->tk_callback = NULL; save_callback(task); } From 659bfcd6dd88919a5ad453f62afbeffcb3106847 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Jun 2008 19:39:41 -0400 Subject: [PATCH 30/82] NFS: Fix the ftruncate() credential problem ftruncate() access checking is supposed to be performed at open() time, just like reads and writes. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 4 ++-- fs/nfs/nfs3proc.c | 2 ++ fs/nfs/nfs4proc.c | 49 +++++++++++++++++++++++------------------------ fs/nfs/proc.c | 2 ++ 4 files changed, 30 insertions(+), 27 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 596c5d8e86f4..2e4ab4a5e107 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -347,7 +347,7 @@ out_no_inode: goto out; } -#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET) +#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE) int nfs_setattr(struct dentry *dentry, struct iattr *attr) @@ -369,7 +369,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) /* Optimization: if the end result is no change, don't RPC */ attr->ia_valid &= NFS_VALID_ATTRS; - if (attr->ia_valid == 0) + if ((attr->ia_valid & ~ATTR_FILE) == 0) return 0; lock_kernel(); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index cf7d4e5927d6..b9c2d995332b 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -129,6 +129,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, int status; dprintk("NFS call setattr\n"); + if (sattr->ia_valid & ATTR_FILE) + msg.rpc_cred = nfs_file_cred(sattr->ia_file); nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 26fbeb3467cb..31a7e4c54a12 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1139,8 +1139,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int return res; } -static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, - struct iattr *sattr, struct nfs4_state *state) +static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_setattrargs arg = { @@ -1154,9 +1155,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, .server = server, }; struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], - .rpc_argp = &arg, - .rpc_resp = &res, + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = cred, }; unsigned long timestamp = jiffies; int status; @@ -1166,7 +1168,6 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { /* Use that stateid */ } else if (state != NULL) { - msg.rpc_cred = state->owner->so_cred; nfs4_copy_stateid(&arg.stateid, state, current->files); } else memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); @@ -1177,15 +1178,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, return status; } -static int nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, - struct iattr *sattr, struct nfs4_state *state) +static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(inode); struct nfs4_exception exception = { }; int err; do { err = nfs4_handle_exception(server, - _nfs4_do_setattr(inode, fattr, sattr, state), + _nfs4_do_setattr(inode, cred, fattr, sattr, state), &exception); } while (exception.retry); return err; @@ -1647,29 +1649,25 @@ static int nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct iattr *sattr) { - struct rpc_cred *cred; struct inode *inode = dentry->d_inode; - struct nfs_open_context *ctx; + struct rpc_cred *cred = NULL; struct nfs4_state *state = NULL; int status; nfs_fattr_init(fattr); - cred = rpc_lookup_cred(); - if (IS_ERR(cred)) - return PTR_ERR(cred); - /* Search for an existing open(O_WRITE) file */ - ctx = nfs_find_open_context(inode, cred, FMODE_WRITE); - if (ctx != NULL) - state = ctx->state; + if (sattr->ia_valid & ATTR_FILE) { + struct nfs_open_context *ctx; - status = nfs4_do_setattr(inode, fattr, sattr, state); + ctx = nfs_file_open_context(sattr->ia_file); + cred = ctx->cred; + state = ctx->state; + } + + status = nfs4_do_setattr(inode, cred, fattr, sattr, state); if (status == 0) nfs_setattr_update_inode(inode, sattr); - if (ctx != NULL) - put_nfs_open_context(ctx); - put_rpccred(cred); return status; } @@ -1897,17 +1895,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, goto out; } state = nfs4_do_open(dir, &path, flags, sattr, cred); - put_rpccred(cred); d_drop(dentry); if (IS_ERR(state)) { status = PTR_ERR(state); - goto out; + goto out_putcred; } d_add(dentry, igrab(state->inode)); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); if (flags & O_EXCL) { struct nfs_fattr fattr; - status = nfs4_do_setattr(state->inode, &fattr, sattr, state); + status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state); if (status == 0) nfs_setattr_update_inode(state->inode, sattr); nfs_post_op_update_inode(state->inode, &fattr); @@ -1916,6 +1913,8 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = nfs4_intent_set_file(nd, &path, state); else nfs4_close_sync(&path, state, flags); +out_putcred: + put_rpccred(cred); out: return status; } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 5c35b02857f3..c7605587d0eb 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -129,6 +129,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, sattr->ia_mode &= S_IALLUGO; dprintk("NFS call setattr\n"); + if (sattr->ia_valid & ATTR_FILE) + msg.rpc_cred = nfs_file_cred(sattr->ia_file); nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) From b22602a673b1743bba4b62bb404ffd3b269d2f09 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 6 Jun 2008 13:22:25 -0400 Subject: [PATCH 31/82] SUNRPC: Ensure all transports set rq_xtime consistently The RPC client uses the rq_xtime field in each RPC request to determine the round-trip time of the request. Currently, the rq_xtime field is initialized by each transport just before it starts enqueing a request to be sent. However, transports do not handle initializing this value consistently; sometimes they don't initialize it at all. To make the measurement of request round-trip time consistent for all RPC client transport capabilities, pull rq_xtime initialization into the RPC client's generic transport logic. Now all transports will get a standardized RTT measure automatically, from: xprt_transmit() to xprt_complete_rqst() This makes round-trip time calculation more accurate for the TCP transport. The socket ->sendmsg() method can return "-EAGAIN" if the socket's output buffer is full, so the TCP transport's ->send_request() method may call the ->sendmsg() method repeatedly until it gets all of the request's bytes queued in the socket's buffer. Currently, the TCP transport sets the rq_xtime field every time through that loop so the final value is the timestamp just before the *last* call to the underlying socket's ->sendmsg() method. After this patch, the rq_xtime field contains a timestamp that reflects the time just before the *first* call to ->sendmsg(). This is consequential under heavy workloads because large requests often take multiple ->sendmsg() calls to get all the bytes of a request queued. The TCP transport causes the request to sleep until the remote end of the socket has received enough bytes to clear space in the socket's local output buffer. This delay can be quite significant. The method introduced by this patch is a more accurate measure of RTT for stream transports, since the server can cause enough back pressure to delay (ie increase the latency of) requests from the client. Additionally, this patch corrects the behavior of the RDMA transport, which entirely neglected to initialize the rq_xtime field. RPC performance metrics for RDMA transports now display correct RPC request round trip times. Signed-off-by: Chuck Lever Acked-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 1 + net/sunrpc/xprtsock.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 67996bd7fbf9..99a52aabe332 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -872,6 +872,7 @@ void xprt_transmit(struct rpc_task *task) return; req->rq_connect_cookie = xprt->connect_cookie; + req->rq_xtime = jiffies; status = xprt->ops->send_request(task); if (status == 0) { dprintk("RPC: %5u xmit complete\n", task->tk_pid); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index ddbe981ab516..4486c59c3aca 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -579,7 +579,6 @@ static int xs_udp_send_request(struct rpc_task *task) req->rq_svec->iov_base, req->rq_svec->iov_len); - req->rq_xtime = jiffies; status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen, xdr, @@ -671,7 +670,6 @@ static int xs_tcp_send_request(struct rpc_task *task) * to cope with writespace callbacks arriving _after_ we have * called sendmsg(). */ while (1) { - req->rq_xtime = jiffies; status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent); From 46cb650c224bb8e64a749090105d74b9e8eda669 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 16:32:46 -0400 Subject: [PATCH 32/82] NFS: Remove the redundant file_open entry from struct nfs_rpc_ops All instances are set to nfs_open(), so we should just remove the redundant indirection. Ditto for the file_release op Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 4 ++-- fs/nfs/nfs3proc.c | 2 -- fs/nfs/nfs4proc.c | 2 -- fs/nfs/proc.c | 2 -- include/linux/nfs_xdr.h | 2 -- 5 files changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 78b26f875eee..509dcb58959e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -129,7 +129,7 @@ nfs_file_open(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSOPEN); lock_kernel(); - res = NFS_PROTO(inode)->file_open(inode, filp); + res = nfs_open(inode, filp); unlock_kernel(); return res; } @@ -147,7 +147,7 @@ nfs_file_release(struct inode *inode, struct file *filp) if (filp->f_mode & FMODE_WRITE) nfs_wb_all(dentry->d_inode); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); - return NFS_PROTO(inode)->file_release(inode, filp); + return nfs_release(inode, filp); } /** diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index b9c2d995332b..1e750e4574a9 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -816,8 +816,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .write_done = nfs3_write_done, .commit_setup = nfs3_proc_commit_setup, .commit_done = nfs3_commit_done, - .file_open = nfs_open, - .file_release = nfs_release, .lock = nfs3_proc_lock, .clear_acl_cache = nfs3_forget_cached_acls, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 31a7e4c54a12..dfdd19a6d17e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3714,8 +3714,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .write_done = nfs4_write_done, .commit_setup = nfs4_proc_commit_setup, .commit_done = nfs4_commit_done, - .file_open = nfs_open, - .file_release = nfs_release, .lock = nfs4_proc_lock, .clear_acl_cache = nfs4_zap_acl_attr, }; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index c7605587d0eb..4dbb84df1b68 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -655,8 +655,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .write_setup = nfs_proc_write_setup, .write_done = nfs_write_done, .commit_setup = nfs_proc_commit_setup, - .file_open = nfs_open, - .file_release = nfs_release, .lock = nfs_proc_lock, .lock_check_bounds = nfs_lock_check_bounds, }; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 8d780de371f0..8c77c11224d1 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -829,8 +829,6 @@ struct nfs_rpc_ops { int (*write_done) (struct rpc_task *, struct nfs_write_data *); void (*commit_setup) (struct nfs_write_data *, struct rpc_message *); int (*commit_done) (struct rpc_task *, struct nfs_write_data *); - int (*file_open) (struct inode *, struct file *); - int (*file_release) (struct inode *, struct file *); int (*lock)(struct file *, int, struct file_lock *); int (*lock_check_bounds)(const struct file_lock *); void (*clear_acl_cache)(struct inode *); From 34e8f92831cb5c40b3137e47a3daf4c09016ef02 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 12 Jun 2008 12:32:25 -0400 Subject: [PATCH 33/82] NFS: Move fs/nfs/iostat.h to include/linux The fs/nfs/iostat.h header has definitions that were designed to be exposed to user space. Move these definitions under include/linux so user space can use the definitions in applications that read /proc/self/mountstats. Also address a handful of coding style issues called out by checkpatch.pl in fs/nfs/iostat.h. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/iostat.h | 119 ++++--------------------------------- include/linux/nfs_iostat.h | 119 +++++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 106 deletions(-) create mode 100644 include/linux/nfs_iostat.h diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 6350ecbde589..2ec65e12bfed 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -5,135 +5,41 @@ * * Copyright (C) 2005, 2006 Chuck Lever * - * NFS client per-mount statistics provide information about the health of - * the NFS client and the health of each NFS mount point. Generally these - * are not for detailed problem diagnosis, but simply to indicate that there - * is a problem. - * - * These counters are not meant to be human-readable, but are meant to be - * integrated into system monitoring tools such as "sar" and "iostat". As - * such, the counters are sampled by the tools over time, and are never - * zeroed after a file system is mounted. Moving averages can be computed - * by the tools by taking the difference between two instantaneous samples - * and dividing that by the time between the samples. */ #ifndef _NFS_IOSTAT #define _NFS_IOSTAT -#define NFS_IOSTAT_VERS "1.0" - -/* - * NFS byte counters - * - * 1. SERVER - the number of payload bytes read from or written to the - * server by the NFS client via an NFS READ or WRITE request. - * - * 2. NORMAL - the number of bytes read or written by applications via - * the read(2) and write(2) system call interfaces. - * - * 3. DIRECT - the number of bytes read or written from files opened - * with the O_DIRECT flag. - * - * These counters give a view of the data throughput into and out of the NFS - * client. Comparing the number of bytes requested by an application with the - * number of bytes the client requests from the server can provide an - * indication of client efficiency (per-op, cache hits, etc). - * - * These counters can also help characterize which access methods are in - * use. DIRECT by itself shows whether there is any O_DIRECT traffic. - * NORMAL + DIRECT shows how much data is going through the system call - * interface. A large amount of SERVER traffic without much NORMAL or - * DIRECT traffic shows that applications are using mapped files. - * - * NFS page counters - * - * These count the number of pages read or written via nfs_readpage(), - * nfs_readpages(), or their write equivalents. - */ -enum nfs_stat_bytecounters { - NFSIOS_NORMALREADBYTES = 0, - NFSIOS_NORMALWRITTENBYTES, - NFSIOS_DIRECTREADBYTES, - NFSIOS_DIRECTWRITTENBYTES, - NFSIOS_SERVERREADBYTES, - NFSIOS_SERVERWRITTENBYTES, - NFSIOS_READPAGES, - NFSIOS_WRITEPAGES, - __NFSIOS_BYTESMAX, -}; - -/* - * NFS event counters - * - * These counters provide a low-overhead way of monitoring client activity - * without enabling NFS trace debugging. The counters show the rate at - * which VFS requests are made, and how often the client invalidates its - * data and attribute caches. This allows system administrators to monitor - * such things as how close-to-open is working, and answer questions such - * as "why are there so many GETATTR requests on the wire?" - * - * They also count anamolous events such as short reads and writes, silly - * renames due to close-after-delete, and operations that change the size - * of a file (such operations can often be the source of data corruption - * if applications aren't using file locking properly). - */ -enum nfs_stat_eventcounters { - NFSIOS_INODEREVALIDATE = 0, - NFSIOS_DENTRYREVALIDATE, - NFSIOS_DATAINVALIDATE, - NFSIOS_ATTRINVALIDATE, - NFSIOS_VFSOPEN, - NFSIOS_VFSLOOKUP, - NFSIOS_VFSACCESS, - NFSIOS_VFSUPDATEPAGE, - NFSIOS_VFSREADPAGE, - NFSIOS_VFSREADPAGES, - NFSIOS_VFSWRITEPAGE, - NFSIOS_VFSWRITEPAGES, - NFSIOS_VFSGETDENTS, - NFSIOS_VFSSETATTR, - NFSIOS_VFSFLUSH, - NFSIOS_VFSFSYNC, - NFSIOS_VFSLOCK, - NFSIOS_VFSRELEASE, - NFSIOS_CONGESTIONWAIT, - NFSIOS_SETATTRTRUNC, - NFSIOS_EXTENDWRITE, - NFSIOS_SILLYRENAME, - NFSIOS_SHORTREAD, - NFSIOS_SHORTWRITE, - NFSIOS_DELAY, - __NFSIOS_COUNTSMAX, -}; - -#ifdef __KERNEL__ - #include #include +#include struct nfs_iostats { unsigned long long bytes[__NFSIOS_BYTESMAX]; unsigned long events[__NFSIOS_COUNTSMAX]; } ____cacheline_aligned; -static inline void nfs_inc_server_stats(struct nfs_server *server, enum nfs_stat_eventcounters stat) +static inline void nfs_inc_server_stats(struct nfs_server *server, + enum nfs_stat_eventcounters stat) { struct nfs_iostats *iostats; int cpu; cpu = get_cpu(); iostats = per_cpu_ptr(server->io_stats, cpu); - iostats->events[stat] ++; + iostats->events[stat]++; put_cpu_no_resched(); } -static inline void nfs_inc_stats(struct inode *inode, enum nfs_stat_eventcounters stat) +static inline void nfs_inc_stats(struct inode *inode, + enum nfs_stat_eventcounters stat) { nfs_inc_server_stats(NFS_SERVER(inode), stat); } -static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend) +static inline void nfs_add_server_stats(struct nfs_server *server, + enum nfs_stat_bytecounters stat, + unsigned long addend) { struct nfs_iostats *iostats; int cpu; @@ -144,7 +50,9 @@ static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat put_cpu_no_resched(); } -static inline void nfs_add_stats(struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend) +static inline void nfs_add_stats(struct inode *inode, + enum nfs_stat_bytecounters stat, + unsigned long addend) { nfs_add_server_stats(NFS_SERVER(inode), stat, addend); } @@ -160,5 +68,4 @@ static inline void nfs_free_iostats(struct nfs_iostats *stats) free_percpu(stats); } -#endif -#endif +#endif /* _NFS_IOSTAT */ diff --git a/include/linux/nfs_iostat.h b/include/linux/nfs_iostat.h new file mode 100644 index 000000000000..1cb9a3fed2b3 --- /dev/null +++ b/include/linux/nfs_iostat.h @@ -0,0 +1,119 @@ +/* + * User-space visible declarations for NFS client per-mount + * point statistics + * + * Copyright (C) 2005, 2006 Chuck Lever + * + * NFS client per-mount statistics provide information about the + * health of the NFS client and the health of each NFS mount point. + * Generally these are not for detailed problem diagnosis, but + * simply to indicate that there is a problem. + * + * These counters are not meant to be human-readable, but are meant + * to be integrated into system monitoring tools such as "sar" and + * "iostat". As such, the counters are sampled by the tools over + * time, and are never zeroed after a file system is mounted. + * Moving averages can be computed by the tools by taking the + * difference between two instantaneous samples and dividing that + * by the time between the samples. + */ + +#ifndef _LINUX_NFS_IOSTAT +#define _LINUX_NFS_IOSTAT + +#define NFS_IOSTAT_VERS "1.0" + +/* + * NFS byte counters + * + * 1. SERVER - the number of payload bytes read from or written + * to the server by the NFS client via an NFS READ or WRITE + * request. + * + * 2. NORMAL - the number of bytes read or written by applications + * via the read(2) and write(2) system call interfaces. + * + * 3. DIRECT - the number of bytes read or written from files + * opened with the O_DIRECT flag. + * + * These counters give a view of the data throughput into and out + * of the NFS client. Comparing the number of bytes requested by + * an application with the number of bytes the client requests from + * the server can provide an indication of client efficiency + * (per-op, cache hits, etc). + * + * These counters can also help characterize which access methods + * are in use. DIRECT by itself shows whether there is any O_DIRECT + * traffic. NORMAL + DIRECT shows how much data is going through + * the system call interface. A large amount of SERVER traffic + * without much NORMAL or DIRECT traffic shows that applications + * are using mapped files. + * + * NFS page counters + * + * These count the number of pages read or written via nfs_readpage(), + * nfs_readpages(), or their write equivalents. + * + * NB: When adding new byte counters, please include the measured + * units in the name of each byte counter to help users of this + * interface determine what exactly is being counted. + */ +enum nfs_stat_bytecounters { + NFSIOS_NORMALREADBYTES = 0, + NFSIOS_NORMALWRITTENBYTES, + NFSIOS_DIRECTREADBYTES, + NFSIOS_DIRECTWRITTENBYTES, + NFSIOS_SERVERREADBYTES, + NFSIOS_SERVERWRITTENBYTES, + NFSIOS_READPAGES, + NFSIOS_WRITEPAGES, + __NFSIOS_BYTESMAX, +}; + +/* + * NFS event counters + * + * These counters provide a low-overhead way of monitoring client + * activity without enabling NFS trace debugging. The counters + * show the rate at which VFS requests are made, and how often the + * client invalidates its data and attribute caches. This allows + * system administrators to monitor such things as how close-to-open + * is working, and answer questions such as "why are there so many + * GETATTR requests on the wire?" + * + * They also count anamolous events such as short reads and writes, + * silly renames due to close-after-delete, and operations that + * change the size of a file (such operations can often be the + * source of data corruption if applications aren't using file + * locking properly). + */ +enum nfs_stat_eventcounters { + NFSIOS_INODEREVALIDATE = 0, + NFSIOS_DENTRYREVALIDATE, + NFSIOS_DATAINVALIDATE, + NFSIOS_ATTRINVALIDATE, + NFSIOS_VFSOPEN, + NFSIOS_VFSLOOKUP, + NFSIOS_VFSACCESS, + NFSIOS_VFSUPDATEPAGE, + NFSIOS_VFSREADPAGE, + NFSIOS_VFSREADPAGES, + NFSIOS_VFSWRITEPAGE, + NFSIOS_VFSWRITEPAGES, + NFSIOS_VFSGETDENTS, + NFSIOS_VFSSETATTR, + NFSIOS_VFSFLUSH, + NFSIOS_VFSFSYNC, + NFSIOS_VFSLOCK, + NFSIOS_VFSRELEASE, + NFSIOS_CONGESTIONWAIT, + NFSIOS_SETATTRTRUNC, + NFSIOS_EXTENDWRITE, + NFSIOS_SILLYRENAME, + NFSIOS_SHORTREAD, + NFSIOS_SHORTWRITE, + NFSIOS_DELAY, + __NFSIOS_COUNTSMAX, +}; + +#endif /* _LINUX_NFS_IOSTAT */ From 2e96d2867245668dbdb973729288cf69b9fafa66 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 16:42:05 -0400 Subject: [PATCH 34/82] NFS: Fix a warning in nfs4_async_handle_error We're not modifying the nfs_server when we call nfs_inc_server_stats and friends, so allow the compiler to pass 'const' pointers too. Signed-off-by: Trond Myklebust --- fs/nfs/iostat.h | 8 ++++---- fs/nfs/nfs4proc.c | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 2ec65e12bfed..a36952810032 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -19,7 +19,7 @@ struct nfs_iostats { unsigned long events[__NFSIOS_COUNTSMAX]; } ____cacheline_aligned; -static inline void nfs_inc_server_stats(struct nfs_server *server, +static inline void nfs_inc_server_stats(const struct nfs_server *server, enum nfs_stat_eventcounters stat) { struct nfs_iostats *iostats; @@ -31,13 +31,13 @@ static inline void nfs_inc_server_stats(struct nfs_server *server, put_cpu_no_resched(); } -static inline void nfs_inc_stats(struct inode *inode, +static inline void nfs_inc_stats(const struct inode *inode, enum nfs_stat_eventcounters stat) { nfs_inc_server_stats(NFS_SERVER(inode), stat); } -static inline void nfs_add_server_stats(struct nfs_server *server, +static inline void nfs_add_server_stats(const struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend) { @@ -50,7 +50,7 @@ static inline void nfs_add_server_stats(struct nfs_server *server, put_cpu_no_resched(); } -static inline void nfs_add_stats(struct inode *inode, +static inline void nfs_add_stats(const struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dfdd19a6d17e..058723d9122d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2756,8 +2756,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) task->tk_status = 0; return -EAGAIN; case -NFS4ERR_DELAY: - nfs_inc_server_stats((struct nfs_server *) server, - NFSIOS_DELAY); + nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: rpc_delay(task, NFS4_POLL_RETRY_MAX); task->tk_status = 0; From f41f741838480aeaa3a189cff6e210503cf9c42d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 17:39:04 -0400 Subject: [PATCH 35/82] NFS: Ensure we zap only the access and acl caches when setting new acls ...and ensure that we obey the NFS_INO_INVALID_ACL flag when retrieving the acls. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 4 +--- fs/nfs/internal.h | 1 + fs/nfs/nfs3acl.c | 9 ++++++--- fs/nfs/nfs4proc.c | 5 ++++- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2e4ab4a5e107..2c23d067e2a6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -57,8 +57,6 @@ static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; static void nfs_invalidate_inode(struct inode *); static int nfs_update_inode(struct inode *, struct nfs_fattr *); -static void nfs_zap_acl_cache(struct inode *); - static struct kmem_cache * nfs_inode_cachep; static inline unsigned long @@ -167,7 +165,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) } } -static void nfs_zap_acl_cache(struct inode *inode) +void nfs_zap_acl_cache(struct inode *inode) { void (*clear_acl_cache)(struct inode *); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 04ae867dddba..24241fcbb98d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -150,6 +150,7 @@ extern void nfs_clear_inode(struct inode *); #ifdef CONFIG_NFS_V4 extern void nfs4_clear_inode(struct inode *); #endif +void nfs_zap_acl_cache(struct inode *inode); /* super.c */ extern struct file_system_type nfs_xdev_fs_type; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 9b7362565c0c..423842f51ac9 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -5,6 +5,8 @@ #include #include +#include "internal.h" + #define NFSDBG_FACILITY NFSDBG_PROC ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -205,6 +207,8 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) status = nfs_revalidate_inode(server, inode); if (status < 0) return ERR_PTR(status); + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); acl = nfs3_get_cached_acl(inode, type); if (acl != ERR_PTR(-EAGAIN)) return acl; @@ -319,9 +323,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, dprintk("NFS call setacl\n"); msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; status = rpc_call_sync(server->client_acl, &msg, 0); - spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS; - spin_unlock(&inode->i_lock); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); dprintk("NFS reply setacl: %d\n", status); /* pages may have been allocated at the xdr layer. */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 058723d9122d..10f01c05a4e4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2695,6 +2695,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) ret = nfs_revalidate_inode(server, inode); if (ret < 0) return ret; + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); ret = nfs4_read_cached_acl(inode, buf, buflen); if (ret != -ENOENT) return ret; @@ -2722,7 +2724,8 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl nfs_inode_return_delegation(inode); buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); - nfs_zap_caches(inode); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); return ret; } From ecbb3845dd678364148c1faad32adbc9dd833ef9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 12 Jun 2008 12:37:33 -0400 Subject: [PATCH 36/82] NFS: Allow any value for the "retry" option The kernel NFS mount option parser should ignore the retry= mount option since it is meaningful only in user space. Today it expects a number rather than arbitrary text, so it ignores the option if the value is numeric, but chokes if there are other characters in the value. Change it to allow any text (except ",") as its value. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b550e921f661..5278eef61110 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -101,6 +101,8 @@ enum { static match_table_t nfs_mount_option_tokens = { { Opt_userspace, "bg" }, { Opt_userspace, "fg" }, + { Opt_userspace, "retry=%s" }, + { Opt_soft, "soft" }, { Opt_hard, "hard" }, { Opt_intr, "intr" }, @@ -136,7 +138,6 @@ static match_table_t nfs_mount_option_tokens = { { Opt_acdirmin, "acdirmin=%u" }, { Opt_acdirmax, "acdirmax=%u" }, { Opt_actimeo, "actimeo=%u" }, - { Opt_userspace, "retry=%u" }, { Opt_namelen, "namlen=%u" }, { Opt_mountport, "mountport=%u" }, { Opt_mountvers, "mountvers=%u" }, From d33e4dfeab5eb0c3c1359cc1a399f2f8b49e18de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 12 Jun 2008 12:37:41 -0400 Subject: [PATCH 37/82] NFS: Treat "intr" and "nointr" options as deprecated Clean up: the "intr" and "nointr" mount options were recently retired. Document this in the NFS mount option parser. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 5278eef61110..f824c415c8a7 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -65,7 +65,6 @@ enum { /* Mount options that take no arguments */ Opt_soft, Opt_hard, - Opt_intr, Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, Opt_noac, @@ -105,8 +104,8 @@ static match_table_t nfs_mount_option_tokens = { { Opt_soft, "soft" }, { Opt_hard, "hard" }, - { Opt_intr, "intr" }, - { Opt_nointr, "nointr" }, + { Opt_deprecated, "intr" }, + { Opt_deprecated, "nointr" }, { Opt_posix, "posix" }, { Opt_noposix, "noposix" }, { Opt_cto, "cto" }, @@ -787,9 +786,6 @@ static int nfs_parse_mount_options(char *raw, case Opt_hard: mnt->flags &= ~NFS_MOUNT_SOFT; break; - case Opt_intr: - case Opt_nointr: - break; case Opt_posix: mnt->flags |= NFS_MOUNT_POSIX; break; @@ -1105,6 +1101,8 @@ static int nfs_parse_mount_options(char *raw, case Opt_userspace: case Opt_deprecated: + dfprintk(MOUNT, "NFS: ignoring mount option " + "'%s'\n", p); break; default: From 396cee977f79590673ad51b04f1853e58bc30e7b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 12 Jun 2008 12:37:49 -0400 Subject: [PATCH 38/82] NFS: missing newline in NFS mount debugging message Clean up. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f824c415c8a7..a1065c1a3149 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1190,7 +1190,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, if (status == 0) return 0; - dfprintk(MOUNT, "NFS: unable to mount server %s, error %d", + dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", hostname, status); return status; } From e7d39069e387a12d4c57f4067d9f48c1d29ea900 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 13 Jun 2008 12:12:32 -0400 Subject: [PATCH 39/82] NFS: Clean up nfs_update_request() Simplify the loop in nfs_update_request by moving into a separate function the code that attempts to update an existing cached NFS write. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 199 +++++++++++++++++++++++++------------------------ 1 file changed, 102 insertions(+), 97 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 21d8a48b624b..04f51e52e184 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -34,9 +34,6 @@ /* * Local function declarations */ -static struct nfs_page * nfs_update_request(struct nfs_open_context*, - struct page *, - unsigned int, unsigned int); static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, struct inode *inode, int ioflags); static void nfs_redirty_request(struct nfs_page *req); @@ -169,30 +166,6 @@ static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int SetPageUptodate(page); } -static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, - unsigned int offset, unsigned int count) -{ - struct nfs_page *req; - int ret; - - for (;;) { - req = nfs_update_request(ctx, page, offset, count); - if (!IS_ERR(req)) - break; - ret = PTR_ERR(req); - if (ret != -EBUSY) - return ret; - ret = nfs_wb_page(page->mapping->host, page); - if (ret != 0) - return ret; - } - /* Update file length */ - nfs_grow_file(page, offset, count); - nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); - nfs_clear_page_tag_locked(req); - return 0; -} - static int wb_priority(struct writeback_control *wbc) { if (wbc->for_reclaim) @@ -356,11 +329,19 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) /* * Insert a write request into an inode */ -static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) +static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(inode); int error; + error = radix_tree_preload(GFP_NOFS); + if (error != 0) + goto out; + + /* Lock the request! */ + nfs_lock_request_dontget(req); + + spin_lock(&inode->i_lock); error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); BUG_ON(error); if (!nfsi->npages) { @@ -374,6 +355,10 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) kref_get(&req->wb_kref); radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); + spin_unlock(&inode->i_lock); + radix_tree_preload_end(); +out: + return error; } /* @@ -565,101 +550,121 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pg #endif /* - * Try to update any existing write request, or create one if there is none. - * In order to match, the request's credentials must match those of - * the calling process. + * Search for an existing write request, and attempt to update + * it to reflect a new dirty region on a given page. * - * Note: Should always be called with the Page Lock held! + * If the attempt fails, then the existing request is flushed out + * to disk. */ -static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, - struct page *page, unsigned int offset, unsigned int bytes) +static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, + unsigned int bytes) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct nfs_page *req, *new = NULL; - pgoff_t rqend, end; + struct nfs_page *req; + unsigned int rqend; + unsigned int end; + int error; + + if (!PagePrivate(page)) + return NULL; end = offset + bytes; + spin_lock(&inode->i_lock); for (;;) { - /* Loop over all inode entries and see if we find - * A request for the page we wish to update - */ - spin_lock(&inode->i_lock); req = nfs_page_find_request_locked(page); - if (req) { - if (!nfs_set_page_tag_locked(req)) { - int error; + if (req == NULL) + goto out_unlock; - spin_unlock(&inode->i_lock); - error = nfs_wait_on_request(req); - nfs_release_request(req); - if (error < 0) { - if (new) { - radix_tree_preload_end(); - nfs_release_request(new); - } - return ERR_PTR(error); - } - continue; - } - spin_unlock(&inode->i_lock); - if (new) { - radix_tree_preload_end(); - nfs_release_request(new); - } + rqend = req->wb_offset + req->wb_bytes; + /* + * Tell the caller to flush out the request if + * the offsets are non-contiguous. + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ + if (!nfs_dirty_request(req) + || offset > rqend + || end < req->wb_offset) + goto out_flushme; + + if (nfs_set_page_tag_locked(req)) break; - } - if (new) { - nfs_lock_request_dontget(new); - nfs_inode_add_request(inode, new); - spin_unlock(&inode->i_lock); - radix_tree_preload_end(); - req = new; - goto out; - } + /* The request is locked, so wait and then retry */ spin_unlock(&inode->i_lock); - - new = nfs_create_request(ctx, inode, page, offset, bytes); - if (IS_ERR(new)) - return new; - if (radix_tree_preload(GFP_NOFS)) { - nfs_release_request(new); - return ERR_PTR(-ENOMEM); - } - } - - /* We have a request for our page. - * If the creds don't match, or the - * page addresses don't match, - * tell the caller to wait on the conflicting - * request. - */ - rqend = req->wb_offset + req->wb_bytes; - if (req->wb_context != ctx - || req->wb_page != page - || !nfs_dirty_request(req) - || offset > rqend || end < req->wb_offset) { - nfs_clear_page_tag_locked(req); - return ERR_PTR(-EBUSY); + error = nfs_wait_on_request(req); + nfs_release_request(req); + if (error != 0) + goto out_err; + spin_lock(&inode->i_lock); } /* Okay, the request matches. Update the region */ if (offset < req->wb_offset) { req->wb_offset = offset; req->wb_pgbase = offset; - req->wb_bytes = max(end, rqend) - req->wb_offset; - goto out; } - if (end > rqend) req->wb_bytes = end - req->wb_offset; + else + req->wb_bytes = rqend - req->wb_offset; +out_unlock: + spin_unlock(&inode->i_lock); + return req; +out_flushme: + spin_unlock(&inode->i_lock); + nfs_release_request(req); + error = nfs_wb_page(inode, page); +out_err: + return ERR_PTR(error); +} +/* + * Try to update an existing write request, or create one if there is none. + * + * Note: Should always be called with the Page Lock held to prevent races + * if we have to add a new request. Also assumes that the caller has + * already called nfs_flush_incompatible() if necessary. + */ +static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, + struct page *page, unsigned int offset, unsigned int bytes) +{ + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int error; + + req = nfs_try_to_update_request(inode, page, offset, bytes); + if (req != NULL) + goto out; + req = nfs_create_request(ctx, inode, page, offset, bytes); + if (IS_ERR(req)) + goto out; + error = nfs_inode_add_request(inode, req); + if (error != 0) { + nfs_release_request(req); + req = ERR_PTR(error); + } out: return req; } +static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, + unsigned int offset, unsigned int count) +{ + struct nfs_page *req; + + req = nfs_setup_write_request(ctx, page, offset, count); + if (IS_ERR(req)) + return PTR_ERR(req); + /* Update file length */ + nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_clear_page_tag_locked(req); + return 0; +} + int nfs_flush_incompatible(struct file *file, struct page *page) { struct nfs_open_context *ctx = nfs_file_open_context(file); From e468bae97d243fe0e1515abaa1f7d0edf1476ad0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 13 Jun 2008 13:25:22 -0400 Subject: [PATCH 40/82] NFS: Allow redirtying of a completed unstable write. Currently, if an unstable write completes, we cannot redirty the page in order to reflect a new change in the page data until after we've sent a COMMIT request. This patch allows a page rewrite to proceed without the unnecessary COMMIT step, putting it immediately back onto the dirty page list, undoing the VM unstable write accounting, and removing the NFS_PAGE_TAG_COMMIT tag from the NFS radix tree. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 65 ++++++++++++++++++++-------------------- include/linux/nfs_page.h | 9 ++++-- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 04f51e52e184..feca8c648766 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -242,12 +242,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, return ret; spin_lock(&inode->i_lock); } - if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { - /* This request is marked for commit */ + if (test_bit(PG_CLEAN, &req->wb_flags)) { spin_unlock(&inode->i_lock); - nfs_clear_page_tag_locked(req); - nfs_pageio_complete(pgio); - return 0; + BUG(); } if (nfs_set_page_writeback(page) != 0) { spin_unlock(&inode->i_lock); @@ -391,19 +388,6 @@ nfs_mark_request_dirty(struct nfs_page *req) __set_page_dirty_nobuffers(req->wb_page); } -/* - * Check if a request is dirty - */ -static inline int -nfs_dirty_request(struct nfs_page *req) -{ - struct page *page = req->wb_page; - - if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags)) - return 0; - return !PageWriteback(page); -} - #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) /* * Add a request to the inode's commit list. @@ -416,7 +400,7 @@ nfs_mark_request_commit(struct nfs_page *req) spin_lock(&inode->i_lock); nfsi->ncommit++; - set_bit(PG_NEED_COMMIT, &(req)->wb_flags); + set_bit(PG_CLEAN, &(req)->wb_flags); radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_COMMIT); @@ -426,6 +410,19 @@ nfs_mark_request_commit(struct nfs_page *req) __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } +static int +nfs_clear_request_commit(struct nfs_page *req) +{ + struct page *page = req->wb_page; + + if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { + dec_zone_page_state(page, NR_UNSTABLE_NFS); + dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); + return 1; + } + return 0; +} + static inline int nfs_write_need_commit(struct nfs_write_data *data) { @@ -435,7 +432,7 @@ int nfs_write_need_commit(struct nfs_write_data *data) static inline int nfs_reschedule_unstable_write(struct nfs_page *req) { - if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { + if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { nfs_mark_request_commit(req); return 1; } @@ -451,6 +448,12 @@ nfs_mark_request_commit(struct nfs_page *req) { } +static inline int +nfs_clear_request_commit(struct nfs_page *req) +{ + return 0; +} + static inline int nfs_write_need_commit(struct nfs_write_data *data) { @@ -508,11 +511,8 @@ static void nfs_cancel_commit_list(struct list_head *head) while(!list_empty(head)) { req = nfs_list_entry(head->next); - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); nfs_list_remove_request(req); - clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); + nfs_clear_request_commit(req); nfs_inode_remove_request(req); nfs_unlock_request(req); } @@ -584,8 +584,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, * Note: nfs_flush_incompatible() will already * have flushed out requests having wrong owners. */ - if (!nfs_dirty_request(req) - || offset > rqend + if (offset > rqend || end < req->wb_offset) goto out_flushme; @@ -601,6 +600,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, spin_lock(&inode->i_lock); } + if (nfs_clear_request_commit(req)) + radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_COMMIT); + /* Okay, the request matches. Update the region */ if (offset < req->wb_offset) { req->wb_offset = offset; @@ -682,8 +685,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) req = nfs_page_find_request(page); if (req == NULL) return 0; - do_flush = req->wb_page != page || req->wb_context != ctx - || !nfs_dirty_request(req); + do_flush = req->wb_page != page || req->wb_context != ctx; nfs_release_request(req); if (!do_flush) return 0; @@ -1288,10 +1290,7 @@ static void nfs_commit_release(void *calldata) while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); + nfs_clear_request_commit(req); dprintk("NFS: commit (%s/%lld %d@%lld)", req->wb_context->path.dentry->d_inode->i_sb->s_id, @@ -1467,7 +1466,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) req = nfs_page_find_request(page); if (req == NULL) goto out; - if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { + if (test_bit(PG_CLEAN, &req->wb_flags)) { nfs_release_request(req); break; } diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index a1676e19e491..3c60685d972b 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -27,9 +27,12 @@ /* * Valid flags for a dirty buffer */ -#define PG_BUSY 0 -#define PG_NEED_COMMIT 1 -#define PG_NEED_RESCHED 2 +enum { + PG_BUSY = 0, + PG_CLEAN, + PG_NEED_COMMIT, + PG_NEED_RESCHED, +}; struct nfs_inode; struct nfs_page { From cd100725620a8063fbc81bcf16d7c9e71a9583e0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 17 Jun 2008 16:12:00 -0400 Subject: [PATCH 41/82] NFS: Fix a dependency on CONFIG_NFS_V4 in nfs_remount Fix the 'nfs4_fs_type' undeclared error in nfs_remount when compiling sans NFSv4... Signed-off-by: Trond Myklebust Cc: Jeff Layton --- fs/nfs/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a1065c1a3149..b880db18035b 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1428,6 +1428,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) struct nfs_parsed_mount_data *data; struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data; struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; + u32 nfsvers = nfss->nfs_client->rpc_ops->version; /* * Userspace mount programs that send binary options generally send @@ -1435,8 +1436,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) * ones were explicitly specified. Fall back to legacy behavior and * just return success. */ - if ((sb->s_type == &nfs4_fs_type && options4->version == 1) || - (sb->s_type == &nfs_fs_type && options->version >= 1 && + if ((nfsvers == 4 && options4->version == 1) || + (nfsvers <= 3 && options->version >= 1 && options->version <= 6)) return 0; From dc04589827f7e1af12714af8bb00e3f3c4c48c62 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 23 Jun 2008 12:36:37 -0400 Subject: [PATCH 42/82] NFS: Use common device name parsing logic for NFSv4 and NFSv2/v3 To support passing a raw IPv6 address as a server hostname, we need to expand the logic that handles splitting the passed-in device name into a server hostname and export path Start by pulling device name parsing out of the mount option validation functions and into separate helper functions. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 124 +++++++++++++++++++++++++++++++------------------ 1 file changed, 79 insertions(+), 45 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b880db18035b..c4ee8b3a27c1 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1195,6 +1195,67 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, return status; } +/* + * Split "dev_name" into "hostname:export_path". + * + * Note: caller frees hostname and export path, even on error. + */ +static int nfs_parse_devname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) +{ + size_t len; + char *colon, *comma; + + colon = strchr(dev_name, ':'); + if (colon == NULL) + goto out_bad_devname; + + len = colon - dev_name; + if (len > maxnamlen) + goto out_hostname; + + /* N.B. caller will free nfs_server.hostname in all cases */ + *hostname = kstrndup(dev_name, len, GFP_KERNEL); + if (!*hostname) + goto out_nomem; + + /* kill possible hostname list: not supported */ + comma = strchr(*hostname, ','); + if (comma != NULL) { + if (comma == *hostname) + goto out_bad_devname; + *comma = '\0'; + } + + colon++; + len = strlen(colon); + if (len > maxpathlen) + goto out_path; + *export_path = kstrndup(colon, len, GFP_KERNEL); + if (!*export_path) + goto out_nomem; + + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path); + return 0; + +out_bad_devname: + dfprintk(MOUNT, "NFS: device name not in host:path format\n"); + return -EINVAL; + +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); + return -ENOMEM; + +out_hostname: + dfprintk(MOUNT, "NFS: server hostname too long\n"); + return -ENAMETOOLONG; + +out_path: + dfprintk(MOUNT, "NFS: export pathname too long\n"); + return -ENAMETOOLONG; +} + /* * Validate the NFS2/NFS3 mount data * - fills in the mount root filehandle @@ -1323,8 +1384,6 @@ static int nfs_validate_mount_data(void *options, break; default: { - unsigned int len; - char *c; int status; if (nfs_parse_mount_options((char *)options, args) == 0) @@ -1334,21 +1393,17 @@ static int nfs_validate_mount_data(void *options, &args->nfs_server.address)) goto out_no_address; - c = strchr(dev_name, ':'); - if (c == NULL) - return -EINVAL; - len = c - dev_name; - /* N.B. caller will free nfs_server.hostname in all cases */ - args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); - if (!args->nfs_server.hostname) - goto out_nomem; + status = nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + PAGE_SIZE, + &args->nfs_server.export_path, + NFS_MAXPATHLEN); + if (!status) + status = nfs_try_mount(args, mntfh); - c++; - if (strlen(c) > NFS_MAXPATHLEN) - return -ENAMETOOLONG; - args->nfs_server.export_path = c; + kfree(args->nfs_server.export_path); + args->nfs_server.export_path = NULL; - status = nfs_try_mount(args, mntfh); if (status) return status; @@ -1958,7 +2013,7 @@ static int nfs4_validate_mount_data(void *options, break; default: { - unsigned int len; + int status; if (nfs_parse_mount_options((char *)options, args) == 0) return -EINVAL; @@ -1977,34 +2032,17 @@ static int nfs4_validate_mount_data(void *options, goto out_inval_auth; } - /* - * Split "dev_name" into "hostname:mntpath". - */ - c = strchr(dev_name, ':'); - if (c == NULL) - return -EINVAL; - /* while calculating len, pretend ':' is '\0' */ - len = c - dev_name; - if (len > NFS4_MAXNAMLEN) - return -ENAMETOOLONG; - /* N.B. caller will free nfs_server.hostname in all cases */ - args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); - if (!args->nfs_server.hostname) - goto out_nomem; - - c++; /* step over the ':' */ - len = strlen(c); - if (len > NFS4_MAXPATHLEN) - return -ENAMETOOLONG; - args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL); - if (!args->nfs_server.export_path) - goto out_nomem; - - dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path); - if (args->client_address == NULL) goto out_no_client_address; + status = nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + NFS4_MAXNAMLEN, + &args->nfs_server.export_path, + NFS4_MAXPATHLEN); + if (status < 0) + return status; + break; } } @@ -2020,10 +2058,6 @@ out_inval_auth: data->auth_flavourlen); return -EINVAL; -out_nomem: - dfprintk(MOUNT, "NFS4: not enough memory to handle mount options\n"); - return -ENOMEM; - out_no_address: dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); return -EINVAL; From d1aa08257312f1439c1ab7c8a18e3856f9530f46 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 23 Jun 2008 12:36:45 -0400 Subject: [PATCH 43/82] NFS: Support raw IPv6 address hostnames during NFS mount operation Traditionally the mount command has looked for a ":" to separate the server's hostname from the export path in the mounted on device name, like this: mount server:/export /mounted/on/dir The server's hostname is "server" and the export path is "/export". You can also substitute a specific IPv4 network address for the server hostname, like this: mount 192.168.0.55:/export /mounted/on/dir Raw IPv6 addresses present a problem, however, because they look something like this: fe80::200:5aff:fe00:30b Note the use of colons. To get around the presence of colons, copy the Solaris convention used for mounting IPv6 servers by address: wrap a raw IPv6 address with square brackets. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 90 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 82 insertions(+), 8 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index c4ee8b3a27c1..ea4abd266a7a 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1195,14 +1195,9 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, return status; } -/* - * Split "dev_name" into "hostname:export_path". - * - * Note: caller frees hostname and export path, even on error. - */ -static int nfs_parse_devname(const char *dev_name, - char **hostname, size_t maxnamlen, - char **export_path, size_t maxpathlen) +static int nfs_parse_simple_hostname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) { size_t len; char *colon, *comma; @@ -1256,6 +1251,85 @@ out_path: return -ENAMETOOLONG; } +/* + * Hostname has square brackets around it because it contains one or + * more colons. We look for the first closing square bracket, and a + * colon must follow it. + */ +static int nfs_parse_protected_hostname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) +{ + size_t len; + char *start, *end; + + start = (char *)(dev_name + 1); + + end = strchr(start, ']'); + if (end == NULL) + goto out_bad_devname; + if (*(end + 1) != ':') + goto out_bad_devname; + + len = end - start; + if (len > maxnamlen) + goto out_hostname; + + /* N.B. caller will free nfs_server.hostname in all cases */ + *hostname = kstrndup(start, len, GFP_KERNEL); + if (*hostname == NULL) + goto out_nomem; + + end += 2; + len = strlen(end); + if (len > maxpathlen) + goto out_path; + *export_path = kstrndup(end, len, GFP_KERNEL); + if (!*export_path) + goto out_nomem; + + return 0; + +out_bad_devname: + dfprintk(MOUNT, "NFS: device name not in host:path format\n"); + return -EINVAL; + +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); + return -ENOMEM; + +out_hostname: + dfprintk(MOUNT, "NFS: server hostname too long\n"); + return -ENAMETOOLONG; + +out_path: + dfprintk(MOUNT, "NFS: export pathname too long\n"); + return -ENAMETOOLONG; +} + +/* + * Split "dev_name" into "hostname:export_path". + * + * The leftmost colon demarks the split between the server's hostname + * and the export path. If the hostname starts with a left square + * bracket, then it may contain colons. + * + * Note: caller frees hostname and export path, even on error. + */ +static int nfs_parse_devname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) +{ + if (*dev_name == '[') + return nfs_parse_protected_hostname(dev_name, + hostname, maxnamlen, + export_path, maxpathlen); + + return nfs_parse_simple_hostname(dev_name, + hostname, maxnamlen, + export_path, maxpathlen); +} + /* * Validate the NFS2/NFS3 mount data * - fills in the mount root filehandle From ce3b7e1906ebbe96753fe090b36de6ffb8e0e0e7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 23 Jun 2008 12:36:53 -0400 Subject: [PATCH 44/82] NFS: Add string length argument to nfs_parse_server_address To make nfs_parse_server_address() more generally useful, allow it to accept input strings that are not terminated with '\0'. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 102 ++++++++++++++++++++++++++++++------------- include/linux/inet.h | 7 +++ 2 files changed, 78 insertions(+), 31 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ea4abd266a7a..d27aa1db0074 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -705,38 +705,76 @@ static int nfs_verify_server_address(struct sockaddr *addr) return 0; } -/* - * Parse string addresses passed in via a mount option, - * and construct a sockaddr based on the result. - * - * If address parsing fails, set the sockaddr's address - * family to AF_UNSPEC to force nfs_verify_server_address() - * to punt the mount. - */ -static void nfs_parse_server_address(char *value, - struct sockaddr *sap, - size_t *len) +static void nfs_parse_ipv4_address(char *string, size_t str_len, + struct sockaddr *sap, size_t *addr_len) { - if (strchr(value, ':')) { - struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; - u8 *addr = (u8 *)&ap->sin6_addr.in6_u; + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + u8 *addr = (u8 *)&sin->sin_addr.s_addr; - ap->sin6_family = AF_INET6; - *len = sizeof(*ap); - if (in6_pton(value, -1, addr, '\0', NULL)) - return; - } else { - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - u8 *addr = (u8 *)&ap->sin_addr.s_addr; + if (str_len <= INET_ADDRSTRLEN) { + dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n", + (int)str_len, string); - ap->sin_family = AF_INET; - *len = sizeof(*ap); - if (in4_pton(value, -1, addr, '\0', NULL)) + sin->sin_family = AF_INET; + *addr_len = sizeof(*sin); + if (in4_pton(string, str_len, addr, '\0', NULL)) return; } sap->sa_family = AF_UNSPEC; - *len = 0; + *addr_len = 0; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static void nfs_parse_ipv6_address(char *string, size_t str_len, + struct sockaddr *sap, size_t *addr_len) +{ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + u8 *addr = (u8 *)&sin6->sin6_addr.in6_u; + + if (str_len <= INET6_ADDRSTRLEN) { + dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n", + (int)str_len, string); + + sin6->sin6_family = AF_INET6; + *addr_len = sizeof(*sin6); + if (in6_pton(string, str_len, addr, '\0', NULL)) + return; + } + + sap->sa_family = AF_UNSPEC; + *addr_len = 0; +} +#else +static void nfs_parse_ipv6_address(char *string, size_t str_len, + struct sockaddr *sap, size_t *addr_len) +{ + sap->sa_family = AF_UNSPEC; + *addr_len = 0; +} +#endif + +/* + * Construct a sockaddr based on the contents of a string that contains + * an IP address in presentation format. + * + * If there is a problem constructing the new sockaddr, set the address + * family to AF_UNSPEC. + */ +static void nfs_parse_ip_address(char *string, size_t str_len, + struct sockaddr *sap, size_t *addr_len) +{ + unsigned int i, colons; + + colons = 0; + for (i = 0; i < str_len; i++) + if (string[i] == ':') + colons++; + + if (colons >= 2) + nfs_parse_ipv6_address(string, str_len, sap, addr_len); + else + nfs_parse_ipv4_address(string, str_len, sap, addr_len); } /* @@ -1070,9 +1108,10 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_server_address(string, (struct sockaddr *) - &mnt->nfs_server.address, - &mnt->nfs_server.addrlen); + nfs_parse_ip_address(string, strlen(string), + (struct sockaddr *) + &mnt->nfs_server.address, + &mnt->nfs_server.addrlen); kfree(string); break; case Opt_clientaddr: @@ -1093,9 +1132,10 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_server_address(string, (struct sockaddr *) - &mnt->mount_server.address, - &mnt->mount_server.addrlen); + nfs_parse_ip_address(string, strlen(string), + (struct sockaddr *) + &mnt->mount_server.address, + &mnt->mount_server.addrlen); kfree(string); break; diff --git a/include/linux/inet.h b/include/linux/inet.h index 1354080cf8cf..4cca05c9678e 100644 --- a/include/linux/inet.h +++ b/include/linux/inet.h @@ -44,6 +44,13 @@ #include +/* + * These mimic similar macros defined in user-space for inet_ntop(3). + * See /usr/include/netinet/in.h . + */ +#define INET_ADDRSTRLEN (16) +#define INET6_ADDRSTRLEN (48) + extern __be32 in_aton(const char *str); extern int in4_pton(const char *src, int srclen, u8 *dst, int delim, const char **end); extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char **end); From d8e7748ab8322171ebfd78f6155ff35e7d57ac32 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 23 Jun 2008 12:37:01 -0400 Subject: [PATCH 45/82] NFS: handle interface identifiers in incoming IPv6 addresses Add support in the kernel NFS client's address parser for interface identifiers. IPv6 link-local addresses require an additional "interface identifier", which is a network device name or an integer that indexes the array of local network interfaces. They are suffixed to the address with a '%'. For example: fe80::215:c5ff:fe3b:e1b2%2 indicates an interface index of 2. Or fe80::215:c5ff:fe3b:e1b2%eth0 indicates that requests should be routed through the eth0 device. Without the interface ID, link-local addresses are not usable for NFS. Both the kernel NFS client mount option parser and the mount.nfs command can take either form. The mount.nfs command always passes the address through getnameinfo(3), which usually re-writes interface indices as device names. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d27aa1db0074..73a8e5970f02 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -725,12 +726,48 @@ static void nfs_parse_ipv4_address(char *string, size_t str_len, *addr_len = 0; } +#define IPV6_SCOPE_DELIMITER '%' + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, + const char *delim, + struct sockaddr_in6 *sin6) +{ + char *p; + size_t len; + + if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) + return ; + if (*delim != IPV6_SCOPE_DELIMITER) + return; + + len = (string + str_len) - delim - 1; + p = kstrndup(delim + 1, len, GFP_KERNEL); + if (p) { + unsigned long scope_id = 0; + struct net_device *dev; + + dev = dev_get_by_name(&init_net, p); + if (dev != NULL) { + scope_id = dev->ifindex; + dev_put(dev); + } else { + /* scope_id is set to zero on error */ + strict_strtoul(p, 10, &scope_id); + } + + kfree(p); + sin6->sin6_scope_id = scope_id; + dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); + } +} + static void nfs_parse_ipv6_address(char *string, size_t str_len, struct sockaddr *sap, size_t *addr_len) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; u8 *addr = (u8 *)&sin6->sin6_addr.in6_u; + const char *delim; if (str_len <= INET6_ADDRSTRLEN) { dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n", @@ -738,8 +775,10 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len, sin6->sin6_family = AF_INET6; *addr_len = sizeof(*sin6); - if (in6_pton(string, str_len, addr, '\0', NULL)) + if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) { + nfs_parse_ipv6_scope_id(string, str_len, delim, sin6); return; + } } sap->sa_family = AF_UNSPEC; From 77e03677ac76d413fbb6d6caaffa1d64877a0c2a Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 24 Jun 2008 20:25:57 +0300 Subject: [PATCH 46/82] nfs: initialize timeout variable in nfs4_proc_setclientid_confirm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc (4.3.0) rightfully warns about this: /usr0/export/dev/bhalevy/git/linux-pnfs-bh-nfs41/fs/nfs/nfs4proc.c: In function ‘nfs4_proc_setclientid_confirm’: /usr0/export/dev/bhalevy/git/linux-pnfs-bh-nfs41/fs/nfs/nfs4proc.c:2936: warning: ‘timeout’ may be used uninitialized in this function nfs4_delay that's passed a pointer to 'timeout' is looking at its value and sets it up to some value in the range: NFS4_POLL_RETRY_MIN..NFS4_POLL_RETRY_MAX if (*timeout <= 0) *timeout = NFS4_POLL_RETRY_MIN; if (*timeout > NFS4_POLL_RETRY_MAX) *timeout = NFS4_POLL_RETRY_MAX; Therefore it will end up set to some sane, though rather indeterministic, value. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 10f01c05a4e4..4451287a81d1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2924,7 +2924,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) { - long timeout; + long timeout = 0; int err; do { err = _nfs4_proc_setclientid_confirm(clp, cred); From ee84dfc45467fd8e5ce04fa2813d98e0aebe465c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 11 Jun 2008 10:03:10 -0400 Subject: [PATCH 47/82] nfs4: remove BKL from nfs_callback_up and nfs_callback_down The nfs_callback_mutex is sufficient protection. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index c1e7c8300629..9e713d2d5d74 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -105,7 +105,6 @@ int nfs_callback_up(void) struct svc_rqst *rqstp; int ret = 0; - lock_kernel(); mutex_lock(&nfs_callback_mutex); if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; @@ -149,7 +148,6 @@ out: if (serv) svc_destroy(serv); mutex_unlock(&nfs_callback_mutex); - unlock_kernel(); return ret; out_err: dprintk("Couldn't create callback socket or server thread; err = %d\n", @@ -163,13 +161,11 @@ out_err: */ void nfs_callback_down(void) { - lock_kernel(); mutex_lock(&nfs_callback_mutex); nfs_callback_info.users--; if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) kthread_stop(nfs_callback_info.task); mutex_unlock(&nfs_callback_mutex); - unlock_kernel(); } static int nfs_callback_authenticate(struct svc_rqst *rqstp) From 5afc597c5f0bd184457e49b9a330fcb37b69db11 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 11 Jun 2008 10:03:11 -0400 Subject: [PATCH 48/82] nfs4: fix potential race with rapid nfs_callback_up/down cycle If the nfsv4 callback thread is rapidly brought up and down, it's possible that nfs_callback_svc might never get a chance to run. If this happens, the cleanup at thread exit might never occur, throwing the refcounting off and nfs_callback_info in an incorrect state. Move the clean functions into nfs_callback_down. Also change the nfs_callback_info struct to track the svc_rqst rather than svc_serv since we need to know that to call svc_exit_thread. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 9e713d2d5d74..f447f4b4476c 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -27,7 +27,7 @@ struct nfs_callback_data { unsigned int users; - struct svc_serv *serv; + struct svc_rqst *rqst; struct task_struct *task; }; @@ -91,18 +91,15 @@ nfs_callback_svc(void *vrqstp) svc_process(rqstp); } unlock_kernel(); - nfs_callback_info.task = NULL; - svc_exit_thread(rqstp); return 0; } /* - * Bring up the server process if it is not already up. + * Bring up the callback thread if it is not already up. */ int nfs_callback_up(void) { struct svc_serv *serv = NULL; - struct svc_rqst *rqstp; int ret = 0; mutex_lock(&nfs_callback_mutex); @@ -120,22 +117,23 @@ int nfs_callback_up(void) nfs_callback_tcpport = ret; dprintk("Callback port = 0x%x\n", nfs_callback_tcpport); - rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); - if (IS_ERR(rqstp)) { - ret = PTR_ERR(rqstp); + nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); + if (IS_ERR(nfs_callback_info.rqst)) { + ret = PTR_ERR(nfs_callback_info.rqst); + nfs_callback_info.rqst = NULL; goto out_err; } svc_sock_update_bufs(serv); - nfs_callback_info.serv = serv; - nfs_callback_info.task = kthread_run(nfs_callback_svc, rqstp, + nfs_callback_info.task = kthread_run(nfs_callback_svc, + nfs_callback_info.rqst, "nfsv4-svc"); if (IS_ERR(nfs_callback_info.task)) { ret = PTR_ERR(nfs_callback_info.task); - nfs_callback_info.serv = NULL; + svc_exit_thread(nfs_callback_info.rqst); + nfs_callback_info.rqst = NULL; nfs_callback_info.task = NULL; - svc_exit_thread(rqstp); goto out_err; } out: @@ -157,14 +155,18 @@ out_err: } /* - * Kill the server process if it is not already down. + * Kill the callback thread if it's no longer being used. */ void nfs_callback_down(void) { mutex_lock(&nfs_callback_mutex); nfs_callback_info.users--; - if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) + if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) { kthread_stop(nfs_callback_info.task); + svc_exit_thread(nfs_callback_info.rqst); + nfs_callback_info.rqst = NULL; + nfs_callback_info.task = NULL; + } mutex_unlock(&nfs_callback_mutex); } From 877fcf103982e52a59a1035378b4c0b8c63fe004 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 25 Jun 2008 17:24:23 -0400 Subject: [PATCH 49/82] SUNRPC: More useful debugging output for rpcb client Clean up dprintk's in rpcb client's XDR decoder functions. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index e6fb21b19b86..cf2b91613ac6 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -438,7 +438,7 @@ static int rpcb_decode_getport(struct rpc_rqst *req, __be32 *p, unsigned short *portp) { *portp = (unsigned short) ntohl(*p++); - dprintk("RPC: rpcb_decode_getport result %u\n", + dprintk("RPC: rpcb_decode_getport result %u\n", *portp); return 0; } @@ -447,8 +447,8 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p, unsigned int *boolp) { *boolp = (unsigned int) ntohl(*p++); - dprintk("RPC: rpcb_decode_set result %u\n", - *boolp); + dprintk("RPC: rpcb_decode_set: call %s\n", + (*boolp ? "succeeded" : "failed")); return 0; } From fc200e794d723bc88c39859e8f096b717532f9c9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 25 Jun 2008 17:24:31 -0400 Subject: [PATCH 50/82] SUNRPC: Document some naked integers in rpcbind client Clean up: Replace naked integers that represent rpcbind protocol versions. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 49 +++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index cf2b91613ac6..b23a719aca30 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -32,6 +32,10 @@ #define RPCBIND_PROGRAM (100000u) #define RPCBIND_PORT (111u) +#define RPCBVERS_2 (2u) +#define RPCBVERS_3 (3u) +#define RPCBVERS_4 (4u) + enum { RPCBPROC_NULL, RPCBPROC_SET, @@ -82,7 +86,7 @@ static struct rpc_procinfo rpcb_procedures2[]; static struct rpc_procinfo rpcb_procedures3[]; struct rpcb_info { - int rpc_vers; + u32 rpc_vers; struct rpc_procinfo * rpc_proc; }; @@ -177,7 +181,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) prog, vers, prot, port); rpcb_clnt = rpcb_create("localhost", (struct sockaddr *) &sin, - sizeof(sin), XPRT_TRANSPORT_UDP, 2, 1); + sizeof(sin), XPRT_TRANSPORT_UDP, RPCBVERS_2, 1); if (IS_ERR(rpcb_clnt)) return PTR_ERR(rpcb_clnt); @@ -227,7 +231,7 @@ int rpcb_getport_sync(struct sockaddr_in *sin, u32 prog, u32 vers, int prot) __func__, NIPQUAD(sin->sin_addr.s_addr), prog, vers, prot); rpcb_clnt = rpcb_create(NULL, (struct sockaddr *)sin, - sizeof(*sin), prot, 2, 0); + sizeof(*sin), prot, RPCBVERS_2, 0); if (IS_ERR(rpcb_clnt)) return PTR_ERR(rpcb_clnt); @@ -588,35 +592,54 @@ static struct rpc_procinfo rpcb_procedures4[] = { static struct rpcb_info rpcb_next_version[] = { #ifdef CONFIG_SUNRPC_BIND34 - { 4, &rpcb_procedures4[RPCBPROC_GETVERSADDR] }, - { 3, &rpcb_procedures3[RPCBPROC_GETADDR] }, + { + .rpc_vers = RPCBVERS_4, + .rpc_proc = &rpcb_procedures4[RPCBPROC_GETVERSADDR], + }, + { + .rpc_vers = RPCBVERS_3, + .rpc_proc = &rpcb_procedures3[RPCBPROC_GETADDR], + }, #endif - { 2, &rpcb_procedures2[RPCBPROC_GETPORT] }, - { 0, NULL }, + { + .rpc_vers = RPCBVERS_2, + .rpc_proc = &rpcb_procedures2[RPCBPROC_GETPORT], + }, + { + .rpc_proc = NULL, + }, }; static struct rpcb_info rpcb_next_version6[] = { #ifdef CONFIG_SUNRPC_BIND34 - { 4, &rpcb_procedures4[RPCBPROC_GETVERSADDR] }, - { 3, &rpcb_procedures3[RPCBPROC_GETADDR] }, + { + .rpc_vers = RPCBVERS_4, + .rpc_proc = &rpcb_procedures4[RPCBPROC_GETVERSADDR], + }, + { + .rpc_vers = RPCBVERS_3, + .rpc_proc = &rpcb_procedures3[RPCBPROC_GETADDR], + }, #endif - { 0, NULL }, + { + .rpc_proc = NULL, + }, }; static struct rpc_version rpcb_version2 = { - .number = 2, + .number = RPCBVERS_2, .nrprocs = RPCB_HIGHPROC_2, .procs = rpcb_procedures2 }; static struct rpc_version rpcb_version3 = { - .number = 3, + .number = RPCBVERS_3, .nrprocs = RPCB_HIGHPROC_3, .procs = rpcb_procedures3 }; static struct rpc_version rpcb_version4 = { - .number = 4, + .number = RPCBVERS_4, .nrprocs = RPCB_HIGHPROC_4, .procs = rpcb_procedures4 }; From 6a774051573042cdeb57e81f77b36c25e5856739 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 25 Jun 2008 17:24:39 -0400 Subject: [PATCH 51/82] SUNRPC: Use rpcbind version 2 GETPORT Clean up: Change the version 2 procedure name to GETPORT. It's the same procedure number as GETADDR, but version 2 implementations usually refer to it as GETPORT. This also now matches the procedure name used in the version 2 procedure entry in the rpcb_next_version[] array, making it slightly less confusing. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index b23a719aca30..a70cc1e11792 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -575,7 +575,7 @@ out_err: static struct rpc_procinfo rpcb_procedures2[] = { PROC(SET, mapping, set), PROC(UNSET, mapping, set), - PROC(GETADDR, mapping, getport), + PROC(GETPORT, mapping, getport), }; static struct rpc_procinfo rpcb_procedures3[] = { From 8842413aa4c3220ce9313791f99808fc149ca16d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 25 Jun 2008 17:24:47 -0400 Subject: [PATCH 52/82] SUNRPC: Use GETADDR for rpcbind version 4 queries Some rpcbind servers that do support rpcbind version 4 do not support the GETVERSADDR procedure. Use GETADDR for querying rpcbind servers via rpcbind version 4 instead of GETVERSADDR. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index a70cc1e11792..625ba72e624a 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -587,6 +587,7 @@ static struct rpc_procinfo rpcb_procedures3[] = { static struct rpc_procinfo rpcb_procedures4[] = { PROC(SET, mapping, set), PROC(UNSET, mapping, set), + PROC(GETADDR, getaddr, getaddr), PROC(GETVERSADDR, getaddr, getaddr), }; @@ -594,7 +595,7 @@ static struct rpcb_info rpcb_next_version[] = { #ifdef CONFIG_SUNRPC_BIND34 { .rpc_vers = RPCBVERS_4, - .rpc_proc = &rpcb_procedures4[RPCBPROC_GETVERSADDR], + .rpc_proc = &rpcb_procedures4[RPCBPROC_GETADDR], }, { .rpc_vers = RPCBVERS_3, @@ -614,7 +615,7 @@ static struct rpcb_info rpcb_next_version6[] = { #ifdef CONFIG_SUNRPC_BIND34 { .rpc_vers = RPCBVERS_4, - .rpc_proc = &rpcb_procedures4[RPCBPROC_GETVERSADDR], + .rpc_proc = &rpcb_procedures4[RPCBPROC_GETADDR], }, { .rpc_vers = RPCBVERS_3, From 40fef8a649e5344bfb6a67a7cc3def3e0dad6448 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 25 Jun 2008 17:24:54 -0400 Subject: [PATCH 53/82] SUNRPC: Use only rpcbind v2 for AF_INET requests Some server vendors support the higher versions of rpcbind only for AF_INET6. The kernel doesn't need to use v3 or v4 for AF_INET anyway, so change the kernel's rpcbind client to query AF_INET servers over rpcbind v2 only. This has a few interesting benefits: 1. If the rpcbind request is going over TCP, and the server doesn't support rpcbind versions 3 or 4, the client reduces by two the number of ephemeral ports left in TIME_WAIT for each rpcbind request. This will help during NFS mount storms. 2. The rpcbind interaction with servers that don't support rpcbind versions 3 or 4 will use less network traffic. Also helpful during mount storms. 3. We can eliminate the kernel build option that controls whether the kernel's rpcbind client uses rpcbind version 3 and 4 for AF_INET servers. Less complicated kernel configuration... Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/Kconfig | 21 --------------------- net/sunrpc/rpcb_clnt.c | 12 ------------ 2 files changed, 33 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index 1c16de9611e9..0ce72dcd6b96 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1799,27 +1799,6 @@ config SUNRPC_XPRT_RDMA If unsure, say N. -config SUNRPC_BIND34 - bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)" - depends on SUNRPC && EXPERIMENTAL - default n - help - RPC requests over IPv6 networks require support for larger - addresses when performing an RPC bind. Sun added support for - IPv6 addressing by creating two new versions of the rpcbind - protocol (RFC 1833). - - This option enables support in the kernel RPC client for - querying rpcbind servers via versions 3 and 4 of the rpcbind - protocol. The kernel automatically falls back to version 2 - if a remote rpcbind service does not support versions 3 or 4. - By themselves, these new versions do not provide support for - RPC over IPv6, but the new protocol versions are necessary to - support it. - - If unsure, say N to get traditional behavior (version 2 rpcbind - requests only). - config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 625ba72e624a..c62e446723ae 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -592,16 +592,6 @@ static struct rpc_procinfo rpcb_procedures4[] = { }; static struct rpcb_info rpcb_next_version[] = { -#ifdef CONFIG_SUNRPC_BIND34 - { - .rpc_vers = RPCBVERS_4, - .rpc_proc = &rpcb_procedures4[RPCBPROC_GETADDR], - }, - { - .rpc_vers = RPCBVERS_3, - .rpc_proc = &rpcb_procedures3[RPCBPROC_GETADDR], - }, -#endif { .rpc_vers = RPCBVERS_2, .rpc_proc = &rpcb_procedures2[RPCBPROC_GETPORT], @@ -612,7 +602,6 @@ static struct rpcb_info rpcb_next_version[] = { }; static struct rpcb_info rpcb_next_version6[] = { -#ifdef CONFIG_SUNRPC_BIND34 { .rpc_vers = RPCBVERS_4, .rpc_proc = &rpcb_procedures4[RPCBPROC_GETADDR], @@ -621,7 +610,6 @@ static struct rpcb_info rpcb_next_version6[] = { .rpc_vers = RPCBVERS_3, .rpc_proc = &rpcb_procedures3[RPCBPROC_GETADDR], }, -#endif { .rpc_proc = NULL, }, From 259875efed06d6936f54c9a264e868937f1bc217 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 2 Jul 2008 14:43:47 -0400 Subject: [PATCH 54/82] NFS: set transport defaults after mount option parsing is finished Move the UDP/TCP default timeo/retrans settings for text mounts to nfs_init_timeout_values(), which was were they were always being initialised (and sanity checked) for binary mounts. Document the default timeout values using appropriate #defines. Ensure that we initialise and sanity check the transport protocols that may have been specified by the user. Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 13 +++++---- fs/nfs/super.c | 60 +++++++++++++++++++++++++++++------------- include/linux/nfs_fs.h | 5 ++++ 3 files changed, 55 insertions(+), 23 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f2a092ca69b5..5ee23e7058b3 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -431,14 +431,14 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, { to->to_initval = timeo * HZ / 10; to->to_retries = retrans; - if (!to->to_retries) - to->to_retries = 2; switch (proto) { case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_RDMA: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_TCP_RETRANS; if (to->to_initval == 0) - to->to_initval = 60 * HZ; + to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; to->to_increment = to->to_initval; @@ -450,14 +450,17 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, to->to_exponential = 0; break; case XPRT_TRANSPORT_UDP: - default: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_UDP_RETRANS; if (!to->to_initval) - to->to_initval = 11 * HZ / 10; + to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_UDP_TIMEOUT) to->to_initval = NFS_MAX_UDP_TIMEOUT; to->to_maxval = NFS_MAX_UDP_TIMEOUT; to->to_exponential = 1; break; + default: + BUG(); } } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 73a8e5970f02..9c1a960f5b94 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -816,6 +816,43 @@ static void nfs_parse_ip_address(char *string, size_t str_len, nfs_parse_ipv4_address(string, str_len, sap, addr_len); } +/* + * Sanity check the NFS transport protocol. + * + */ +static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt) +{ + switch (mnt->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + break; + default: + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; + } +} + +/* + * For text based NFSv2/v3 mounts, the mount protocol transport default + * settings should depend upon the specified NFS transport. + */ +static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) +{ + nfs_validate_transport_protocol(mnt); + + if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP || + mnt->mount_server.protocol == XPRT_TRANSPORT_TCP) + return; + switch (mnt->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; + break; + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; + } +} + /* * Error-check and convert a string of mount options from user space into * a data structure @@ -896,20 +933,14 @@ static int nfs_parse_mount_options(char *raw, case Opt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; - mnt->timeo = 7; - mnt->retrans = 5; break; case Opt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; - mnt->timeo = 600; - mnt->retrans = 2; break; case Opt_rdma: mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - mnt->timeo = 600; - mnt->retrans = 2; break; case Opt_acl: mnt->flags &= ~NFS_MOUNT_NOACL; @@ -1103,21 +1134,15 @@ static int nfs_parse_mount_options(char *raw, case Opt_xprt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; - mnt->timeo = 7; - mnt->retrans = 5; break; case Opt_xprt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; - mnt->timeo = 600; - mnt->retrans = 2; break; case Opt_xprt_rdma: /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - mnt->timeo = 600; - mnt->retrans = 2; break; default: goto out_unrec_xprt; @@ -1438,14 +1463,11 @@ static int nfs_validate_mount_data(void *options, args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP); args->rsize = NFS_MAX_FILE_IO_SIZE; args->wsize = NFS_MAX_FILE_IO_SIZE; - args->timeo = 600; - args->retrans = 2; args->acregmin = 3; args->acregmax = 60; args->acdirmin = 30; args->acdirmax = 60; args->mount_server.port = 0; /* autobind unless user sets port */ - args->mount_server.protocol = XPRT_TRANSPORT_UDP; args->nfs_server.port = 0; /* autobind unless user sets port */ args->nfs_server.protocol = XPRT_TRANSPORT_TCP; @@ -1546,6 +1568,8 @@ static int nfs_validate_mount_data(void *options, &args->nfs_server.address)) goto out_no_address; + nfs_set_mount_transport_protocol(args); + status = nfs_parse_devname(dev_name, &args->nfs_server.hostname, PAGE_SIZE, @@ -2095,14 +2119,11 @@ static int nfs4_validate_mount_data(void *options, args->rsize = NFS_MAX_FILE_IO_SIZE; args->wsize = NFS_MAX_FILE_IO_SIZE; - args->timeo = 600; - args->retrans = 2; args->acregmin = 3; args->acregmax = 60; args->acdirmin = 30; args->acdirmax = 60; args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ - args->nfs_server.protocol = XPRT_TRANSPORT_TCP; switch (data->version) { case 1: @@ -2163,6 +2184,7 @@ static int nfs4_validate_mount_data(void *options, args->acdirmin = data->acdirmin; args->acdirmax = data->acdirmax; args->nfs_server.protocol = data->proto; + nfs_validate_transport_protocol(args); break; default: { @@ -2175,6 +2197,8 @@ static int nfs4_validate_mount_data(void *options, &args->nfs_server.address)) return -EINVAL; + nfs_validate_transport_protocol(args); + switch (args->auth_flavor_len) { case 0: args->auth_flavors[0] = RPC_AUTH_UNIX; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 27d6a8d98cef..830d9cc8cdce 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -12,6 +12,11 @@ #include /* Default timeout values */ +#define NFS_DEF_UDP_TIMEO (11) +#define NFS_DEF_UDP_RETRANS (3) +#define NFS_DEF_TCP_TIMEO (600) +#define NFS_DEF_TCP_RETRANS (2) + #define NFS_MAX_UDP_TIMEOUT (60*HZ) #define NFS_MAX_TCP_TIMEOUT (600*HZ) From ed596a8adb7964cf9d2f7682f9cf2c37322a775d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Jun 2008 17:47:05 -0400 Subject: [PATCH 55/82] NFS: Move the nfs_set_port() call out of nfs_parse_mount_options() The remount path does not need to set the port in the server address. Since it's not really a part of option parsing, move the nfs_set_port() call to nfs_parse_mount_options()'s callers. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 9c1a960f5b94..de424d2f3155 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1214,9 +1214,6 @@ static int nfs_parse_mount_options(char *raw, } } - nfs_set_port((struct sockaddr *)&mnt->nfs_server.address, - mnt->nfs_server.port); - return 1; out_nomem: @@ -1568,6 +1565,9 @@ static int nfs_validate_mount_data(void *options, &args->nfs_server.address)) goto out_no_address; + nfs_set_port((struct sockaddr *)&args->nfs_server.address, + args->nfs_server.port); + nfs_set_mount_transport_protocol(args); status = nfs_parse_devname(dev_name, @@ -2197,6 +2197,9 @@ static int nfs4_validate_mount_data(void *options, &args->nfs_server.address)) return -EINVAL; + nfs_set_port((struct sockaddr *)&args->nfs_server.address, + args->nfs_server.port); + nfs_validate_transport_protocol(args); switch (args->auth_flavor_len) { From 0e0cab744b17a70ef0f08d818d66935feade7cad Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Jun 2008 17:47:12 -0400 Subject: [PATCH 56/82] NFS: use documenting macro constants for initializing ac{reg, dir}{min, max} Clean up. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfsroot.c | 8 ++++---- fs/nfs/super.c | 24 ++++++++++++------------ include/linux/nfs_fs.h | 5 +++++ 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 15afe3a6c5be..46763d1cd397 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -295,10 +295,10 @@ static int __init root_nfs_name(char *name) nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; nfs_data.wsize = NFS_DEF_FILE_IO_SIZE; - nfs_data.acregmin = 3; - nfs_data.acregmax = 60; - nfs_data.acdirmin = 30; - nfs_data.acdirmax = 60; + nfs_data.acregmin = NFS_DEF_ACREGMIN; + nfs_data.acregmax = NFS_DEF_ACREGMAX; + nfs_data.acdirmin = NFS_DEF_ACDIRMIN; + nfs_data.acdirmax = NFS_DEF_ACDIRMAX; strcpy(buf, NFS_ROOT); /* Process options received from the remote server */ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index de424d2f3155..ebed63e0ff8e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -518,13 +518,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, if (nfss->bsize != 0) seq_printf(m, ",bsize=%u", nfss->bsize); seq_printf(m, ",namlen=%u", nfss->namelen); - if (nfss->acregmin != 3*HZ || showdefaults) + if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults) seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ); - if (nfss->acregmax != 60*HZ || showdefaults) + if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults) seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ); - if (nfss->acdirmin != 30*HZ || showdefaults) + if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults) seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ); - if (nfss->acdirmax != 60*HZ || showdefaults) + if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults) seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ); for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { if (nfss->flags & nfs_infop->flag) @@ -1460,10 +1460,10 @@ static int nfs_validate_mount_data(void *options, args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP); args->rsize = NFS_MAX_FILE_IO_SIZE; args->wsize = NFS_MAX_FILE_IO_SIZE; - args->acregmin = 3; - args->acregmax = 60; - args->acdirmin = 30; - args->acdirmax = 60; + args->acregmin = NFS_DEF_ACREGMIN; + args->acregmax = NFS_DEF_ACREGMAX; + args->acdirmin = NFS_DEF_ACDIRMIN; + args->acdirmax = NFS_DEF_ACDIRMAX; args->mount_server.port = 0; /* autobind unless user sets port */ args->nfs_server.port = 0; /* autobind unless user sets port */ args->nfs_server.protocol = XPRT_TRANSPORT_TCP; @@ -2119,10 +2119,10 @@ static int nfs4_validate_mount_data(void *options, args->rsize = NFS_MAX_FILE_IO_SIZE; args->wsize = NFS_MAX_FILE_IO_SIZE; - args->acregmin = 3; - args->acregmax = 60; - args->acdirmin = 30; - args->acdirmax = 60; + args->acregmin = NFS_DEF_ACREGMIN; + args->acregmax = NFS_DEF_ACREGMAX; + args->acdirmin = NFS_DEF_ACDIRMIN; + args->acdirmax = NFS_DEF_ACDIRMAX; args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ switch (data->version) { diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 830d9cc8cdce..29d261918734 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -20,6 +20,11 @@ #define NFS_MAX_UDP_TIMEOUT (60*HZ) #define NFS_MAX_TCP_TIMEOUT (600*HZ) +#define NFS_DEF_ACREGMIN (3) +#define NFS_DEF_ACREGMAX (60) +#define NFS_DEF_ACDIRMIN (30) +#define NFS_DEF_ACDIRMAX (60) + /* * When flushing a cluster of dirty pages, there can be different * strategies: From 01060c896e3e1ef53dcb914301c186932cd31b81 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 24 Jun 2008 16:33:38 -0400 Subject: [PATCH 57/82] NFS: Refactor logic for parsing NFS security flavor mount options Clean up: Refactor the NFS mount option parsing function to extract the security flavor parsing logic into a separate function. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 143 +++++++++++++++++++++++++++---------------------- 1 file changed, 78 insertions(+), 65 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ebed63e0ff8e..6eb145ea71ac 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -853,6 +853,82 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) } } +/* + * Parse the value of the 'sec=' option. + * + * The flags setting is for v2/v3. The flavor_len setting is for v4. + * v2/v3 also need to know the difference between NULL and UNIX. + */ +static int nfs_parse_security_flavors(char *value, + struct nfs_parsed_mount_data *mnt) +{ + substring_t args[MAX_OPT_ARGS]; + + dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); + + switch (match_token(value, nfs_secflavor_tokens, args)) { + case Opt_sec_none: + mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 0; + mnt->auth_flavors[0] = RPC_AUTH_NULL; + break; + case Opt_sec_sys: + mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 0; + mnt->auth_flavors[0] = RPC_AUTH_UNIX; + break; + case Opt_sec_krb5: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; + break; + case Opt_sec_krb5i: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; + break; + case Opt_sec_krb5p: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; + break; + case Opt_sec_lkey: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; + break; + case Opt_sec_lkeyi: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; + break; + case Opt_sec_lkeyp: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; + break; + case Opt_sec_spkm: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; + break; + case Opt_sec_spkmi: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; + break; + case Opt_sec_spkmp: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; + break; + default: + return 0; + } + + return 1; +} + /* * Error-check and convert a string of mount options from user space into * a data structure @@ -1054,73 +1130,10 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - token = match_token(string, nfs_secflavor_tokens, args); + rc = nfs_parse_security_flavors(string, mnt); kfree(string); - - /* - * The flags setting is for v2/v3. The flavor_len - * setting is for v4. v2/v3 also need to know the - * difference between NULL and UNIX. - */ - switch (token) { - case Opt_sec_none: - mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 0; - mnt->auth_flavors[0] = RPC_AUTH_NULL; - break; - case Opt_sec_sys: - mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 0; - mnt->auth_flavors[0] = RPC_AUTH_UNIX; - break; - case Opt_sec_krb5: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; - break; - case Opt_sec_krb5i: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; - break; - case Opt_sec_krb5p: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; - break; - case Opt_sec_lkey: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; - break; - case Opt_sec_lkeyi: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; - break; - case Opt_sec_lkeyp: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; - break; - case Opt_sec_spkm: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; - break; - case Opt_sec_spkmi: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; - break; - case Opt_sec_spkmp: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; - break; - default: + if (!rc) goto out_unrec_sec; - } break; case Opt_proto: string = match_strdup(args); From dd07c94750cb1ee4449fb0db06623e1865b3e26e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 24 Jun 2008 16:33:46 -0400 Subject: [PATCH 58/82] NFS: Set security flavor default for NFSv2/3 mounts like other defaults Set the default security flavor when we set the other mount option default values. After this change, only the legacy user-space mount path needs to set the NFS_MOUNT_SECFLAVOUR flag. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 6eb145ea71ac..2f7e7f20c917 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -856,8 +856,7 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) /* * Parse the value of the 'sec=' option. * - * The flags setting is for v2/v3. The flavor_len setting is for v4. - * v2/v3 also need to know the difference between NULL and UNIX. + * The flavor_len setting is for v4 mounts. */ static int nfs_parse_security_flavors(char *value, struct nfs_parsed_mount_data *mnt) @@ -868,57 +867,46 @@ static int nfs_parse_security_flavors(char *value, switch (match_token(value, nfs_secflavor_tokens, args)) { case Opt_sec_none: - mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 0; mnt->auth_flavors[0] = RPC_AUTH_NULL; break; case Opt_sec_sys: - mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 0; mnt->auth_flavors[0] = RPC_AUTH_UNIX; break; case Opt_sec_krb5: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; break; case Opt_sec_krb5i: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; break; case Opt_sec_krb5p: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; break; case Opt_sec_lkey: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; break; case Opt_sec_lkeyi: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; break; case Opt_sec_lkeyp: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; break; case Opt_sec_spkm: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; break; case Opt_sec_spkmi: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; break; case Opt_sec_spkmp: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; break; @@ -1480,6 +1468,7 @@ static int nfs_validate_mount_data(void *options, args->mount_server.port = 0; /* autobind unless user sets port */ args->nfs_server.port = 0; /* autobind unless user sets port */ args->nfs_server.protocol = XPRT_TRANSPORT_TCP; + args->auth_flavors[0] = RPC_AUTH_UNIX; switch (data->version) { case 1: @@ -1537,7 +1526,9 @@ static int nfs_validate_mount_data(void *options, args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); args->namlen = data->namlen; args->bsize = data->bsize; - args->auth_flavors[0] = data->pseudoflavor; + + if (data->flags & NFS_MOUNT_SECFLAVOUR) + args->auth_flavors[0] = data->pseudoflavor; if (!args->nfs_server.hostname) goto out_nomem; @@ -1601,9 +1592,6 @@ static int nfs_validate_mount_data(void *options, } } - if (!(args->flags & NFS_MOUNT_SECFLAVOUR)) - args->auth_flavors[0] = RPC_AUTH_UNIX; - #ifndef CONFIG_NFS_V3 if (args->flags & NFS_MOUNT_VER3) goto out_v3_not_compiled; From 6738b2512bdf93ffbefe108b38a9165d5a718c3f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 24 Jun 2008 16:33:54 -0400 Subject: [PATCH 59/82] NFS4: Set security flavor default for NFSv4 mounts like other defaults Set the default security flavor when we set the other mount option default values for NFSv4. This cleans up the NFSv4 mount option parsing path to look like the NFSv2/v3 one. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2f7e7f20c917..4bbdbf6de417 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2125,6 +2125,8 @@ static int nfs4_validate_mount_data(void *options, args->acdirmin = NFS_DEF_ACDIRMIN; args->acdirmax = NFS_DEF_ACDIRMAX; args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ + args->auth_flavors[0] = RPC_AUTH_UNIX; + args->auth_flavor_len = 0; switch (data->version) { case 1: @@ -2140,18 +2142,13 @@ static int nfs4_validate_mount_data(void *options, &args->nfs_server.address)) goto out_no_address; - switch (data->auth_flavourlen) { - case 0: - args->auth_flavors[0] = RPC_AUTH_UNIX; - break; - case 1: + if (data->auth_flavourlen) { + if (data->auth_flavourlen > 1) + goto out_inval_auth; if (copy_from_user(&args->auth_flavors[0], data->auth_flavours, sizeof(args->auth_flavors[0]))) return -EFAULT; - break; - default: - goto out_inval_auth; } c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); @@ -2203,15 +2200,8 @@ static int nfs4_validate_mount_data(void *options, nfs_validate_transport_protocol(args); - switch (args->auth_flavor_len) { - case 0: - args->auth_flavors[0] = RPC_AUTH_UNIX; - break; - case 1: - break; - default: + if (args->auth_flavor_len > 1) goto out_inval_auth; - } if (args->client_address == NULL) goto out_no_client_address; From f45663ce5fb30f76a3414ab3ac69f4dd320e760a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 24 Jun 2008 19:28:02 -0400 Subject: [PATCH 60/82] NFS: Allow either strict or sloppy mount option parsing The kernel's NFS client mount option parser currently doesn't allow unrecognized or incorrect mount options. This prevents misspellings or incorrectly specified mount options from possibly causing silent data corruption. However, NFS mount options are not standardized, so different operating systems can use differently spelled mount options to support similar features, or can support mount options which no other operating system supports. "Sloppy" mount option parsing, which allows the parser to ignore any option it doesn't recognize, is needed to support automounters that often use maps that are shared between heterogenous operating systems. The legacy mount command ignores the validity of the values of mount options entirely, except for the "sec=" and "proto=" options. If an incorrect value is specified, the out-of-range value is passed to the kernel; if a value is specified that contains non-numeric characters, it appears as though the legacy mount command sets that option to zero (probably incorrect behavior in general). In any case, this sets a precedent which we will partially follow for the kernel mount option parser: + if "sloppy" is not set, the parser will be strict about both unrecognized options (same as legacy) and invalid option values (stricter than legacy) + if "sloppy" is set, the parser will ignore unrecognized options and invalid option values (same as legacy) An "invalid" option value in this case means that either the type (integer, short, or string) or sign (for integer values) of the specified value is incorrect. This patch does two things: it changes the NFS client's mount option parsing loop so that it parses the whole string instead of failing at the first unrecognized option or invalid option value. An unrecognized option or an invalid option value cause the option to be skipped. Then, the patch adds a "sloppy" mount option that allows the parsing to succeed anyway if there were any problems during parsing. When parsing a set of options is complete, if there are errors and "sloppy" was specified, return success anyway. Otherwise, only return success if there are no errors. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 203 +++++++++++++++++++++++++++++++------------------ 1 file changed, 128 insertions(+), 75 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 4bbdbf6de417..47cf83e917be 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -92,8 +92,8 @@ enum { Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, Opt_addr, Opt_mountaddr, Opt_clientaddr, - /* Mount options that are ignored */ - Opt_userspace, Opt_deprecated, + /* Special mount options */ + Opt_userspace, Opt_deprecated, Opt_sloppy, Opt_err }; @@ -103,6 +103,8 @@ static match_table_t nfs_mount_option_tokens = { { Opt_userspace, "fg" }, { Opt_userspace, "retry=%s" }, + { Opt_sloppy, "sloppy" }, + { Opt_soft, "soft" }, { Opt_hard, "hard" }, { Opt_deprecated, "intr" }, @@ -917,15 +919,22 @@ static int nfs_parse_security_flavors(char *value, return 1; } +static void nfs_parse_invalid_value(const char *option) +{ + dfprintk(MOUNT, "NFS: bad value specified for %s option\n", option); +} + /* * Error-check and convert a string of mount options from user space into - * a data structure + * a data structure. The whole mount string is processed; bad options are + * skipped as they are encountered. If there were no errors, return 1; + * otherwise return 0 (zero). */ static int nfs_parse_mount_options(char *raw, struct nfs_parsed_mount_data *mnt) { char *p, *string, *secdata; - int rc; + int rc, sloppy = 0, errors = 0; if (!raw) { dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); @@ -958,6 +967,10 @@ static int nfs_parse_mount_options(char *raw, token = match_token(p, nfs_mount_option_tokens, args); switch (token) { + + /* + * boolean options: foo/nofoo + */ case Opt_soft: mnt->flags |= NFS_MOUNT_SOFT; break; @@ -1025,103 +1038,145 @@ static int nfs_parse_mount_options(char *raw, mnt->flags |= NFS_MOUNT_UNSHARED; break; + /* + * options that take numeric values + */ case Opt_port: - if (match_int(args, &option)) - return 0; - if (option < 0 || option > 65535) - return 0; - mnt->nfs_server.port = option; + if (match_int(args, &option) || + option < 0 || option > USHORT_MAX) { + errors++; + nfs_parse_invalid_value("port"); + } else + mnt->nfs_server.port = option; break; case Opt_rsize: - if (match_int(args, &mnt->rsize)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("rsize"); + } else + mnt->rsize = option; break; case Opt_wsize: - if (match_int(args, &mnt->wsize)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("wsize"); + } else + mnt->wsize = option; break; case Opt_bsize: - if (match_int(args, &option)) - return 0; - if (option < 0) - return 0; - mnt->bsize = option; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("bsize"); + } else + mnt->bsize = option; break; case Opt_timeo: - if (match_int(args, &mnt->timeo)) - return 0; + if (match_int(args, &option) || option <= 0) { + errors++; + nfs_parse_invalid_value("timeo"); + } else + mnt->timeo = option; break; case Opt_retrans: - if (match_int(args, &mnt->retrans)) - return 0; + if (match_int(args, &option) || option <= 0) { + errors++; + nfs_parse_invalid_value("retrans"); + } else + mnt->retrans = option; break; case Opt_acregmin: - if (match_int(args, &mnt->acregmin)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("acregmin"); + } else + mnt->acregmin = option; break; case Opt_acregmax: - if (match_int(args, &mnt->acregmax)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("acregmax"); + } else + mnt->acregmax = option; break; case Opt_acdirmin: - if (match_int(args, &mnt->acdirmin)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("acdirmin"); + } else + mnt->acdirmin = option; break; case Opt_acdirmax: - if (match_int(args, &mnt->acdirmax)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("acdirmax"); + } else + mnt->acdirmax = option; break; case Opt_actimeo: - if (match_int(args, &option)) - return 0; - if (option < 0) - return 0; - mnt->acregmin = - mnt->acregmax = - mnt->acdirmin = - mnt->acdirmax = option; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("actimeo"); + } else + mnt->acregmin = mnt->acregmax = + mnt->acdirmin = mnt->acdirmax = option; break; case Opt_namelen: - if (match_int(args, &mnt->namlen)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("namlen"); + } else + mnt->namlen = option; break; case Opt_mountport: - if (match_int(args, &option)) - return 0; - if (option < 0 || option > 65535) - return 0; - mnt->mount_server.port = option; + if (match_int(args, &option) || + option < 0 || option > USHORT_MAX) { + errors++; + nfs_parse_invalid_value("mountport"); + } else + mnt->mount_server.port = option; break; case Opt_mountvers: - if (match_int(args, &option)) - return 0; - if (option < 0) - return 0; - mnt->mount_server.version = option; + if (match_int(args, &option) || + option < NFS_MNT_VERSION || + option > NFS_MNT3_VERSION) { + errors++; + nfs_parse_invalid_value("mountvers"); + } else + mnt->mount_server.version = option; break; case Opt_nfsvers: - if (match_int(args, &option)) - return 0; + if (match_int(args, &option)) { + errors++; + nfs_parse_invalid_value("nfsvers"); + break; + } switch (option) { - case 2: + case NFS2_VERSION: mnt->flags &= ~NFS_MOUNT_VER3; break; - case 3: + case NFS3_VERSION: mnt->flags |= NFS_MOUNT_VER3; break; default: - goto out_unrec_vers; + errors++; + nfs_parse_invalid_value("nfsvers"); } break; + /* + * options that take text values + */ case Opt_sec: string = match_strdup(args); if (string == NULL) goto out_nomem; rc = nfs_parse_security_flavors(string, mnt); kfree(string); - if (!rc) - goto out_unrec_sec; + if (!rc) { + errors++; + dfprintk(MOUNT, "NFS: unrecognized " + "security flavor\n"); + } break; case Opt_proto: string = match_strdup(args); @@ -1146,7 +1201,9 @@ static int nfs_parse_mount_options(char *raw, mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; break; default: - goto out_unrec_xprt; + errors++; + dfprintk(MOUNT, "NFS: unrecognized " + "transport protocol\n"); } break; case Opt_mountproto: @@ -1166,7 +1223,9 @@ static int nfs_parse_mount_options(char *raw, break; case Opt_xprt_rdma: /* not used for side protocols */ default: - goto out_unrec_xprt; + errors++; + dfprintk(MOUNT, "NFS: unrecognized " + "transport protocol\n"); } break; case Opt_addr: @@ -1204,6 +1263,13 @@ static int nfs_parse_mount_options(char *raw, kfree(string); break; + /* + * Special options + */ + case Opt_sloppy: + sloppy = 1; + dfprintk(MOUNT, "NFS: relaxing parsing rules\n"); + break; case Opt_userspace: case Opt_deprecated: dfprintk(MOUNT, "NFS: ignoring mount option " @@ -1211,7 +1277,9 @@ static int nfs_parse_mount_options(char *raw, break; default: - goto out_unknown; + errors++; + dfprintk(MOUNT, "NFS: unrecognized mount option " + "'%s'\n", p); } } @@ -1224,21 +1292,6 @@ out_security_failure: free_secdata(secdata); printk(KERN_INFO "NFS: security options invalid: %d\n", rc); return 0; -out_unrec_vers: - printk(KERN_INFO "NFS: unrecognized NFS version number\n"); - return 0; - -out_unrec_xprt: - printk(KERN_INFO "NFS: unrecognized transport protocol\n"); - return 0; - -out_unrec_sec: - printk(KERN_INFO "NFS: unrecognized security flavor\n"); - return 0; - -out_unknown: - printk(KERN_INFO "NFS: unknown mount option: %s\n", p); - return 0; } /* From 381ba74af55e58bca4c01553835a360a9f6fbb07 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 7 Jul 2008 12:18:53 -0400 Subject: [PATCH 61/82] SUNRPC: Ensure our task is notified when an rpcbind call is done If another task is busy in rpcb_getport_async number, it is more efficient to have it wake us up when it has finished instead of arbitrarily sleeping for 5 seconds. Also ensure that rpcb_wake_rpcbind_waiters() is called regardless of whether or not rpcb_getport_done() gets called. Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 8 +++----- net/sunrpc/rpcb_clnt.c | 38 ++++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 09631f6e30e9..76739e928d0d 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -942,11 +942,9 @@ call_bind_status(struct rpc_task *task) } switch (task->tk_status) { - case -EAGAIN: - dprintk("RPC: %5u rpcbind waiting for another request " - "to finish\n", task->tk_pid); - /* avoid busy-waiting here -- could be a network outage. */ - rpc_delay(task, 5*HZ); + case -ENOMEM: + dprintk("RPC: %5u rpcbind out of memory\n", task->tk_pid); + rpc_delay(task, HZ >> 2); goto retry_timeout; case -EACCES: dprintk("RPC: %5u remote rpcbind: RPC program/version " diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index c62e446723ae..24e93e0a0a22 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -68,6 +68,7 @@ enum { #define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING) static void rpcb_getport_done(struct rpc_task *, void *); +static void rpcb_map_release(void *data); static struct rpc_program rpcb_program; struct rpcbind_args { @@ -80,6 +81,8 @@ struct rpcbind_args { const char * r_netid; const char * r_addr; const char * r_owner; + + int r_status; }; static struct rpc_procinfo rpcb_procedures2[]; @@ -93,14 +96,6 @@ struct rpcb_info { static struct rpcb_info rpcb_next_version[]; static struct rpcb_info rpcb_next_version6[]; -static void rpcb_map_release(void *data) -{ - struct rpcbind_args *map = data; - - xprt_put(map->r_xprt); - kfree(map); -} - static const struct rpc_call_ops rpcb_getport_ops = { .rpc_call_done = rpcb_getport_done, .rpc_release = rpcb_map_release, @@ -112,6 +107,15 @@ static void rpcb_wake_rpcbind_waiters(struct rpc_xprt *xprt, int status) rpc_wake_up_status(&xprt->binding, status); } +static void rpcb_map_release(void *data) +{ + struct rpcbind_args *map = data; + + rpcb_wake_rpcbind_waiters(map->r_xprt, map->r_status); + xprt_put(map->r_xprt); + kfree(map); +} + static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, size_t salen, int proto, u32 version, int privileged) @@ -293,17 +297,16 @@ void rpcb_getport_async(struct rpc_task *task) /* Autobind on cloned rpc clients is discouraged */ BUG_ON(clnt->cl_parent != clnt); + /* Put self on the wait queue to ensure we get notified if + * some other task is already attempting to bind the port */ + rpc_sleep_on(&xprt->binding, task, NULL); + if (xprt_test_and_set_binding(xprt)) { - status = -EAGAIN; /* tell caller to check again */ dprintk("RPC: %5u %s: waiting for another binder\n", task->tk_pid, __func__); - goto bailout_nowake; + return; } - /* Put self on queue before sending rpcbind request, in case - * rpcb_getport_done completes before we return from rpc_run_task */ - rpc_sleep_on(&xprt->binding, task, NULL); - /* Someone else may have bound if we slept */ if (xprt_bound(xprt)) { status = 0; @@ -365,15 +368,15 @@ void rpcb_getport_async(struct rpc_task *task) map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID); map->r_addr = rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR); map->r_owner = RPCB_OWNER_STRING; /* ignored for GETADDR */ + map->r_status = -EIO; child = rpcb_call_async(rpcb_clnt, map, proc); rpc_release_client(rpcb_clnt); if (IS_ERR(child)) { - status = -EIO; /* rpcb_map_release() has freed the arguments */ dprintk("RPC: %5u %s: rpc_run_task failed\n", task->tk_pid, __func__); - goto bailout_nofree; + return; } rpc_put_task(child); @@ -382,7 +385,6 @@ void rpcb_getport_async(struct rpc_task *task) bailout_nofree: rpcb_wake_rpcbind_waiters(xprt, status); -bailout_nowake: task->tk_status = status; } EXPORT_SYMBOL_GPL(rpcb_getport_async); @@ -421,7 +423,7 @@ static void rpcb_getport_done(struct rpc_task *child, void *data) dprintk("RPC: %5u rpcb_getport_done(status %d, port %u)\n", child->tk_pid, status, map->r_port); - rpcb_wake_rpcbind_waiters(xprt, status); + map->r_status = status; } static int rpcb_encode_mapping(struct rpc_rqst *req, __be32 *p, From 166b88d755f925139af7f7b75aa0a1b896ca0670 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 14 Jul 2008 16:03:26 -0400 Subject: [PATCH 62/82] SUNRPC: Use correct XDR encoding procedure for rpcbind SET/UNSET The rpcbind versions 3 and 4 SET and UNSET procedures use the same arguments as the GETADDR procedure. While definitely a bug, this hasn't been a problem so far since the kernel hasn't used version 3 or 4 SET and UNSET. But this will change in just a moment. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 24e93e0a0a22..0021fad464e0 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -426,6 +426,10 @@ static void rpcb_getport_done(struct rpc_task *child, void *data) map->r_status = status; } +/* + * XDR functions for rpcbind + */ + static int rpcb_encode_mapping(struct rpc_rqst *req, __be32 *p, struct rpcbind_args *rpcb) { @@ -581,14 +585,14 @@ static struct rpc_procinfo rpcb_procedures2[] = { }; static struct rpc_procinfo rpcb_procedures3[] = { - PROC(SET, mapping, set), - PROC(UNSET, mapping, set), + PROC(SET, getaddr, set), + PROC(UNSET, getaddr, set), PROC(GETADDR, getaddr, getaddr), }; static struct rpc_procinfo rpcb_procedures4[] = { - PROC(SET, mapping, set), - PROC(UNSET, mapping, set), + PROC(SET, getaddr, set), + PROC(UNSET, getaddr, set), PROC(GETADDR, getaddr, getaddr), PROC(GETVERSADDR, getaddr, getaddr), }; From cc5598b78fd320dd6d1f90c14491e08029f3c4f6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 14 Jul 2008 16:03:27 -0400 Subject: [PATCH 63/82] SUNRPC: Introduce a specific rpcb_create for contacting localhost Add rpcb_create_local() for use by rpcb_register() and upcoming IPv6 registration functions. Ensure any errors encountered by rpcb_create_local() are properly reported. We can also use a statically allocated constant loopback socket address instead of one allocated on the stack and initialized every time the function is called. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 0021fad464e0..35c1ded1fc47 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -116,6 +116,29 @@ static void rpcb_map_release(void *data) kfree(map); } +static const struct sockaddr_in rpcb_inaddr_loopback = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + .sin_port = htons(RPCBIND_PORT), +}; + +static struct rpc_clnt *rpcb_create_local(struct sockaddr *addr, + size_t addrlen, u32 version) +{ + struct rpc_create_args args = { + .protocol = XPRT_TRANSPORT_UDP, + .address = addr, + .addrsize = addrlen, + .servername = "localhost", + .program = &rpcb_program, + .version = version, + .authflavor = RPC_AUTH_UNIX, + .flags = RPC_CLNT_CREATE_NOPING, + }; + + return rpc_create(&args); +} + static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, size_t salen, int proto, u32 version, int privileged) @@ -161,10 +184,6 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, */ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) { - struct sockaddr_in sin = { - .sin_family = AF_INET, - .sin_addr.s_addr = htonl(INADDR_LOOPBACK), - }; struct rpcbind_args map = { .r_prog = prog, .r_vers = vers, @@ -184,14 +203,15 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) "rpcbind\n", (port ? "" : "un"), prog, vers, prot, port); - rpcb_clnt = rpcb_create("localhost", (struct sockaddr *) &sin, - sizeof(sin), XPRT_TRANSPORT_UDP, RPCBVERS_2, 1); - if (IS_ERR(rpcb_clnt)) - return PTR_ERR(rpcb_clnt); + rpcb_clnt = rpcb_create_local((struct sockaddr *)&rpcb_inaddr_loopback, + sizeof(rpcb_inaddr_loopback), + RPCBVERS_2); + if (!IS_ERR(rpcb_clnt)) { + error = rpc_call_sync(rpcb_clnt, &msg, 0); + rpc_shutdown_client(rpcb_clnt); + } else + error = PTR_ERR(rpcb_clnt); - error = rpc_call_sync(rpcb_clnt, &msg, 0); - - rpc_shutdown_client(rpcb_clnt); if (error < 0) printk(KERN_WARNING "RPC: failed to contact local rpcbind " "server (errno %d).\n", -error); From 423d8b064771f5cd8b706a4839b18db9bb6c3c59 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 14 Jul 2008 16:03:28 -0400 Subject: [PATCH 64/82] SUNRPC: None of rpcb_create's callers wants a privileged source port Clean up: Callers that required a privileged source port now use rpcb_create_local(), so we can remove the @privileged argument from rpcb_create(). Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 35c1ded1fc47..691bd216f469 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -140,8 +140,7 @@ static struct rpc_clnt *rpcb_create_local(struct sockaddr *addr, } static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, - size_t salen, int proto, u32 version, - int privileged) + size_t salen, int proto, u32 version) { struct rpc_create_args args = { .protocol = proto, @@ -151,7 +150,8 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, .program = &rpcb_program, .version = version, .authflavor = RPC_AUTH_UNIX, - .flags = RPC_CLNT_CREATE_NOPING, + .flags = (RPC_CLNT_CREATE_NOPING | + RPC_CLNT_CREATE_NONPRIVPORT), }; switch (srvaddr->sa_family) { @@ -165,8 +165,6 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, return NULL; } - if (!privileged) - args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; return rpc_create(&args); } @@ -255,7 +253,7 @@ int rpcb_getport_sync(struct sockaddr_in *sin, u32 prog, u32 vers, int prot) __func__, NIPQUAD(sin->sin_addr.s_addr), prog, vers, prot); rpcb_clnt = rpcb_create(NULL, (struct sockaddr *)sin, - sizeof(*sin), prot, RPCBVERS_2, 0); + sizeof(*sin), prot, RPCBVERS_2); if (IS_ERR(rpcb_clnt)) return PTR_ERR(rpcb_clnt); @@ -365,7 +363,7 @@ void rpcb_getport_async(struct rpc_task *task) task->tk_pid, __func__, bind_version); rpcb_clnt = rpcb_create(clnt->cl_server, sap, salen, xprt->prot, - bind_version, 0); + bind_version); if (IS_ERR(rpcb_clnt)) { status = PTR_ERR(rpcb_clnt); dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n", From babe80eb4994dfdc97d5be19a68b5af66d667585 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 14 Jul 2008 16:03:29 -0400 Subject: [PATCH 65/82] SUNRPC: Refactor rpcb_register to make rpcbindv4 support easier rpcbind version 4 registration will reuse part of rpcb_register, so just split it out into a separate function now. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 48 ++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 691bd216f469..8b75c306e661 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -168,6 +168,30 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, return rpc_create(&args); } +static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, + u32 version, struct rpc_message *msg, + int *result) +{ + struct rpc_clnt *rpcb_clnt; + int error = 0; + + *result = 0; + + rpcb_clnt = rpcb_create_local(addr, addrlen, version); + if (!IS_ERR(rpcb_clnt)) { + error = rpc_call_sync(rpcb_clnt, msg, 0); + rpc_shutdown_client(rpcb_clnt); + } else + error = PTR_ERR(rpcb_clnt); + + if (error < 0) + printk(KERN_WARNING "RPC: failed to contact local rpcbind " + "server (errno %d).\n", -error); + dprintk("RPC: registration status %d/%d\n", error, *result); + + return error; +} + /** * rpcb_register - set or unset a port registration with the local rpcbind svc * @prog: RPC program number to bind @@ -189,33 +213,21 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) .r_port = port, }; struct rpc_message msg = { - .rpc_proc = &rpcb_procedures2[port ? - RPCBPROC_SET : RPCBPROC_UNSET], .rpc_argp = &map, .rpc_resp = okay, }; - struct rpc_clnt *rpcb_clnt; - int error = 0; dprintk("RPC: %sregistering (%u, %u, %d, %u) with local " "rpcbind\n", (port ? "" : "un"), prog, vers, prot, port); - rpcb_clnt = rpcb_create_local((struct sockaddr *)&rpcb_inaddr_loopback, + msg.rpc_proc = &rpcb_procedures2[RPCBPROC_UNSET]; + if (port) + msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET]; + + return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, sizeof(rpcb_inaddr_loopback), - RPCBVERS_2); - if (!IS_ERR(rpcb_clnt)) { - error = rpc_call_sync(rpcb_clnt, &msg, 0); - rpc_shutdown_client(rpcb_clnt); - } else - error = PTR_ERR(rpcb_clnt); - - if (error < 0) - printk(KERN_WARNING "RPC: failed to contact local rpcbind " - "server (errno %d).\n", -error); - dprintk("RPC: registration status %d/%d\n", error, *okay); - - return error; + RPCBVERS_2, &msg, okay); } /** From c2e1b09ff237c0a3687b9a804cc8bf489743cffc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 14 Jul 2008 16:03:30 -0400 Subject: [PATCH 66/82] SUNRPC: Support registering IPv6 interfaces with local rpcbind daemon Introduce a new API to register RPC services on IPv6 interfaces to allow the NFS server and lockd to advertise on IPv6 networks. Unlike rpcb_register(), the new rpcb_v4_register() function uses rpcbind protocol version 4 to contact the local rpcbind daemon. The version 4 SET/UNSET procedures allow services to register address families besides AF_INET, register at specific network interfaces, and register transport protocols besides UDP and TCP. All of this functionality is exposed via the new rpcb_v4_register() kernel API. A user-space rpcbind daemon implementation that supports version 4 of the rpcbind protocol is required in order to make use of this new API. Note that rpcbind version 3 is sufficient to support the new rpcbind facilities listed above, but most extant implementations use version 4. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 3 + net/sunrpc/rpcb_clnt.c | 178 +++++++++++++++++++++++++++++++++++- 2 files changed, 177 insertions(+), 4 deletions(-) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 764fd4c286e0..e5bfe01ee305 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -125,6 +125,9 @@ void rpc_shutdown_client(struct rpc_clnt *); void rpc_release_client(struct rpc_clnt *); int rpcb_register(u32, u32, int, unsigned short, int *); +int rpcb_v4_register(const u32 program, const u32 version, + const struct sockaddr *address, + const char *netid, int *result); int rpcb_getport_sync(struct sockaddr_in *, u32, u32, int); void rpcb_getport_async(struct rpc_task *); diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 8b75c306e661..24db2b4d12d3 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -87,6 +87,7 @@ struct rpcbind_args { static struct rpc_procinfo rpcb_procedures2[]; static struct rpc_procinfo rpcb_procedures3[]; +static struct rpc_procinfo rpcb_procedures4[]; struct rpcb_info { u32 rpc_vers; @@ -122,6 +123,12 @@ static const struct sockaddr_in rpcb_inaddr_loopback = { .sin_port = htons(RPCBIND_PORT), }; +static const struct sockaddr_in6 rpcb_in6addr_loopback = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + .sin6_port = htons(RPCBIND_PORT), +}; + static struct rpc_clnt *rpcb_create_local(struct sockaddr *addr, size_t addrlen, u32 version) { @@ -196,13 +203,38 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, * rpcb_register - set or unset a port registration with the local rpcbind svc * @prog: RPC program number to bind * @vers: RPC version number to bind - * @prot: transport protocol to use to make this request + * @prot: transport protocol to register * @port: port value to register - * @okay: result code + * @okay: OUT: result code * - * port == 0 means unregister, port != 0 means register. + * RPC services invoke this function to advertise their contact + * information via the system's rpcbind daemon. RPC services + * invoke this function once for each [program, version, transport] + * tuple they wish to advertise. * - * This routine supports only rpcbind version 2. + * Callers may also unregister RPC services that are no longer + * available by setting the passed-in port to zero. This removes + * all registered transports for [program, version] from the local + * rpcbind database. + * + * Returns zero if the registration request was dispatched + * successfully and a reply was received. The rpcbind daemon's + * boolean result code is stored in *okay. + * + * Returns an errno value and sets *result to zero if there was + * some problem that prevented the rpcbind request from being + * dispatched, or if the rpcbind daemon did not respond within + * the timeout. + * + * This function uses rpcbind protocol version 2 to contact the + * local rpcbind daemon. + * + * Registration works over both AF_INET and AF_INET6, and services + * registered via this function are advertised as available for any + * address. If the local rpcbind daemon is listening on AF_INET6, + * services registered via this function will be advertised on + * IN6ADDR_ANY (ie available for all AF_INET and AF_INET6 + * addresses). */ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) { @@ -230,6 +262,144 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) RPCBVERS_2, &msg, okay); } +/* + * Fill in AF_INET family-specific arguments to register + */ +static int rpcb_register_netid4(struct sockaddr_in *address_to_register, + struct rpc_message *msg) +{ + struct rpcbind_args *map = msg->rpc_argp; + unsigned short port = ntohs(address_to_register->sin_port); + char buf[32]; + + /* Construct AF_INET universal address */ + snprintf(buf, sizeof(buf), + NIPQUAD_FMT".%u.%u", + NIPQUAD(address_to_register->sin_addr.s_addr), + port >> 8, port & 0xff); + map->r_addr = buf; + + dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " + "local rpcbind\n", (port ? "" : "un"), + map->r_prog, map->r_vers, + map->r_addr, map->r_netid); + + msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; + if (port) + msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; + + return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, + sizeof(rpcb_inaddr_loopback), + RPCBVERS_4, msg, msg->rpc_resp); +} + +/* + * Fill in AF_INET6 family-specific arguments to register + */ +static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, + struct rpc_message *msg) +{ + struct rpcbind_args *map = msg->rpc_argp; + unsigned short port = ntohs(address_to_register->sin6_port); + char buf[64]; + + /* Construct AF_INET6 universal address */ + snprintf(buf, sizeof(buf), + NIP6_FMT".%u.%u", + NIP6(address_to_register->sin6_addr), + port >> 8, port & 0xff); + map->r_addr = buf; + + dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " + "local rpcbind\n", (port ? "" : "un"), + map->r_prog, map->r_vers, + map->r_addr, map->r_netid); + + msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; + if (port) + msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; + + return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback, + sizeof(rpcb_in6addr_loopback), + RPCBVERS_4, msg, msg->rpc_resp); +} + +/** + * rpcb_v4_register - set or unset a port registration with the local rpcbind + * @program: RPC program number of service to (un)register + * @version: RPC version number of service to (un)register + * @address: address family, IP address, and port to (un)register + * @netid: netid of transport protocol to (un)register + * @result: result code from rpcbind RPC call + * + * RPC services invoke this function to advertise their contact + * information via the system's rpcbind daemon. RPC services + * invoke this function once for each [program, version, address, + * netid] tuple they wish to advertise. + * + * Callers may also unregister RPC services that are no longer + * available by setting the port number in the passed-in address + * to zero. Callers pass a netid of "" to unregister all + * transport netids associated with [program, version, address]. + * + * Returns zero if the registration request was dispatched + * successfully and a reply was received. The rpcbind daemon's + * result code is stored in *result. + * + * Returns an errno value and sets *result to zero if there was + * some problem that prevented the rpcbind request from being + * dispatched, or if the rpcbind daemon did not respond within + * the timeout. + * + * This function uses rpcbind protocol version 4 to contact the + * local rpcbind daemon. The local rpcbind daemon must support + * version 4 of the rpcbind protocol in order for these functions + * to register a service successfully. + * + * Supported netids include "udp" and "tcp" for UDP and TCP over + * IPv4, and "udp6" and "tcp6" for UDP and TCP over IPv6, + * respectively. + * + * The contents of @address determine the address family and the + * port to be registered. The usual practice is to pass INADDR_ANY + * as the raw address, but specifying a non-zero address is also + * supported by this API if the caller wishes to advertise an RPC + * service on a specific network interface. + * + * Note that passing in INADDR_ANY does not create the same service + * registration as IN6ADDR_ANY. The former advertises an RPC + * service on any IPv4 address, but not on IPv6. The latter + * advertises the service on all IPv4 and IPv6 addresses. + */ +int rpcb_v4_register(const u32 program, const u32 version, + const struct sockaddr *address, const char *netid, + int *result) +{ + struct rpcbind_args map = { + .r_prog = program, + .r_vers = version, + .r_netid = netid, + .r_owner = RPCB_OWNER_STRING, + }; + struct rpc_message msg = { + .rpc_argp = &map, + .rpc_resp = result, + }; + + *result = 0; + + switch (address->sa_family) { + case AF_INET: + return rpcb_register_netid4((struct sockaddr_in *)address, + &msg); + case AF_INET6: + return rpcb_register_netid6((struct sockaddr_in6 *)address, + &msg); + } + + return -EAFNOSUPPORT; +} + /** * rpcb_getport_sync - obtain the port for an RPC service on a given host * @sin: address of remote peer From d67d1c7bf948341fd8678c8e337ec27f4b46b206 Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Tue, 15 Jul 2008 12:40:22 -0500 Subject: [PATCH 67/82] nfs: set correct fl_len in nlmclnt_test() fcntl(F_GETLK) on an nfs client incorrectly returns the values for the conflicting lock. fl_len value is always 1. If the conflicting lock is (0, 4095) the F_GETLK request for (1024, 10) returns (0, 1), which doesn't even cover the requested range, and is quite confusing. The fix is trivial, set fl_end from the fl_end value recieved from the nfs server. Signed-off-by: Felix Blyakher Signed-off-by: "J. Bruce Fields" Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 5df517b81f3f..fd7d4669776e 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -430,7 +430,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) * Report the conflicting lock back to the application. */ fl->fl_start = req->a_res.lock.fl.fl_start; - fl->fl_end = req->a_res.lock.fl.fl_start; + fl->fl_end = req->a_res.lock.fl.fl_end; fl->fl_type = req->a_res.lock.fl.fl_type; fl->fl_pid = 0; break; From 1b83d707032a1be40a60ed0a9bd841662cc04a5d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:44:04 -0400 Subject: [PATCH 68/82] NFS: Protect inode->i_nlink updates using inode->i_lock Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b1940660502f..d6ec1c85995a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -870,6 +870,14 @@ static int nfs_dentry_delete(struct dentry *dentry) } +static void nfs_drop_nlink(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (inode->i_nlink > 0) + drop_nlink(inode); + spin_unlock(&inode->i_lock); +} + /* * Called when the dentry loses inode. * We use it to clean up silly-renamed files. @@ -1420,7 +1428,7 @@ static int nfs_safe_remove(struct dentry *dentry) error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); /* The VFS may want to delete this inode */ if (error == 0) - drop_nlink(inode); + nfs_drop_nlink(inode); nfs_mark_for_revalidate(inode); } else error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); @@ -1647,7 +1655,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* dentry still busy? */ goto out; } else - drop_nlink(new_inode); + nfs_drop_nlink(new_inode); go_ahead: /* From a3d01454bc58b5a211ef64a7670572a40b71e682 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 12:21:19 -0400 Subject: [PATCH 69/82] NFS: Remove BKL requirement from attribute updates The main problem is dealing with inode->i_size: we need to set the inode->i_lock on all attribute updates, and so vmtruncate won't cut it. Make an NFS-private version of vmtruncate that has the necessary locking semantics. The result should be that the following inode attribute updates are protected by inode->i_lock nfsi->cache_validity nfsi->read_cache_jiffies nfsi->attrtimeo nfsi->attrtimeo_timestamp nfsi->change_attr nfsi->last_updated nfsi->cache_change_attribute nfsi->access_cache nfsi->access_cache_entry_lru nfsi->access_cache_inode_lru nfsi->acl_access nfsi->acl_default nfsi->nfs_page_tree nfsi->ncommit nfsi->npages nfsi->open_files nfsi->silly_list nfsi->acl nfsi->open_states inode->i_size inode->i_atime inode->i_mtime inode->i_ctime inode->i_nlink inode->i_uid inode->i_gid The following is protected by dir->i_mutex nfsi->cookieverf Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 67 +++++++++++++++++++++++++++++++++++++++++++++----- fs/nfs/write.c | 15 +++++++---- 2 files changed, 71 insertions(+), 11 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2c23d067e2a6..3adabd154779 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -388,6 +388,62 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) return error; } +/** + * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * This is a copy of the common vmtruncate, but with the locking + * corrected to take into account the fact that NFS requires + * inode->i_size to be updated under the inode->i_lock. + */ +static int nfs_vmtruncate(struct inode * inode, loff_t offset) +{ + if (i_size_read(inode) < offset) { + unsigned long limit; + + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + spin_lock(&inode->i_lock); + i_size_write(inode, offset); + spin_unlock(&inode->i_lock); + } else { + struct address_space *mapping = inode->i_mapping; + + /* + * truncation of in-use swapfiles is disallowed - it would + * cause subsequent swapout to scribble on the now-freed + * blocks. + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + spin_lock(&inode->i_lock); + i_size_write(inode, offset); + spin_unlock(&inode->i_lock); + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, offset); + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + } + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out_big: + return -EFBIG; +} + /** * nfs_setattr_update_inode - Update inode metadata after a setattr call. * @inode: pointer to struct inode @@ -414,8 +470,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) } if ((attr->ia_valid & ATTR_SIZE) != 0) { nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); - inode->i_size = attr->ia_size; - vmtruncate(inode, attr->ia_size); + nfs_vmtruncate(inode, attr->ia_size); } } @@ -829,9 +884,9 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; } - if (inode->i_size == nfs_size_to_loff_t(fattr->pre_size) && + if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && nfsi->npages == 0) - inode->i_size = nfs_size_to_loff_t(fattr->size); + i_size_write(inode, nfs_size_to_loff_t(fattr->size)); } } @@ -972,7 +1027,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa (fattr->valid & NFS_ATTR_WCC) == 0) { memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); - fattr->pre_size = inode->i_size; + fattr->pre_size = i_size_read(inode); fattr->valid |= NFS_ATTR_WCC; } return nfs_post_op_update_inode(inode, fattr); @@ -1057,7 +1112,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ if (nfsi->npages == 0 || new_isize > cur_isize) { - inode->i_size = new_isize; + i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } dprintk("NFS: isize change on server for file %s/%ld\n", diff --git a/fs/nfs/write.c b/fs/nfs/write.c index feca8c648766..3229e217c773 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -133,16 +133,21 @@ static struct nfs_page *nfs_page_find_request(struct page *page) static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) { struct inode *inode = page->mapping->host; - loff_t end, i_size = i_size_read(inode); - pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + loff_t end, i_size; + pgoff_t end_index; + spin_lock(&inode->i_lock); + i_size = i_size_read(inode); + end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; if (i_size > 0 && page->index < end_index) - return; + goto out; end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); if (i_size >= end) - return; - nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); + goto out; i_size_write(inode, end); + nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); +out: + spin_unlock(&inode->i_lock); } /* A writeback failed: mark the page as bad, and invalidate the page cache */ From fa6dc9dc59c3a76fd209a97c8cf37395980fb903 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 13:26:14 -0400 Subject: [PATCH 70/82] NFS: Remove attribute update related BKL references Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 4 ---- fs/nfs/super.c | 4 ---- 2 files changed, 8 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 3adabd154779..df23f987da6b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -370,7 +370,6 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ~ATTR_FILE) == 0) return 0; - lock_kernel(); /* Write all dirty data */ if (S_ISREG(inode->i_mode)) { filemap_write_and_wait(inode->i_mapping); @@ -384,7 +383,6 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); if (error == 0) nfs_refresh_inode(inode, &fattr); - unlock_kernel(); return error; } @@ -700,7 +698,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) inode->i_sb->s_id, (long long)NFS_FILEID(inode)); nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); - lock_kernel(); if (is_bad_inode(inode)) goto out_nowait; if (NFS_STALE(inode)) @@ -749,7 +746,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) nfs_wake_up_inode(inode); out_nowait: - unlock_kernel(); return status; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 47cf83e917be..1b94e3650f5c 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -374,8 +374,6 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) }; int error; - lock_kernel(); - error = server->nfs_client->rpc_ops->statfs(server, fh, &res); if (error < 0) goto out_err; @@ -407,12 +405,10 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = server->namelen; - unlock_kernel(); return 0; out_err: dprintk("%s: statfs error = %d\n", __func__, -error); - unlock_kernel(); return error; } From 4d80f2ecd506d9732ad94a6da104580bb47680d6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:44:18 -0400 Subject: [PATCH 71/82] NFS: Remove the BKL from the permission checking code Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d6ec1c85995a..73e0f9740dd1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1982,8 +1982,6 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) } force_lookup: - lock_kernel(); - if (!NFS_PROTO(inode)->access) goto out_notsup; @@ -1993,7 +1991,6 @@ force_lookup: put_rpccred(cred); } else res = PTR_ERR(cred); - unlock_kernel(); out: dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); @@ -2002,7 +1999,6 @@ out_notsup: res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res == 0) res = generic_permission(inode, mask, NULL); - unlock_kernel(); goto out; } From b6a2e569e2157509951c9f3f58dfa18b44ce91b3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 13:26:16 -0400 Subject: [PATCH 72/82] NFS: Remove BKL usage from the write path Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 509dcb58959e..f919d9a01a27 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -393,9 +393,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, zero_user_segment(page, pglen, PAGE_CACHE_SIZE); } - lock_kernel(); status = nfs_updatepage(file, page, offset, copied); - unlock_kernel(); unlock_page(page); page_cache_release(page); From bba67e0e3f4caba2b2b90b48ed798fb0461bcb86 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 13:26:23 -0400 Subject: [PATCH 73/82] NFS: Remove BKL usage from open() All the NFSv4 stateful operations are already protected by other locks (in particular by the rpc_sequence locks. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 6 ------ fs/nfs/file.c | 2 -- fs/nfs/nfs4proc.c | 2 -- 3 files changed, 10 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 73e0f9740dd1..c68ec447ace1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -139,10 +139,8 @@ nfs_opendir(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSOPEN); - lock_kernel(); /* Call generic open code in order to cache credentials */ res = nfs_open(inode, filp); - unlock_kernel(); return res; } @@ -1019,9 +1017,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry } /* Open the file on the server */ - lock_kernel(); res = nfs4_atomic_open(dir, dentry, nd); - unlock_kernel(); if (IS_ERR(res)) { error = PTR_ERR(res); switch (error) { @@ -1083,9 +1079,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) * operations that change the directory. We therefore save the * change attribute *before* we do the RPC call. */ - lock_kernel(); ret = nfs4_open_revalidate(dir, dentry, openflags, nd); - unlock_kernel(); out: dput(parent); if (!ret) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f919d9a01a27..9f1bed944b2e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -128,9 +128,7 @@ nfs_file_open(struct inode *inode, struct file *filp) return res; nfs_inc_stats(inode, NFSIOS_VFSOPEN); - lock_kernel(); res = nfs_open(inode, filp); - unlock_kernel(); return res; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4451287a81d1..c910413eaeca 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -451,9 +451,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) /* Save the delegation */ memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); rcu_read_unlock(); - lock_kernel(); ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); - unlock_kernel(); if (ret != 0) goto out; ret = -EAGAIN; From f1e2eda23513b68003202bddf1f84158baad8844 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:50:50 -0400 Subject: [PATCH 74/82] NFS: Remove the BKL from the inode creation operations nfs_instantiate() does not require the BKL, neither do the attribute updates or the RPC code. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c68ec447ace1..d40e91e5c94a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1232,14 +1232,11 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, if ((nd->flags & LOOKUP_CREATE) != 0) open_flags = nd->intent.open.flags; - lock_kernel(); error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); if (error != 0) goto out_err; - unlock_kernel(); return 0; out_err: - unlock_kernel(); d_drop(dentry); return error; } @@ -1262,14 +1259,11 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - lock_kernel(); status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); if (status != 0) goto out_err; - unlock_kernel(); return 0; out_err: - unlock_kernel(); d_drop(dentry); return status; } @@ -1288,15 +1282,12 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) attr.ia_valid = ATTR_MODE; attr.ia_mode = mode | S_IFDIR; - lock_kernel(); error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); if (error != 0) goto out_err; - unlock_kernel(); return 0; out_err: d_drop(dentry); - unlock_kernel(); return error; } From fc81af535e462764e17f638d542973fbef13b026 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:52:40 -0400 Subject: [PATCH 75/82] NFS: Remove the BKL from nfs_link() Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d40e91e5c94a..5ae8ee6b298f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1549,14 +1549,12 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) old_dentry->d_parent->d_name.name, old_dentry->d_name.name, dentry->d_parent->d_name.name, dentry->d_name.name); - lock_kernel(); d_drop(dentry); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { atomic_inc(&inode->i_count); d_add(dentry, inode); } - unlock_kernel(); return error; } From fc0f684c21b5d4b41dc2ec76f7c0897ac98f5b6e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:44:20 -0400 Subject: [PATCH 76/82] NFS: Remove BKL from NFS lookup code All dentry-related operations are already BKL-safe, since they are protected by the VFS locking. No extra locks should be needed in the NFS code. In the case of nfs_revalidate_inode(), we're only doing an attribute update (protected by the inode->i_lock). In the case of nfs_lookup(), we're instantiating a new dentry, so there should be no contention possible until after we call d_materialise_unique. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5ae8ee6b298f..60da7550133c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -777,7 +777,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) struct nfs_fattr fattr; parent = dget_parent(dentry); - lock_kernel(); dir = parent->d_inode; nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = dentry->d_inode; @@ -815,7 +814,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: - unlock_kernel(); dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", __func__, dentry->d_parent->d_name.name, @@ -834,7 +832,6 @@ out_zap_parent: shrink_dcache_parent(dentry); } d_drop(dentry); - unlock_kernel(); dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", __func__, dentry->d_parent->d_name.name, @@ -921,8 +918,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru res = ERR_PTR(-ENOMEM); dentry->d_op = NFS_PROTO(dir)->dentry_ops; - lock_kernel(); - /* * If we're doing an exclusive create, optimize away the lookup * but don't hash the dentry. @@ -930,7 +925,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru if (nfs_is_exclusive_create(dir, nd)) { d_instantiate(dentry, NULL); res = NULL; - goto out_unlock; + goto out; } parent = dentry->d_parent; @@ -958,8 +953,6 @@ no_entry: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_unblock_sillyrename: nfs_unblock_sillyrename(parent); -out_unlock: - unlock_kernel(); out: return res; } From bd9bb454b76fb6ca2d00f17313f9f9df5f5c404a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 16:09:59 -0400 Subject: [PATCH 77/82] NFS: Remove the BKL from the rename, rmdir and unlink operations Attribute updates are safe, and dentry operations are protected using VFS level locks. Defer removing the BKL from sillyrename until a separate patch. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 60da7550133c..68e0688904ea 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1297,14 +1297,12 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry) dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - lock_kernel(); error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); /* Ensure the VFS deletes this inode */ if (error == 0 && dentry->d_inode != NULL) clear_nlink(dentry->d_inode); else if (error == -ENOENT) nfs_dentry_handle_enoent(dentry); - unlock_kernel(); return error; } @@ -1429,7 +1427,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - lock_kernel(); spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (atomic_read(&dentry->d_count) > 1) { @@ -1437,6 +1434,7 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dcache_lock); /* Start asynchronous writeout of the inode */ write_inode_now(dentry->d_inode, 0); + lock_kernel(); error = nfs_sillyrename(dir, dentry); unlock_kernel(); return error; @@ -1452,7 +1450,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); } else if (need_rehash) d_rehash(dentry); - unlock_kernel(); return error; } @@ -1587,7 +1584,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, * To prevent any new references to the target during the rename, * we unhash the dentry and free the inode in advance. */ - lock_kernel(); if (!d_unhashed(new_dentry)) { d_drop(new_dentry); rehash = new_dentry; @@ -1621,7 +1617,9 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; /* silly-rename the existing target ... */ + lock_kernel(); err = nfs_sillyrename(new_dir, new_dentry); + unlock_kernel(); if (!err) { new_dentry = rehash = dentry; new_inode = NULL; @@ -1665,7 +1663,6 @@ out: /* new dentry created? */ if (dentry) dput(dentry); - unlock_kernel(); return error; } From 52e2e8d37e01edf38ccdccc983fb13ec1456d63d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:44:21 -0400 Subject: [PATCH 78/82] NFS: Remove BKL from the sillydelete operations Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 68e0688904ea..1bdc36bf1782 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -884,10 +884,8 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { - lock_kernel(); drop_nlink(inode); nfs_complete_unlink(dentry, inode); - unlock_kernel(); } iput(inode); } @@ -1434,9 +1432,7 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dcache_lock); /* Start asynchronous writeout of the inode */ write_inode_now(dentry->d_inode, 0); - lock_kernel(); error = nfs_sillyrename(dir, dentry); - unlock_kernel(); return error; } if (!d_unhashed(dentry)) { @@ -1617,9 +1613,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; /* silly-rename the existing target ... */ - lock_kernel(); err = nfs_sillyrename(new_dir, new_dentry); - unlock_kernel(); if (!err) { new_dentry = rehash = dentry; new_inode = NULL; From 76566991f94c206d9c5881edcaf99ba72c9e9d61 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:44:22 -0400 Subject: [PATCH 79/82] NFS: Remove BKL from the symlink code Page cache accesses are serialised using page locks, whereas attribute updates are serialised using inode->i_lock. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 1bdc36bf1782..e5f950291928 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1482,13 +1482,9 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym attr.ia_mode = S_IFLNK | S_IRWXUGO; attr.ia_valid = ATTR_MODE; - lock_kernel(); - page = alloc_page(GFP_HIGHUSER); - if (!page) { - unlock_kernel(); + if (!page) return -ENOMEM; - } kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, pathlen); @@ -1503,7 +1499,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym dentry->d_name.name, symname, error); d_drop(dentry); __free_page(page); - unlock_kernel(); return error; } @@ -1521,7 +1516,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym } else __free_page(page); - unlock_kernel(); return 0; } From c3cc8c019ca09767d7c9b5457d5cf8ac65085f44 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:44:23 -0400 Subject: [PATCH 80/82] NFS: Remove BKL from the readdir code Page accesses are serialised using the page locks, whereas all attribute updates are serialised using the inode->i_lock. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e5f950291928..28a238dab23a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -534,8 +534,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) (long long)filp->f_pos); nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); - lock_kernel(); - /* * filp->f_pos points to the dirent entry number. * *desc->dir_cookie has the cookie for the next entry. We have @@ -593,7 +591,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) } out: nfs_unblock_sillyrename(dentry); - unlock_kernel(); if (res > 0) res = 0; dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", From a86dc496b764ebb1431677b38eab45310e5a2ad4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 13:37:09 -0400 Subject: [PATCH 81/82] SUNRPC: Remove the BKL from the callback functions Push it into those callback functions that actually need it. Note that all the NFS operations use their own locking, so don't need the BKL. Ditto for the rpcbind client. Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 6 ++++++ fs/lockd/svc4proc.c | 2 ++ fs/lockd/svclock.c | 7 ++++++- fs/lockd/svcproc.c | 2 ++ net/sunrpc/sched.c | 9 +-------- 5 files changed, 17 insertions(+), 9 deletions(-) diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index fd7d4669776e..1f6dc518505c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -224,7 +224,9 @@ void nlm_release_call(struct nlm_rqst *call) static void nlmclnt_rpc_release(void *data) { + lock_kernel(); nlm_release_call(data); + unlock_kernel(); } static int nlm_wait_on_grace(wait_queue_head_t *queue) @@ -710,7 +712,9 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) die: return; retry_rebind: + lock_kernel(); nlm_rebind_host(req->a_host); + unlock_kernel(); retry_unlock: rpc_restart_call(task); } @@ -788,7 +792,9 @@ retry_cancel: /* Don't ever retry more than 3 times */ if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) goto die; + lock_kernel(); nlm_rebind_host(req->a_host); + unlock_kernel(); rpc_restart_call(task); rpc_delay(task, 30 * HZ); } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 385437e3387d..2e27176ff42f 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -248,7 +248,9 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data) static void nlm4svc_callback_release(void *data) { + lock_kernel(); nlm_release_call(data); + unlock_kernel(); } static const struct rpc_call_ops nlm4svc_callback_ops = { diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 81aca859bfde..56a08ab9a4cb 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -795,6 +795,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) dprintk("lockd: GRANT_MSG RPC callback\n"); + lock_kernel(); /* if the block is not on a list at this point then it has * been invalidated. Don't try to requeue it. * @@ -804,7 +805,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) * for nlm_blocked? */ if (list_empty(&block->b_list)) - return; + goto out; /* Technically, we should down the file semaphore here. Since we * move the block towards the head of the queue only, no harm @@ -818,13 +819,17 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) } nlmsvc_insert_block(block, timeout); svc_wake_up(block->b_daemon); +out: + unlock_kernel(); } static void nlmsvc_grant_release(void *data) { struct nlm_rqst *call = data; + lock_kernel(); nlmsvc_release_block(call->a_block); + unlock_kernel(); } static const struct rpc_call_ops nlmsvc_grant_ops = { diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 88379cc6e0b1..ce6952b50a75 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -278,7 +278,9 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data) static void nlmsvc_callback_release(void *data) { + lock_kernel(); nlm_release_call(data); + unlock_kernel(); } static const struct rpc_call_ops nlmsvc_callback_ops = { diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 6288af05c20f..385f427bedad 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -576,9 +576,7 @@ EXPORT_SYMBOL_GPL(rpc_delay); */ static void rpc_prepare_task(struct rpc_task *task) { - lock_kernel(); task->tk_ops->rpc_call_prepare(task, task->tk_calldata); - unlock_kernel(); } /* @@ -588,9 +586,7 @@ void rpc_exit_task(struct rpc_task *task) { task->tk_action = NULL; if (task->tk_ops->rpc_call_done != NULL) { - lock_kernel(); task->tk_ops->rpc_call_done(task, task->tk_calldata); - unlock_kernel(); if (task->tk_action != NULL) { WARN_ON(RPC_ASSASSINATED(task)); /* Always release the RPC slot and buffer memory */ @@ -602,11 +598,8 @@ EXPORT_SYMBOL_GPL(rpc_exit_task); void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata) { - if (ops->rpc_release != NULL) { - lock_kernel(); + if (ops->rpc_release != NULL) ops->rpc_release(calldata); - unlock_kernel(); - } } /* From f839c4c1991cc9b580ae38f98f54554938a7f49c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:44:26 -0400 Subject: [PATCH 82/82] NFSv4: Remove BKL from the nfsv4 state recovery Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 856a8934f610..401ef8b28f97 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -940,7 +940,6 @@ static int reclaimer(void *ptr) allow_signal(SIGKILL); /* Ensure exclusive access to NFSv4 state */ - lock_kernel(); down_write(&clp->cl_sem); /* Are there any NFS mounts out there? */ if (list_empty(&clp->cl_superblocks)) @@ -1000,7 +999,6 @@ restart_loop: nfs_delegation_reap_unclaimed(clp); out: up_write(&clp->cl_sem); - unlock_kernel(); if (status == -NFS4ERR_CB_PATH_DOWN) nfs_handle_cb_pathdown(clp); nfs4_clear_recover_bit(clp);