From 1c8d477a77e2d1d3504419e7f2e02e6422becf9a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 14 Aug 2016 12:47:49 -0400
Subject: [PATCH 01/15] pNFS/flexfiles: Fix layoutstat periodic reporting

Putting the periodicity timer in the mirror instances is causing
non-scalable reporting behaviour and missed reporting intervals.
When you recall layouts and/or implement client side mirroring, it
leads to consecutive reports with only a few ms between RPC calls.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Fixes: d0379a5d066a9 ("pNFS/flexfiles: Support server-supplied...")
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 8 ++++----
 fs/nfs/flexfilelayout/flexfilelayout.h | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index e6206eaf2bdf..ee1c94c7614c 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -37,6 +37,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
 	if (ffl) {
 		INIT_LIST_HEAD(&ffl->error_list);
 		INIT_LIST_HEAD(&ffl->mirrors);
+		ffl->last_report_time = ktime_get();
 		return &ffl->generic_hdr;
 	} else
 		return NULL;
@@ -640,19 +641,18 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
 {
 	static const ktime_t notime = {0};
 	s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
+	struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout);
 
 	nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
 	if (ktime_equal(mirror->start_time, notime))
 		mirror->start_time = now;
-	if (ktime_equal(mirror->last_report_time, notime))
-		mirror->last_report_time = now;
 	if (mirror->report_interval != 0)
 		report_interval = (s64)mirror->report_interval * 1000LL;
 	else if (layoutstats_timer != 0)
 		report_interval = (s64)layoutstats_timer * 1000LL;
-	if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
+	if (ktime_to_ms(ktime_sub(now, ffl->last_report_time)) >=
 			report_interval) {
-		mirror->last_report_time = now;
+		ffl->last_report_time = now;
 		return true;
 	}
 
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 1bcdb15d0c41..3ee0c9fcea76 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -84,7 +84,6 @@ struct nfs4_ff_layout_mirror {
 	struct nfs4_ff_layoutstat	read_stat;
 	struct nfs4_ff_layoutstat	write_stat;
 	ktime_t				start_time;
-	ktime_t				last_report_time;
 	u32				report_interval;
 };
 
@@ -101,6 +100,7 @@ struct nfs4_flexfile_layout {
 	struct pnfs_ds_commit_info commit_info;
 	struct list_head	mirrors;
 	struct list_head	error_list; /* nfs4_ff_layout_ds_err */
+	ktime_t			last_report_time; /* Layoutstat report times */
 };
 
 static inline struct nfs4_flexfile_layout *

From a956beda19a6b39fbc19d0aaf21947acdc18cf74 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 16 Aug 2016 10:26:47 -0400
Subject: [PATCH 02/15] NFS: Allow the mount option retrans=0

We should allow retrans=0 as just meaning that every timeout is a major
timeout, and that there is no increment in the timeout value.

For instance, this means that we would allow TCP users to specify a
flat timeout value of 60s, by specifying "timeo=600,retrans=0" in their
mount option string.

Siged-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/client.c   | 10 +++++-----
 fs/nfs/internal.h |  5 ++++-
 fs/nfs/super.c    | 19 +++++++++++++++++--
 3 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 003ebce4bbc4..1e106780a237 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -426,7 +426,7 @@ EXPORT_SYMBOL_GPL(nfs_mark_client_ready);
  * Initialise the timeout values for a connection
  */
 void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
-				    unsigned int timeo, unsigned int retrans)
+				    int timeo, int retrans)
 {
 	to->to_initval = timeo * HZ / 10;
 	to->to_retries = retrans;
@@ -434,9 +434,9 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
 	switch (proto) {
 	case XPRT_TRANSPORT_TCP:
 	case XPRT_TRANSPORT_RDMA:
-		if (to->to_retries == 0)
+		if (retrans == NFS_UNSPEC_RETRANS)
 			to->to_retries = NFS_DEF_TCP_RETRANS;
-		if (to->to_initval == 0)
+		if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0)
 			to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
 		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
 			to->to_initval = NFS_MAX_TCP_TIMEOUT;
@@ -449,9 +449,9 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
 		to->to_exponential = 0;
 		break;
 	case XPRT_TRANSPORT_UDP:
-		if (to->to_retries == 0)
+		if (retrans == NFS_UNSPEC_RETRANS)
 			to->to_retries = NFS_DEF_UDP_RETRANS;
-		if (!to->to_initval)
+		if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0)
 			to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10;
 		if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
 			to->to_initval = NFS_MAX_UDP_TIMEOUT;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7ce5e023c3c3..74935a19e4bf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -58,6 +58,9 @@ struct nfs_clone_mount {
  */
 #define NFS_UNSPEC_PORT		(-1)
 
+#define NFS_UNSPEC_RETRANS	(UINT_MAX)
+#define NFS_UNSPEC_TIMEO	(UINT_MAX)
+
 /*
  * Maximum number of pages that readdir can use for creating
  * a vmapped array of pages.
@@ -156,7 +159,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
 int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
 void nfs_server_insert_lists(struct nfs_server *);
 void nfs_server_remove_lists(struct nfs_server *);
-void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int);
+void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans);
 int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t,
 		rpc_authflavor_t);
 struct nfs_server *nfs_alloc_server(void);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 18d446e1a82b..d39601381adf 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -923,6 +923,8 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (data) {
+		data->timeo		= NFS_UNSPEC_TIMEO;
+		data->retrans		= NFS_UNSPEC_RETRANS;
 		data->acregmin		= NFS_DEF_ACREGMIN;
 		data->acregmax		= NFS_DEF_ACREGMAX;
 		data->acdirmin		= NFS_DEF_ACDIRMIN;
@@ -1189,6 +1191,19 @@ static int nfs_get_option_ul(substring_t args[], unsigned long *option)
 	return rc;
 }
 
+static int nfs_get_option_ul_bound(substring_t args[], unsigned long *option,
+		unsigned long l_bound, unsigned long u_bound)
+{
+	int ret;
+
+	ret = nfs_get_option_ul(args, option);
+	if (ret != 0)
+		return ret;
+	if (*option < l_bound || *option > u_bound)
+		return -ERANGE;
+	return 0;
+}
+
 /*
  * Error-check and convert a string of mount options from user space into
  * a data structure.  The whole mount string is processed; bad options are
@@ -1352,12 +1367,12 @@ static int nfs_parse_mount_options(char *raw,
 			mnt->bsize = option;
 			break;
 		case Opt_timeo:
-			if (nfs_get_option_ul(args, &option) || option == 0)
+			if (nfs_get_option_ul_bound(args, &option, 1, INT_MAX))
 				goto out_invalid_value;
 			mnt->timeo = option;
 			break;
 		case Opt_retrans:
-			if (nfs_get_option_ul(args, &option) || option == 0)
+			if (nfs_get_option_ul_bound(args, &option, 0, INT_MAX))
 				goto out_invalid_value;
 			mnt->retrans = option;
 			break;

From 15d03055cf39fe61714aeda8d0a722b3137531ed Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 16 Aug 2016 11:08:22 -0400
Subject: [PATCH 03/15] pNFS/flexfiles: Set reasonable default retrans values
 for the data channel

Prior to this patch, the retrans value was set at 5, meaning that we
could see a maximum retransmission timeout value of more than 6 minutes.
That's a tad high for NFSv3 where the protocol does allow the server to
drop requests at any time.

Since this is a data channel, let's just set retrans to 0, and the default
timeout to 60s. The user can continue to adjust these defaults using the
dataserver_retrans and dataserver_timeo module parameters.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayoutdev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 0aa36be71fce..970efba05ae1 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -17,8 +17,8 @@
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS_LD
 
-static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
-static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
+static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS;
+static unsigned int dataserver_retrans;
 
 void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
 {

From 9a0fe86745b8e95f7ea39933a956f5771332c430 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 19 Aug 2016 15:33:12 -0400
Subject: [PATCH 04/15] pNFS: Handle NFS4ERR_OLD_STATEID correctly in
 LAYOUTSTAT calls

We normally want to update the stateid and then retry,

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs42proc.c | 34 +++++++++++++++++++++++++++++-----
 fs/nfs/pnfs.c      |  1 -
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 6f4752734804..64b43b4ad9dd 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -318,10 +318,22 @@ static void
 nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs42_layoutstat_data *data = calldata;
-	struct nfs_server *server = NFS_SERVER(data->args.inode);
+	struct inode *inode = data->inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct pnfs_layout_hdr *lo;
 
+	spin_lock(&inode->i_lock);
+	lo = NFS_I(inode)->layout;
+	if (!pnfs_layout_is_valid(lo)) {
+		spin_unlock(&inode->i_lock);
+		rpc_exit(task, 0);
+		return;
+	}
+	nfs4_stateid_copy(&data->args.stateid, &lo->plh_stateid);
+	spin_unlock(&inode->i_lock);
 	nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args,
 			     &data->res.seq_res, task);
+
 }
 
 static void
@@ -341,11 +353,11 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
 	case -NFS4ERR_ADMIN_REVOKED:
 	case -NFS4ERR_DELEG_REVOKED:
 	case -NFS4ERR_STALE_STATEID:
-	case -NFS4ERR_OLD_STATEID:
 	case -NFS4ERR_BAD_STATEID:
 		spin_lock(&inode->i_lock);
 		lo = NFS_I(inode)->layout;
-		if (lo && nfs4_stateid_match(&data->args.stateid,
+		if (pnfs_layout_is_valid(lo) &&
+		    nfs4_stateid_match(&data->args.stateid,
 					     &lo->plh_stateid)) {
 			LIST_HEAD(head);
 
@@ -359,11 +371,23 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
 		} else
 			spin_unlock(&inode->i_lock);
 		break;
+	case -NFS4ERR_OLD_STATEID:
+		spin_lock(&inode->i_lock);
+		lo = NFS_I(inode)->layout;
+		if (pnfs_layout_is_valid(lo) &&
+		    nfs4_stateid_match_other(&data->args.stateid,
+					&lo->plh_stateid)) {
+			/* Do we need to delay before resending? */
+			if (!nfs4_stateid_is_newer(&lo->plh_stateid,
+						&data->args.stateid))
+				rpc_delay(task, HZ);
+			rpc_restart_call_prepare(task);
+		}
+		spin_unlock(&inode->i_lock);
+		break;
 	case -ENOTSUPP:
 	case -EOPNOTSUPP:
 		NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
-	default:
-		break;
 	}
 
 	dprintk("%s server returns %d\n", __func__, task->tk_status);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 70806cae0d36..bf98f1b2595f 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2510,7 +2510,6 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
 
 	data->args.fh = NFS_FH(inode);
 	data->args.inode = inode;
-	nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid);
 	status = ld->prepare_layoutstats(&data->args);
 	if (status)
 		goto out_free;

From b88fa69eaa8649f11828158c7b65c4bcd886ebd5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 23 Aug 2016 11:19:33 -0400
Subject: [PATCH 05/15] pNFS: The client must not do I/O to the DS if it's
 lease has expired

Ensure that the client conforms to the normative behaviour described in
RFC5661 Section 12.7.2: "If a client believes its lease has expired,
it MUST NOT send I/O to the storage device until it has validated its
lease."

So ensure that we wait for the lease to be validated before using
the layout.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: stable@vger.kernel.org # v3.20+
---
 fs/nfs/pnfs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index bf98f1b2595f..6daf034645c8 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1555,6 +1555,7 @@ pnfs_update_layout(struct inode *ino,
 	}
 
 lookup_again:
+	nfs4_client_recover_expired_lease(clp);
 	first = false;
 	spin_lock(&ino->i_lock);
 	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);

From 41963c10c47a35185e68cb9049f7a3493c94d2d7 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Mon, 22 Aug 2016 14:11:16 -0400
Subject: [PATCH 06/15] pnfs/blocklayout: update last_write_offset atomically
 with extents

Block/SCSI layout write completion may add committable extents to the
extent tree before updating the layout's last-written byte under the inode
lock.  If a sync happens before this value is updated, then
prepare_layoutcommit may find and encode these extents which would produce
a LAYOUTCOMMIT request whose encoded extents are larger than the request's
loca_length.

Fix this by using a last-written byte value that is updated atomically with
the extent tree so that commitable extents always match.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c |  2 +-
 fs/nfs/blocklayout/blocklayout.h |  3 ++-
 fs/nfs/blocklayout/extent_tree.c | 10 +++++++---
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index f55a4e756047..217847679f0e 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -346,7 +346,7 @@ static void bl_write_cleanup(struct work_struct *work)
 			PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
 
 		ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
-					(end - start) >> SECTOR_SHIFT);
+					(end - start) >> SECTOR_SHIFT, end);
 	}
 
 	pnfs_ld_write_done(hdr);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 18e6fd0b9506..efc007f00742 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -141,6 +141,7 @@ struct pnfs_block_layout {
 	struct rb_root		bl_ext_ro;
 	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
 	bool			bl_scsi_layout;
+	u64			bl_lwb;
 };
 
 static inline struct pnfs_block_layout *
@@ -182,7 +183,7 @@ int ext_tree_insert(struct pnfs_block_layout *bl,
 int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
 		sector_t end);
 int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
-		sector_t len);
+		sector_t len, u64 lwb);
 bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
 		struct pnfs_block_extent *ret, bool rw);
 int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 992bcb19c11e..c85fbfd2d0d9 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -402,7 +402,7 @@ ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
 
 int
 ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
-		sector_t len)
+		sector_t len, u64 lwb)
 {
 	struct rb_root *root = &bl->bl_ext_rw;
 	sector_t end = start + len;
@@ -471,6 +471,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
 		}
 	}
 out:
+	if (bl->bl_lwb < lwb)
+		bl->bl_lwb = lwb;
 	spin_unlock(&bl->bl_ext_lock);
 
 	__ext_put_deviceids(&tmp);
@@ -518,7 +520,7 @@ static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
 }
 
 static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
-		size_t buffer_size, size_t *count)
+		size_t buffer_size, size_t *count, __u64 *lastbyte)
 {
 	struct pnfs_block_extent *be;
 	int ret = 0;
@@ -542,6 +544,8 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
 			p = encode_block_extent(be, p);
 		be->be_tag = EXTENT_COMMITTING;
 	}
+	*lastbyte = bl->bl_lwb - 1;
+	bl->bl_lwb = 0;
 	spin_unlock(&bl->bl_ext_lock);
 
 	return ret;
@@ -564,7 +568,7 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
 	arg->layoutupdate_pages = &arg->layoutupdate_page;
 
 retry:
-	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
+	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
 	if (unlikely(ret)) {
 		ext_tree_free_commitdata(arg, buffer_size);
 

From 16590a228109e2f318d2cc6466221134cfab723a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 22 Aug 2016 14:57:42 -0400
Subject: [PATCH 07/15] SUNRPC: Silence WARN_ON when NFSv4.1 over RDMA is in
 use

Using NFSv4.1 on RDMA should be safe, so broaden the new checks in
rpc_create().

WARN_ON_ONCE is used, matching most other WARN call sites in clnt.c.

Fixes: 39a9beab5acb ("rpc: share one xps between all backchannels")
Fixes: d50039ea5ee6 ("nfsd4/rpc: move backchannel create logic...")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: J. Bruce Fields <bfields@fieldses.org>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/clnt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 7f79fb7dc6a0..66f23b376fa0 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -453,7 +453,7 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
 	struct rpc_xprt_switch *xps;
 
 	if (args->bc_xprt && args->bc_xprt->xpt_bc_xps) {
-		WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP);
+		WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC));
 		xps = args->bc_xprt->xpt_bc_xps;
 		xprt_switch_get(xps);
 	} else {
@@ -520,7 +520,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 	char servername[48];
 
 	if (args->bc_xprt) {
-		WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP);
+		WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC));
 		xprt = args->bc_xprt->xpt_bc_xprt;
 		if (xprt) {
 			xprt_get(xprt);

From e09c978aae5bedfdb379be80363b024b7d82638b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sat, 27 Aug 2016 23:44:04 -0400
Subject: [PATCH 08/15] NFSv4.1: Fix Oopsable condition in server callback
 races

The slot table hasn't been an array since v3.7. Ensure that we
use nfs4_lookup_slot() to access the slot correctly.

Fixes: 87dda67e7386 ("NFSv4.1: Allow SEQUENCE to resize the slot table...")
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: stable@vger.kernel.org # v3.8+
---
 fs/nfs/callback_proc.c |  5 +----
 fs/nfs/nfs4session.c   | 33 +++++++++++++++++++++++++++++++++
 fs/nfs/nfs4session.h   |  1 +
 3 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c92a75e066a6..a4cf6d2c14a4 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -454,11 +454,8 @@ static bool referring_call_exists(struct nfs_client *clp,
 				((u32 *)&rclist->rcl_sessionid.data)[3],
 				ref->rc_sequenceid, ref->rc_slotid);
 
-			spin_lock(&tbl->slot_tbl_lock);
-			status = (test_bit(ref->rc_slotid, tbl->used_slots) &&
-				  tbl->slots[ref->rc_slotid].seq_nr ==
+			status = nfs4_slot_seqid_in_use(tbl, ref->rc_slotid,
 					ref->rc_sequenceid);
-			spin_unlock(&tbl->slot_tbl_lock);
 			if (status)
 				goto out;
 		}
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index 332d06e64fa9..c1f4c208f38a 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -172,6 +172,39 @@ struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid)
 	return ERR_PTR(-E2BIG);
 }
 
+static int nfs4_slot_get_seqid(struct nfs4_slot_table  *tbl, u32 slotid,
+		u32 *seq_nr)
+	__must_hold(&tbl->slot_tbl_lock)
+{
+	struct nfs4_slot *slot;
+
+	slot = nfs4_lookup_slot(tbl, slotid);
+	if (IS_ERR(slot))
+		return PTR_ERR(slot);
+	*seq_nr = slot->seq_nr;
+	return 0;
+}
+
+/*
+ * nfs4_slot_seqid_in_use - test if a slot sequence id is still in use
+ *
+ * Given a slot table, slot id and sequence number, determine if the
+ * RPC call in question is still in flight. This function is mainly
+ * intended for use by the callback channel.
+ */
+bool nfs4_slot_seqid_in_use(struct nfs4_slot_table *tbl, u32 slotid, u32 seq_nr)
+{
+	u32 cur_seq;
+	bool ret = false;
+
+	spin_lock(&tbl->slot_tbl_lock);
+	if (nfs4_slot_get_seqid(tbl, slotid, &cur_seq) == 0 &&
+	    cur_seq == seq_nr && test_bit(slotid, tbl->used_slots))
+		ret = true;
+	spin_unlock(&tbl->slot_tbl_lock);
+	return ret;
+}
+
 /*
  * nfs4_alloc_slot - efficiently look for a free slot
  *
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 5b51298d1d03..33cace62b50b 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -78,6 +78,7 @@ extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
 extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);
 extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
 extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid);
+extern bool nfs4_slot_seqid_in_use(struct nfs4_slot_table  *tbl, u32 slotid, u32 seq_nr);
 extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
 extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
 extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);

From 045d2a6d076a2ecd7043ea543ea198af943f8b16 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 28 Aug 2016 13:25:43 -0400
Subject: [PATCH 09/15] NFSv4.1: Delay callback processing when there are
 referring triples

If CB_SEQUENCE tells us that the processing of this request depends on
the completion of one or more referring triples (see RFC 5661 Section
2.10.6.3), delay the callback processing until after the RPC requests
being referred to have completed.
If we end up delaying for more than 1/2 second, then fall back to
returning NFS4ERR_DELAY in reply to the callback.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback_proc.c |  4 ++--
 fs/nfs/nfs4proc.c      |  2 ++
 fs/nfs/nfs4session.c   | 22 +++++++++++++++++++++-
 fs/nfs/nfs4session.h   |  5 ++++-
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index a4cf6d2c14a4..c35932967722 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -454,8 +454,8 @@ static bool referring_call_exists(struct nfs_client *clp,
 				((u32 *)&rclist->rcl_sessionid.data)[3],
 				ref->rc_sequenceid, ref->rc_slotid);
 
-			status = nfs4_slot_seqid_in_use(tbl, ref->rc_slotid,
-					ref->rc_sequenceid);
+			status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid,
+					ref->rc_sequenceid, HZ >> 1) < 0;
 			if (status)
 				goto out;
 		}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1949bbd806eb..0cc0c319cfdd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -686,6 +686,8 @@ out_unlock:
 	res->sr_slot = NULL;
 	if (send_new_highest_used_slotid)
 		nfs41_notify_server(session->clp);
+	if (waitqueue_active(&tbl->slot_waitq))
+		wake_up_all(&tbl->slot_waitq);
 }
 
 int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index c1f4c208f38a..b62973045a3e 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -28,6 +28,7 @@ static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue)
 	tbl->highest_used_slotid = NFS4_NO_SLOT;
 	spin_lock_init(&tbl->slot_tbl_lock);
 	rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue);
+	init_waitqueue_head(&tbl->slot_waitq);
 	init_completion(&tbl->complete);
 }
 
@@ -192,7 +193,8 @@ static int nfs4_slot_get_seqid(struct nfs4_slot_table  *tbl, u32 slotid,
  * RPC call in question is still in flight. This function is mainly
  * intended for use by the callback channel.
  */
-bool nfs4_slot_seqid_in_use(struct nfs4_slot_table *tbl, u32 slotid, u32 seq_nr)
+static bool nfs4_slot_seqid_in_use(struct nfs4_slot_table *tbl,
+		u32 slotid, u32 seq_nr)
 {
 	u32 cur_seq;
 	bool ret = false;
@@ -205,6 +207,24 @@ bool nfs4_slot_seqid_in_use(struct nfs4_slot_table *tbl, u32 slotid, u32 seq_nr)
 	return ret;
 }
 
+/*
+ * nfs4_slot_wait_on_seqid - wait until a slot sequence id is complete
+ *
+ * Given a slot table, slot id and sequence number, wait until the
+ * corresponding RPC call completes. This function is mainly
+ * intended for use by the callback channel.
+ */
+int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl,
+		u32 slotid, u32 seq_nr,
+		unsigned long timeout)
+{
+	if (wait_event_timeout(tbl->slot_waitq,
+			!nfs4_slot_seqid_in_use(tbl, slotid, seq_nr),
+			timeout) == 0)
+		return -ETIMEDOUT;
+	return 0;
+}
+
 /*
  * nfs4_alloc_slot - efficiently look for a free slot
  *
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 33cace62b50b..fa75d7db3db3 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -36,6 +36,7 @@ struct nfs4_slot_table {
 	unsigned long   used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */
 	spinlock_t	slot_tbl_lock;
 	struct rpc_wait_queue	slot_tbl_waitq;	/* allocators may wait here */
+	wait_queue_head_t	slot_waitq;	/* Completion wait on slot */
 	u32		max_slots;		/* # slots in table */
 	u32		max_slotid;		/* Max allowed slotid value */
 	u32		highest_used_slotid;	/* sent to server on each SEQ.
@@ -78,7 +79,9 @@ extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
 extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);
 extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
 extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid);
-extern bool nfs4_slot_seqid_in_use(struct nfs4_slot_table  *tbl, u32 slotid, u32 seq_nr);
+extern int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl,
+		u32 slotid, u32 seq_nr,
+		unsigned long timeout);
 extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
 extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
 extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);

From 07e8dcbda71ef87e9cbdc42b5bb16a44c1ab839b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 28 Aug 2016 10:28:25 -0400
Subject: [PATCH 10/15] NFSv4.1: Defer bumping the slot sequence number until
 we free the slot

For operations like OPEN or LAYOUTGET, which return recallable state
(i.e. delegations and layouts) we want to enable the mechanism for
resolving recall races in RFC5661 Section 2.10.6.3.
To do so, we will want to defer bumping the slot's sequence number until
we have finished processing the RPC results.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c    | 9 +++++++--
 fs/nfs/nfs4session.h | 3 ++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0cc0c319cfdd..de4a89d3d740 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -666,6 +666,11 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
 	tbl = slot->table;
 	session = tbl->session;
 
+	/* Bump the slot sequence number */
+	if (slot->seq_done)
+		slot->seq_nr++;
+	slot->seq_done = 0;
+
 	spin_lock(&tbl->slot_tbl_lock);
 	/* Be nice to the server: try to ensure that the last transmitted
 	 * value for highest_user_slotid <= target_highest_slotid
@@ -716,7 +721,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
 	switch (res->sr_status) {
 	case 0:
 		/* Update the slot's sequence and clientid lease timer */
-		++slot->seq_nr;
+		slot->seq_done = 1;
 		clp = session->clp;
 		do_renew_lease(clp, res->sr_timestamp);
 		/* Check sequence flags */
@@ -771,7 +776,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
 		goto retry_nowait;
 	default:
 		/* Just update the slot sequence no. */
-		++slot->seq_nr;
+		slot->seq_done = 1;
 	}
 out:
 	/* The session may be reset by one of the error handlers. */
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index fa75d7db3db3..f703b755351b 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -21,7 +21,8 @@ struct nfs4_slot {
 	unsigned long		generation;
 	u32			slot_nr;
 	u32		 	seq_nr;
-	unsigned int		interrupted : 1;
+	unsigned int		interrupted : 1,
+				seq_done : 1;
 };
 
 /* Sessions */

From 2e80dbe7ac51a911e8a828407b1a48c5ba938cd2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 28 Aug 2016 11:50:26 -0400
Subject: [PATCH 11/15] NFSv4.1: Close callback races for OPEN, LAYOUTGET and
 LAYOUTRETURN

Defer freeing the slot until after we have processed the results from
OPEN and LAYOUTGET. This means that the server can rely on the
mechanism in RFC5661 Section 2.10.6.3 to ensure that replies to an
OPEN or LAYOUTGET/RETURN RPC call don't race with the callbacks that
apply to them.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 78 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 65 insertions(+), 13 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index de4a89d3d740..f5aecaabcb7c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -634,15 +634,11 @@ out_sleep:
 }
 EXPORT_SYMBOL_GPL(nfs40_setup_sequence);
 
-static int nfs40_sequence_done(struct rpc_task *task,
-			       struct nfs4_sequence_res *res)
+static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res)
 {
 	struct nfs4_slot *slot = res->sr_slot;
 	struct nfs4_slot_table *tbl;
 
-	if (slot == NULL)
-		goto out;
-
 	tbl = slot->table;
 	spin_lock(&tbl->slot_tbl_lock);
 	if (!nfs41_wake_and_assign_slot(tbl, slot))
@@ -650,7 +646,13 @@ static int nfs40_sequence_done(struct rpc_task *task,
 	spin_unlock(&tbl->slot_tbl_lock);
 
 	res->sr_slot = NULL;
-out:
+}
+
+static int nfs40_sequence_done(struct rpc_task *task,
+			       struct nfs4_sequence_res *res)
+{
+	if (res->sr_slot != NULL)
+		nfs40_sequence_free_slot(res);
 	return 1;
 }
 
@@ -695,7 +697,8 @@ out_unlock:
 		wake_up_all(&tbl->slot_waitq);
 }
 
-int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
+static int nfs41_sequence_process(struct rpc_task *task,
+		struct nfs4_sequence_res *res)
 {
 	struct nfs4_session *session;
 	struct nfs4_slot *slot = res->sr_slot;
@@ -781,11 +784,11 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
 out:
 	/* The session may be reset by one of the error handlers. */
 	dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
-	nfs41_sequence_free_slot(res);
 out_noaction:
 	return ret;
 retry_nowait:
 	if (rpc_restart_call_prepare(task)) {
+		nfs41_sequence_free_slot(res);
 		task->tk_status = 0;
 		ret = 0;
 	}
@@ -796,8 +799,37 @@ out_retry:
 	rpc_delay(task, NFS4_POLL_RETRY_MAX);
 	return 0;
 }
+
+int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+	if (!nfs41_sequence_process(task, res))
+		return 0;
+	if (res->sr_slot != NULL)
+		nfs41_sequence_free_slot(res);
+	return 1;
+
+}
 EXPORT_SYMBOL_GPL(nfs41_sequence_done);
 
+static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+	if (res->sr_slot == NULL)
+		return 1;
+	if (res->sr_slot->table->session != NULL)
+		return nfs41_sequence_process(task, res);
+	return nfs40_sequence_done(task, res);
+}
+
+static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+	if (res->sr_slot != NULL) {
+		if (res->sr_slot->table->session != NULL)
+			nfs41_sequence_free_slot(res);
+		else
+			nfs40_sequence_free_slot(res);
+	}
+}
+
 int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
 {
 	if (res->sr_slot == NULL)
@@ -927,6 +959,17 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
 				    args, res, task);
 }
 
+static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+	return nfs40_sequence_done(task, res);
+}
+
+static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+	if (res->sr_slot != NULL)
+		nfs40_sequence_free_slot(res);
+}
+
 int nfs4_sequence_done(struct rpc_task *task,
 		       struct nfs4_sequence_res *res)
 {
@@ -1204,6 +1247,7 @@ static void nfs4_opendata_free(struct kref *kref)
 	struct super_block *sb = p->dentry->d_sb;
 
 	nfs_free_seqid(p->o_arg.seqid);
+	nfs4_sequence_free_slot(&p->o_res.seq_res);
 	if (p->state != NULL)
 		nfs4_put_open_state(p->state);
 	nfs4_put_state_owner(p->owner);
@@ -1663,9 +1707,14 @@ err:
 static struct nfs4_state *
 nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 {
+	struct nfs4_state *ret;
+
 	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS)
-		return _nfs4_opendata_reclaim_to_nfs4_state(data);
-	return _nfs4_opendata_to_nfs4_state(data);
+		ret =_nfs4_opendata_reclaim_to_nfs4_state(data);
+	else
+		ret = _nfs4_opendata_to_nfs4_state(data);
+	nfs4_sequence_free_slot(&data->o_res.seq_res);
+	return ret;
 }
 
 static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state)
@@ -2063,7 +2112,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
 
 	data->rpc_status = task->tk_status;
 
-	if (!nfs4_sequence_done(task, &data->o_res.seq_res))
+	if (!nfs4_sequence_process(task, &data->o_res.seq_res))
 		return;
 
 	if (task->tk_status == 0) {
@@ -7871,7 +7920,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 	struct nfs4_layoutget *lgp = calldata;
 
 	dprintk("--> %s\n", __func__);
-	nfs41_sequence_done(task, &lgp->res.seq_res);
+	nfs41_sequence_process(task, &lgp->res.seq_res);
 	dprintk("<-- %s\n", __func__);
 }
 
@@ -8087,6 +8136,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags)
 	/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
 	if (status == 0 && lgp->res.layoutp->len)
 		lseg = pnfs_layout_process(lgp);
+	nfs4_sequence_free_slot(&lgp->res.seq_res);
 	rpc_put_task(task);
 	dprintk("<-- %s status=%d\n", __func__, status);
 	if (status)
@@ -8113,7 +8163,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 
 	dprintk("--> %s\n", __func__);
 
-	if (!nfs41_sequence_done(task, &lrp->res.seq_res))
+	if (!nfs41_sequence_process(task, &lrp->res.seq_res))
 		return;
 
 	server = NFS_SERVER(lrp->args.inode);
@@ -8125,6 +8175,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 	case -NFS4ERR_DELAY:
 		if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
 			break;
+		nfs4_sequence_free_slot(&lrp->res.seq_res);
 		rpc_restart_call_prepare(task);
 		return;
 	}
@@ -8145,6 +8196,7 @@ static void nfs4_layoutreturn_release(void *calldata)
 		pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
 	pnfs_clear_layoutreturn_waitbit(lo);
 	spin_unlock(&lo->plh_inode->i_lock);
+	nfs4_sequence_free_slot(&lrp->res.seq_res);
 	pnfs_free_lseg_list(&freeme);
 	pnfs_put_layout_hdr(lrp->args.layout);
 	nfs_iput_and_deactive(lrp->inode);

From d138027a8256a3e9d7657c8d0dae84c08ef2cfe1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 28 Aug 2016 12:19:04 -0400
Subject: [PATCH 12/15] NFSv4.1: Remove obsolete and incorrrect assignment in
 nfs4_callback_sequence

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback_proc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c35932967722..f953ef6b2f2e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -484,7 +484,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 		goto out;
 
 	tbl = &clp->cl_session->bc_slot_table;
-	slot = tbl->slots + args->csa_slotid;
 
 	/* Set up res before grabbing the spinlock */
 	memcpy(&res->csr_sessionid, &args->csa_sessionid,

From 3dc147359e3dcdf0648f1e2c11f62cfae3160df0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 29 Aug 2016 15:12:54 -0400
Subject: [PATCH 13/15] pNFS/flexfiles: Fix an Oopsable condition when
 connection to the DS fails

If the attempt to connect to a DS fails inside ff_layout_pg_init_read or
ff_layout_pg_init_write, then we currently end up clearing the layout
segment carried by the struct nfs_pageio_descriptor, causing an Oops
when we later call into ff_layout_read_pagelist/ff_layout_write_pagelist.

The fix is to ensure we return the layout and then retry.

Fixes: 446ca2195303 ("pNFS/flexfiles: When initing reads or writes, we...")
Cc: stable@vger.kernel.org # v4.7+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c    | 37 +++++++++++------------
 fs/nfs/flexfilelayout/flexfilelayoutdev.c | 19 ++++++------
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index ee1c94c7614c..51b51369704c 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -806,11 +806,14 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
 {
 	struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
 	struct nfs4_pnfs_ds *ds;
+	bool fail_return = false;
 	int idx;
 
 	/* mirrors are sorted by efficiency */
 	for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
-		ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
+		if (idx+1 == fls->mirror_array_cnt)
+			fail_return = true;
+		ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return);
 		if (ds) {
 			*best_idx = idx;
 			return ds;
@@ -859,6 +862,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 	struct nfs4_pnfs_ds *ds;
 	int ds_idx;
 
+retry:
 	/* Use full layout for now */
 	if (!pgio->pg_lseg)
 		ff_layout_pg_get_read(pgio, req, false);
@@ -871,10 +875,13 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 
 	ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
 	if (!ds) {
-		if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
-			goto out_pnfs;
-		else
+		if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
 			goto out_mds;
+		pnfs_put_lseg(pgio->pg_lseg);
+		pgio->pg_lseg = NULL;
+		/* Sleep for 1 second before retrying */
+		ssleep(1);
+		goto retry;
 	}
 
 	mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
@@ -890,12 +897,6 @@ out_mds:
 	pnfs_put_lseg(pgio->pg_lseg);
 	pgio->pg_lseg = NULL;
 	nfs_pageio_reset_read_mds(pgio);
-	return;
-
-out_pnfs:
-	pnfs_set_lo_fail(pgio->pg_lseg);
-	pnfs_put_lseg(pgio->pg_lseg);
-	pgio->pg_lseg = NULL;
 }
 
 static void
@@ -909,6 +910,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 	int i;
 	int status;
 
+retry:
 	if (!pgio->pg_lseg) {
 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 						   req->wb_context,
@@ -940,10 +942,13 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 	for (i = 0; i < pgio->pg_mirror_count; i++) {
 		ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
 		if (!ds) {
-			if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
-				goto out_pnfs;
-			else
+			if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
 				goto out_mds;
+			pnfs_put_lseg(pgio->pg_lseg);
+			pgio->pg_lseg = NULL;
+			/* Sleep for 1 second before retrying */
+			ssleep(1);
+			goto retry;
 		}
 		pgm = &pgio->pg_mirrors[i];
 		mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
@@ -956,12 +961,6 @@ out_mds:
 	pnfs_put_lseg(pgio->pg_lseg);
 	pgio->pg_lseg = NULL;
 	nfs_pageio_reset_write_mds(pgio);
-	return;
-
-out_pnfs:
-	pnfs_set_lo_fail(pgio->pg_lseg);
-	pnfs_put_lseg(pgio->pg_lseg);
-	pgio->pg_lseg = NULL;
 }
 
 static unsigned int
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 970efba05ae1..f7a3f6b05369 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -379,7 +379,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
 
 	devid = &mirror->mirror_ds->id_node;
 	if (ff_layout_test_devid_unavailable(devid))
-		goto out;
+		goto out_fail;
 
 	ds = mirror->mirror_ds->ds;
 	/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
@@ -405,15 +405,16 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
 			mirror->mirror_ds->ds_versions[0].rsize = max_payload;
 		if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
 			mirror->mirror_ds->ds_versions[0].wsize = max_payload;
-	} else {
-		ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
-					 mirror, lseg->pls_range.offset,
-					 lseg->pls_range.length, NFS4ERR_NXIO,
-					 OP_ILLEGAL, GFP_NOIO);
-		if (fail_return || !ff_layout_has_available_ds(lseg))
-			pnfs_error_mark_layout_for_return(ino, lseg);
-		ds = NULL;
+		goto out;
 	}
+	ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+				 mirror, lseg->pls_range.offset,
+				 lseg->pls_range.length, NFS4ERR_NXIO,
+				 OP_ILLEGAL, GFP_NOIO);
+out_fail:
+	if (fail_return || !ff_layout_has_available_ds(lseg))
+		pnfs_error_mark_layout_for_return(ino, lseg);
+	ds = NULL;
 out:
 	return ds;
 }

From 52442f9b11b7e5d4a38d99143011831fd171f8d9 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Tue, 30 Aug 2016 09:20:32 -0400
Subject: [PATCH 14/15] NFS4: Avoid migration loops

If a server returns itself as a location while migrating, the client may
end up getting stuck attempting to migrate twice to the same server.  Catch
this by checking if the nfs_client found is the same as the existing
client.  For the other two callers to nfs4_set_client, the nfs_client will
always be ERR_PTR(-EINVAL).

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4client.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 8d7d08d4f95f..cd3b7cfdde16 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -817,6 +817,11 @@ static int nfs4_set_client(struct nfs_server *server,
 		goto error;
 	}
 
+	if (server->nfs_client == clp) {
+		error = -ELOOP;
+		goto error;
+	}
+
 	/*
 	 * Query for the lease time on clientid setup or renewal
 	 *

From 98b0f80c2396224bbbed81792b526e6c72ba9efa Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 29 Aug 2016 11:15:36 -0400
Subject: [PATCH 15/15] NFSv4.x: Fix a refcount leak in nfs_callback_up_net

On error, the callers expect us to return without bumping
nn->cb_users[].

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: stable@vger.kernel.org # v3.7+
---
 fs/nfs/callback.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index a7f2e6e33305..52a28311e2a4 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -275,6 +275,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
 err_socks:
 	svc_rpcb_cleanup(serv, net);
 err_bind:
+	nn->cb_users[minorversion]--;
 	dprintk("NFS: Couldn't create callback socket: err = %d; "
 			"net = %p\n", ret, net);
 	return ret;