From 2acc5cae292355f5f18ad377a2a966e7f03c8fec Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 15 Jun 2020 09:20:52 -0400 Subject: [PATCH 01/10] xprtrdma: Prevent dereferencing r_xprt->rx_ep after it is freed r_xprt->rx_ep is known to be good while the transport's send lock is held. Otherwise additional references on rx_ep must be held when it is used outside of that lock's critical sections. For now, bump the rx_ep reference count once whenever there is at least one outstanding Receive WR. This avoids the memory bandwidth overhead of taking and releasing the reference count for every ib_post_recv() and Receive completion. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 2ae348377806..b021baa4b28d 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -84,7 +84,8 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep); static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); -static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep); +static void rpcrdma_ep_get(struct rpcrdma_ep *ep); +static int rpcrdma_ep_put(struct rpcrdma_ep *ep); static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, gfp_t flags); @@ -97,7 +98,8 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); */ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) { - struct rdma_cm_id *id = r_xprt->rx_ep->re_id; + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct rdma_cm_id *id = ep->re_id; /* Flush Receives, then wait for deferred Reply work * to complete. @@ -108,6 +110,8 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) * local invalidations. */ ib_drain_sq(id->qp); + + rpcrdma_ep_put(ep); } /** @@ -266,7 +270,7 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) xprt_force_disconnect(xprt); goto disconnected; case RDMA_CM_EVENT_ESTABLISHED: - kref_get(&ep->re_kref); + rpcrdma_ep_get(ep); ep->re_connect_status = 1; rpcrdma_update_cm_private(ep, &event->param.conn); trace_xprtrdma_inline_thresh(ep); @@ -289,7 +293,7 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ep->re_connect_status = -ECONNABORTED; disconnected: xprt_force_disconnect(xprt); - return rpcrdma_ep_destroy(ep); + return rpcrdma_ep_put(ep); default: break; } @@ -345,7 +349,7 @@ out: return ERR_PTR(rc); } -static void rpcrdma_ep_put(struct kref *kref) +static void rpcrdma_ep_destroy(struct kref *kref) { struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref); @@ -369,13 +373,18 @@ static void rpcrdma_ep_put(struct kref *kref) module_put(THIS_MODULE); } +static noinline void rpcrdma_ep_get(struct rpcrdma_ep *ep) +{ + kref_get(&ep->re_kref); +} + /* Returns: * %0 if @ep still has a positive kref count, or * %1 if @ep was destroyed successfully. 
*/ -static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep) +static noinline int rpcrdma_ep_put(struct rpcrdma_ep *ep) { - return kref_put(&ep->re_kref, rpcrdma_ep_put); + return kref_put(&ep->re_kref, rpcrdma_ep_destroy); } static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) @@ -492,7 +501,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) return 0; out_destroy: - rpcrdma_ep_destroy(ep); + rpcrdma_ep_put(ep); rdma_destroy_id(id); out_free: kfree(ep); @@ -521,8 +530,12 @@ retry: ep->re_connect_status = 0; xprt_clear_connected(xprt); - rpcrdma_reset_cwnd(r_xprt); + + /* Bump the ep's reference count while there are + * outstanding Receives. + */ + rpcrdma_ep_get(ep); rpcrdma_post_recvs(r_xprt, true); rc = rpcrdma_sendctxs_create(r_xprt); @@ -587,7 +600,7 @@ void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) rpcrdma_mrs_destroy(r_xprt); rpcrdma_sendctxs_destroy(r_xprt); - if (rpcrdma_ep_destroy(ep)) + if (rpcrdma_ep_put(ep)) rdma_destroy_id(id); r_xprt->rx_ep = NULL; From 2d97f4637666704953dfbb10322c2b73bb53d5e7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 15 Jun 2020 09:20:57 -0400 Subject: [PATCH 02/10] xprtrdma: Use re_connect_status safely in rpcrdma_xprt_connect() Clean up: Sometimes creating a fresh rpcrdma_ep can fail. That's why xprt_rdma_connect() always checks if the r_xprt->rx_ep pointer is valid before dereferencing it. Instead, xprt_rdma_connect() can simply check rpcrdma_xprt_connect()'s return value. Also, there's no need to set re_connect_status to zero just after the rpcrdma_ep is created, since it is allocated with kzalloc. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 0c4af7f5e241..14165b673b20 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -242,7 +242,7 @@ xprt_rdma_connect_worker(struct work_struct *work) rc = rpcrdma_xprt_connect(r_xprt); xprt_clear_connecting(xprt); - if (r_xprt->rx_ep && r_xprt->rx_ep->re_connect_status > 0) { + if (!rc) { xprt->connect_cookie++; xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b021baa4b28d..b172e43cb204 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -528,7 +528,6 @@ retry: return rc; ep = r_xprt->rx_ep; - ep->re_connect_status = 0; xprt_clear_connected(xprt); rpcrdma_reset_cwnd(r_xprt); @@ -565,8 +564,6 @@ retry: rpcrdma_mrs_create(r_xprt); out: - if (rc) - ep->re_connect_status = rc; trace_xprtrdma_connect(r_xprt, rc); return rc; } From f423f755f41e4944fb4cd1c259cbf2ba3608d647 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 15 Jun 2020 09:21:02 -0400 Subject: [PATCH 03/10] xprtrdma: Clean up synopsis of rpcrdma_flush_disconnect() Refactor: Pass struct rpcrdma_xprt instead of an IB layer object. 
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 8 ++++---- net/sunrpc/xprtrdma/verbs.c | 12 ++++++------ net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index ef997880e17a..b647562a26dd 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -367,7 +367,7 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) trace_xprtrdma_wc_fastreg(wc, frwr); /* The MR will get recycled when the associated req is retransmitted */ - rpcrdma_flush_disconnect(cq, wc); + rpcrdma_flush_disconnect(cq->cq_context, wc); } /** @@ -452,7 +452,7 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) trace_xprtrdma_wc_li(wc, frwr); __frwr_release_mr(wc, mr); - rpcrdma_flush_disconnect(cq, wc); + rpcrdma_flush_disconnect(cq->cq_context, wc); } /** @@ -474,7 +474,7 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) __frwr_release_mr(wc, mr); complete(&frwr->fr_linv_done); - rpcrdma_flush_disconnect(cq, wc); + rpcrdma_flush_disconnect(cq->cq_context, wc); } /** @@ -582,7 +582,7 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) smp_rmb(); rpcrdma_complete_rqst(rep); - rpcrdma_flush_disconnect(cq, wc); + rpcrdma_flush_disconnect(cq->cq_context, wc); } /** diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b172e43cb204..7a112612fc8f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -132,14 +132,13 @@ static void rpcrdma_qp_event_handler(struct ib_event *event, void *context) /** * rpcrdma_flush_disconnect - Disconnect on flushed completion - * @cq: completion queue + * @r_xprt: transport to disconnect * @wc: work completion entry * * Must be called in process context. 
*/ -void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc) +void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc) { - struct rpcrdma_xprt *r_xprt = cq->cq_context; struct rpc_xprt *xprt = &r_xprt->rx_xprt; if (wc->status != IB_WC_SUCCESS && @@ -160,11 +159,12 @@ static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_sendctx *sc = container_of(cqe, struct rpcrdma_sendctx, sc_cqe); + struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_send(sc, wc); - rpcrdma_sendctx_put_locked((struct rpcrdma_xprt *)cq->cq_context, sc); - rpcrdma_flush_disconnect(cq, wc); + rpcrdma_sendctx_put_locked(r_xprt, sc); + rpcrdma_flush_disconnect(r_xprt, wc); } /** @@ -199,7 +199,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) return; out_flushed: - rpcrdma_flush_disconnect(cq, wc); + rpcrdma_flush_disconnect(r_xprt, wc); rpcrdma_rep_destroy(rep); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 0a16fdb09b2c..098d05a62ead 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -446,7 +446,7 @@ extern unsigned int xprt_rdma_memreg_strategy; /* * Endpoint calls - xprtrdma/verbs.c */ -void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc); +void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc); int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); From c487eb7d8e41579d87216ce43152acd336f2c4aa Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 15 Jun 2020 09:21:07 -0400 Subject: [PATCH 04/10] xprtrdma: Clean up disconnect 1. Ensure that only rpcrdma_cm_event_handler() modifies ep->re_connect_status to avoid racy changes to that field. 2. Ensure that xprt_force_disconnect() is invoked only once as a transport is closed or destroyed. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 23 +++++++++++++---------- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 7a112612fc8f..2198c8ec8dff 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -130,6 +130,16 @@ static void rpcrdma_qp_event_handler(struct ib_event *event, void *context) trace_xprtrdma_qp_event(ep, event); } +/* Ensure xprt_force_disconnect() is invoked exactly once when a + * connection is closed or lost. (The important thing is it needs + * to be invoked "at least" once). 
+ */ +static void rpcrdma_force_disconnect(struct rpcrdma_ep *ep) +{ + if (atomic_add_unless(&ep->re_force_disconnect, 1, 1)) + xprt_force_disconnect(ep->re_xprt); +} + /** * rpcrdma_flush_disconnect - Disconnect on flushed completion * @r_xprt: transport to disconnect @@ -139,13 +149,8 @@ static void rpcrdma_qp_event_handler(struct ib_event *event, void *context) */ void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc) { - struct rpc_xprt *xprt = &r_xprt->rx_xprt; - - if (wc->status != IB_WC_SUCCESS && - r_xprt->rx_ep->re_connect_status == 1) { - r_xprt->rx_ep->re_connect_status = -ECONNABORTED; - xprt_force_disconnect(xprt); - } + if (wc->status != IB_WC_SUCCESS) + rpcrdma_force_disconnect(r_xprt->rx_ep); } /** @@ -243,7 +248,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr; struct rpcrdma_ep *ep = id->context; - struct rpc_xprt *xprt = ep->re_xprt; might_sleep(); @@ -267,7 +271,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) /* fall through */ case RDMA_CM_EVENT_ADDR_CHANGE: ep->re_connect_status = -ENODEV; - xprt_force_disconnect(xprt); goto disconnected; case RDMA_CM_EVENT_ESTABLISHED: rpcrdma_ep_get(ep); @@ -292,7 +295,7 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DISCONNECTED: ep->re_connect_status = -ECONNABORTED; disconnected: - xprt_force_disconnect(xprt); + rpcrdma_force_disconnect(ep); return rpcrdma_ep_put(ep); default: break; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 098d05a62ead..43974ef39a50 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -82,6 +82,7 @@ struct rpcrdma_ep { unsigned int re_max_inline_recv; int re_async_rc; int re_connect_status; + atomic_t re_force_disconnect; struct ib_qp_init_attr re_attr; wait_queue_head_t re_connect_wait; struct rpc_xprt *re_xprt; From 7b2182ec381f8ea15c7eb1266d6b5d7da620ad93 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 15 Jun 2020 09:21:13 -0400 Subject: [PATCH 05/10] xprtrdma: Fix handling of RDMA_ERROR replies The RPC client currently doesn't handle ERR_CHUNK replies correctly. rpcrdma_complete_rqst() incorrectly passes a negative number to xprt_complete_rqst() as the number of bytes copied. Instead, set task->tk_status to the error value, and return zero bytes copied. In these cases, return -EIO rather than -EREMOTEIO. The RPC client's finite state machine doesn't know what to do with -EREMOTEIO. Additional clean ups: - Don't double-count RDMA_ERROR replies - Remove a stale comment Signed-off-by: Chuck Lever Cc: Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 2081c8fbfa48..935bbef2f7be 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1349,8 +1349,7 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, be32_to_cpup(p), be32_to_cpu(rep->rr_xid)); } - r_xprt->rx_stats.bad_reply_count++; - return -EREMOTEIO; + return -EIO; } /* Perform XID lookup, reconstruction of the RPC reply, and @@ -1387,13 +1386,11 @@ out: spin_unlock(&xprt->queue_lock); return; -/* If the incoming reply terminated a pending RPC, the next - * RPC call will post a replacement receive buffer as it is - * being marshaled. 
- */
 out_badheader:
 	trace_xprtrdma_reply_hdr(rep);
 	r_xprt->rx_stats.bad_reply_count++;
+	rqst->rq_task->tk_status = status;
+	status = 0;
 	goto out;
 }

From b7ade38165ca0001c5a3bd5314a314abbbfbb1b7 Mon Sep 17 00:00:00 2001
From: Vasily Averin
Date: Mon, 1 Jun 2020 11:54:57 +0300
Subject: [PATCH 06/10] sunrpc: fixed rollback in rpc_gssd_dummy_populate()

__rpc_depopulate(gssd_dentry) was lost on the error path.

cc: stable@vger.kernel.org
Fixes: commit 4b9a445e3eeb ("sunrpc: create a new dummy pipe for gssd to hold open")
Signed-off-by: Vasily Averin
Reviewed-by: Jeff Layton
Signed-off-by: Anna Schumaker
---
 net/sunrpc/rpc_pipe.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 39e14d5edaf1..e9d0953522f0 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1317,6 +1317,7 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data)
 	q.len = strlen(gssd_dummy_clnt_dir[0].name);
 	clnt_dentry = d_hash_and_lookup(gssd_dentry, &q);
 	if (!clnt_dentry) {
+		__rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1);
 		pipe_dentry = ERR_PTR(-ENOENT);
 		goto out;
 	}

From 4659ed7cc8514369043053463514408ca16ad6f3 Mon Sep 17 00:00:00 2001
From: Tom Rix
Date: Fri, 12 Jun 2020 15:45:49 -0700
Subject: [PATCH 07/10] nfs: Fix memory leak of export_path

The try_location function is called within a loop by nfs_follow_referral.
try_location calls nfs4_pathname_string to create the export_path, and
nfs4_pathname_string allocates the memory. export_path is stored in the
nfs_fs_context/fs_context structure in the same way as hostname and source.
But whereas the ctx hostname and source are freed before assignment,
export_path is not. So if the loop runs more than once, the new export_path
overwrites the old one without the old one being freed.

So call kfree for export_path.

Signed-off-by: Tom Rix
Signed-off-by: Anna Schumaker
---
 fs/nfs/nfs4namespace.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index a3ab6e219061..873342308dc0 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -308,6 +308,7 @@ static int try_location(struct fs_context *fc,
 	if (IS_ERR(export_path))
 		return PTR_ERR(export_path);
 
+	kfree(ctx->nfs_server.export_path);
 	ctx->nfs_server.export_path = export_path;
 
 	source = kmalloc(len + 1 + ctx->nfs_server.export_path_len + 1,

From 8b04013737341442ed914b336cde866b902664ae Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Mon, 22 Jun 2020 15:04:15 -0400
Subject: [PATCH 08/10] pNFS/flexfiles: Fix list corruption if the mirror count changes

If the mirror count changes in the new layout that we pick up inside
ff_layout_pg_init_write(), then we can end up adding the request to the
wrong mirror and corrupting the mirror->pg_list.
Fixes: d600ad1f2bdb ("NFS41: pop some layoutget errors to application")
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust
Signed-off-by: Anna Schumaker
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 7d399f72ebbb..de03e440b7ee 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -907,9 +907,8 @@ retry:
 		goto out_mds;
 	/* Use a direct mapping of ds_idx to pgio mirror_idx */
-	if (WARN_ON_ONCE(pgio->pg_mirror_count !=
-			 FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
-		goto out_mds;
+	if (pgio->pg_mirror_count != FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))
+		goto out_eagain;
 	for (i = 0; i < pgio->pg_mirror_count; i++) {
 		mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
@@ -931,7 +930,10 @@ retry:
 		    (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
 			pgio->pg_maxretrans = io_maxretrans;
 	return;
-
+out_eagain:
+	pnfs_generic_pg_cleanup(pgio);
+	pgio->pg_error = -EAGAIN;
+	return;
 out_mds:
 	trace_pnfs_mds_fallback_pg_init_write(pgio->pg_inode,
 			0, NFS4_MAX_UINT64, IOMODE_RW,
@@ -941,6 +943,7 @@ out_mds:
 	pgio->pg_lseg = NULL;
 	pgio->pg_maxretrans = 0;
 	nfs_pageio_reset_write_mds(pgio);
+	pgio->pg_error = -EAGAIN;
 }

From d03727b248d0dae6199569a8d7b629a681154633 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia
Date: Wed, 24 Jun 2020 13:54:08 -0400
Subject: [PATCH 09/10] NFSv4 fix CLOSE not waiting for direct IO completion

Figuring out the root cause for the REMOVE/CLOSE race and suggesting
the solution was done by Neil Brown.

Currently what happens is that direct IO calls hold a reference on the
open context, which is decremented as an asynchronous task in
nfs_direct_complete(). Before the reference is decremented, control is
returned to the application, which is free to close the file. When the
close is being processed, it decrements its reference on the
open_context, but since direct IO still holds one, it doesn't send a
close on the wire. It returns control to the application, which is free
to do other operations. For instance, it can delete a file. Direct IO
finally releases its reference and triggers an asynchronous close,
which races with the REMOVE. On the server, the REMOVE can be processed
before the CLOSE, failing the REMOVE with EACCES as the file is still
open.

Signed-off-by: Olga Kornievskaia
Suggested-by: Neil Brown
CC: stable@vger.kernel.org
Signed-off-by: Anna Schumaker
---
 fs/nfs/direct.c | 13 +++++++++----
 fs/nfs/file.c   |  1 +
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 1b79dd5cf661..3d113cf8908a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -267,8 +267,6 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
 {
 	struct inode *inode = dreq->inode;
 
-	inode_dio_end(inode);
-
 	if (dreq->iocb) {
 		long res = (long) dreq->error;
 		if (dreq->count != 0) {
@@ -280,7 +278,10 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
 
 	complete(&dreq->completion);
 
+	igrab(inode);
 	nfs_direct_req_release(dreq);
+	inode_dio_end(inode);
+	iput(inode);
 }
 
 static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
@@ -410,8 +411,10 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 	 * generic layer handle the completion.
 	 */
 	if (requested_bytes == 0) {
-		inode_dio_end(inode);
+		igrab(inode);
 		nfs_direct_req_release(dreq);
+		inode_dio_end(inode);
+		iput(inode);
 		return result < 0 ?
			result : -EIO;
 	}
 
@@ -864,8 +867,10 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 	 * generic layer handle the completion.
 	 */
 	if (requested_bytes == 0) {
-		inode_dio_end(inode);
+		igrab(inode);
 		nfs_direct_req_release(dreq);
+		inode_dio_end(inode);
+		iput(inode);
 		return result < 0 ?
			result : -EIO;
 	}

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f96367a2463e..ccd6c1637b27 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -83,6 +83,7 @@ nfs_file_release(struct inode *inode, struct file *filp)
 	dprintk("NFS: release(%pD2)\n", filp);
 
 	nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
+	inode_dio_wait(inode);
 	nfs_file_clear_open_context(filp);
 	return 0;
 }

From 89a3c9f5b9f0bcaa9aea3e8b2a616fcaea9aad78 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Thu, 25 Jun 2020 11:32:34 -0400
Subject: [PATCH 10/10] SUNRPC: Properly set the @subbuf parameter of xdr_buf_subsegment()

@subbuf is an output parameter of xdr_buf_subsegment(). A survey of
call sites shows that @subbuf is always uninitialized before
xdr_buf_subsegment() is invoked by callers.

There are some execution paths through xdr_buf_subsegment() that do
not set all of the fields in @subbuf, leaving some pointer fields
containing garbage addresses. Subsequent processing of that buffer
then results in a page fault.

Signed-off-by: Chuck Lever
Cc: 
Signed-off-by: Anna Schumaker
---
 net/sunrpc/xdr.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 6f7d82fb1eb0..be11d672b5b9 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1118,6 +1118,7 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
 		base = 0;
 	} else {
 		base -= buf->head[0].iov_len;
+		subbuf->head[0].iov_base = buf->head[0].iov_base;
 		subbuf->head[0].iov_len = 0;
 	}
 
@@ -1130,6 +1131,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
 		base = 0;
 	} else {
 		base -= buf->page_len;
+		subbuf->pages = buf->pages;
+		subbuf->page_base = 0;
 		subbuf->page_len = 0;
 	}
 
@@ -1141,6 +1144,7 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
 		base = 0;
 	} else {
 		base -= buf->tail[0].iov_len;
+		subbuf->tail[0].iov_base = buf->tail[0].iov_base;
 		subbuf->tail[0].iov_len = 0;
 	}
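
For context, the hazard PATCH 10/10 describes can be reproduced outside the kernel. The sketch below is illustrative only -- struct span, subsegment_buggy(), and subsegment_fixed() are made-up names, not kernel interfaces -- and shows, under those assumptions, how an output struct that is filled on some paths but not others hands the caller a garbage pointer, and why assigning every field on every path (as the patch does for @subbuf) avoids the page fault.

```c
/* Stand-alone illustration only -- not from the kernel tree. */
#include <stdio.h>
#include <string.h>

struct span {
	char *base;	/* pointer into the parent buffer */
	size_t len;	/* length of the subrange */
};

/* Buggy variant: sets len on every path, but base on only one. */
static void subsegment_buggy(char *buf, size_t buflen,
			     size_t offset, struct span *out)
{
	if (offset < buflen) {
		out->base = buf + offset;
		out->len = buflen - offset;
	} else {
		out->len = 0;	/* out->base is left uninitialized */
	}
}

/* Fixed variant: every field is assigned on every path. */
static void subsegment_fixed(char *buf, size_t buflen,
			     size_t offset, struct span *out)
{
	if (offset < buflen) {
		out->base = buf + offset;
		out->len = buflen - offset;
	} else {
		out->base = buf;	/* valid pointer even though len is 0 */
		out->len = 0;
	}
}

int main(void)
{
	char data[] = "hello, world";
	struct span s;	/* uninitialized on purpose, like @subbuf at call sites */

	subsegment_fixed(data, strlen(data), 64, &s);
	printf("len=%zu base=%p\n", s.len, (void *)s.base);

	/* With subsegment_buggy() instead, s.base would be stack garbage
	 * here, and dereferencing it could fault exactly as the patch
	 * description explains.
	 */
	return 0;
}
```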