From d7d94d48a272fd7583dc3c83acb8f5ed4ef456a4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 16 Jul 2018 12:11:34 -0400 Subject: [PATCH 01/12] nbd: don't requeue the same request twice. We can race with the snd timeout and the per-request timeout and end up requeuing the same request twice. We can't use the send_complete completion to tell if everything is ok because we hold the tx_lock during send, so the timeout stuff will block waiting to mark the socket dead, and we could be marked complete and still requeue. Instead add a flag to the socket so we know whether we've been requeued yet. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 74a05561b620..f8cf7d4cca7f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -112,12 +112,15 @@ struct nbd_device { struct task_struct *task_setup; }; +#define NBD_CMD_REQUEUED 1 + struct nbd_cmd { struct nbd_device *nbd; int index; int cookie; struct completion send_complete; blk_status_t status; + unsigned long flags; }; #if IS_ENABLED(CONFIG_DEBUG_FS) @@ -146,6 +149,14 @@ static inline struct device *nbd_to_dev(struct nbd_device *nbd) return disk_to_dev(nbd->disk); } +static void nbd_requeue_cmd(struct nbd_cmd *cmd) +{ + struct request *req = blk_mq_rq_from_pdu(cmd); + + if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags)) + blk_mq_requeue_request(req, true); +} + static const char *nbdcmd_to_ascii(int cmd) { switch (cmd) { @@ -343,7 +354,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, nbd_mark_nsock_dead(nbd, nsock, 1); mutex_unlock(&nsock->tx_lock); } - blk_mq_requeue_request(req, true); + nbd_requeue_cmd(cmd); nbd_config_put(nbd); return BLK_EH_DONE; } @@ -500,6 +511,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) nsock->pending = req; nsock->sent = sent; } + set_bit(NBD_CMD_REQUEUED, &cmd->flags); return BLK_STS_RESOURCE; } dev_err_ratelimited(disk_to_dev(nbd->disk), @@ -541,6 +553,7 @@ send_pages: */ nsock->pending = req; nsock->sent = sent; + set_bit(NBD_CMD_REQUEUED, &cmd->flags); return BLK_STS_RESOURCE; } dev_err(disk_to_dev(nbd->disk), @@ -805,7 +818,7 @@ again: */ blk_mq_start_request(req); if (unlikely(nsock->pending && nsock->pending != req)) { - blk_mq_requeue_request(req, true); + nbd_requeue_cmd(cmd); ret = 0; goto out; } @@ -818,7 +831,7 @@ again: dev_err_ratelimited(disk_to_dev(nbd->disk), "Request send failed, requeueing\n"); nbd_mark_nsock_dead(nbd, nsock, 1); - blk_mq_requeue_request(req, true); + nbd_requeue_cmd(cmd); ret = 0; } out: @@ -843,6 +856,7 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, * done sending everything over the wire. */ init_completion(&cmd->send_complete); + clear_bit(NBD_CMD_REQUEUED, &cmd->flags); /* We can be called directly from the user space process, which means we * could possibly have signals pending so our sendmsg will fail. In @@ -1460,6 +1474,7 @@ static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq, { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq); cmd->nbd = set->driver_data; + cmd->flags = 0; return 0; } From 8f3ea35929a0806ad1397db99a89ffee0140822a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 16 Jul 2018 12:11:35 -0400 Subject: [PATCH 02/12] nbd: handle unexpected replies better If the server or network is misbehaving and we get an unexpected reply we can sometimes miss the request not being started and wait on a request and never get a response, or even double complete the same request. Fix this by replacing the send_complete completion with just a per command lock. Add a per command cookie as well so that we can know if we're getting a double completion for a previous event. Also check to make sure we dont have REQUEUED set as that means we raced with the timeout handler and need to just let the retry occur. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 75 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f8cf7d4cca7f..3fb95c8d9fd8 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -116,11 +116,12 @@ struct nbd_device { struct nbd_cmd { struct nbd_device *nbd; + struct mutex lock; int index; int cookie; - struct completion send_complete; blk_status_t status; unsigned long flags; + u32 cmd_cookie; }; #if IS_ENABLED(CONFIG_DEBUG_FS) @@ -157,6 +158,27 @@ static void nbd_requeue_cmd(struct nbd_cmd *cmd) blk_mq_requeue_request(req, true); } +#define NBD_COOKIE_BITS 32 + +static u64 nbd_cmd_handle(struct nbd_cmd *cmd) +{ + struct request *req = blk_mq_rq_from_pdu(cmd); + u32 tag = blk_mq_unique_tag(req); + u64 cookie = cmd->cmd_cookie; + + return (cookie << NBD_COOKIE_BITS) | tag; +} + +static u32 nbd_handle_to_tag(u64 handle) +{ + return (u32)handle; +} + +static u32 nbd_handle_to_cookie(u64 handle) +{ + return (u32)(handle >> NBD_COOKIE_BITS); +} + static const char *nbdcmd_to_ascii(int cmd) { switch (cmd) { @@ -330,6 +352,9 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, } config = nbd->config; + if (!mutex_trylock(&cmd->lock)) + return BLK_EH_RESET_TIMER; + if (config->num_connections > 1) { dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out, retrying (%d/%d alive)\n", @@ -354,6 +379,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, nbd_mark_nsock_dead(nbd, nsock, 1); mutex_unlock(&nsock->tx_lock); } + mutex_unlock(&cmd->lock); nbd_requeue_cmd(cmd); nbd_config_put(nbd); return BLK_EH_DONE; @@ -364,6 +390,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, } set_bit(NBD_TIMEDOUT, &config->runtime_flags); cmd->status = BLK_STS_IOERR; + mutex_unlock(&cmd->lock); sock_shutdown(nbd); nbd_config_put(nbd); done: @@ -441,9 +468,9 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) struct iov_iter from; unsigned long size = blk_rq_bytes(req); struct bio *bio; + u64 handle; u32 type; u32 nbd_cmd_flags = 0; - u32 tag = blk_mq_unique_tag(req); int sent = nsock->sent, skip = 0; iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request)); @@ -485,6 +512,8 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) goto send_pages; } iov_iter_advance(&from, sent); + } else { + cmd->cmd_cookie++; } cmd->index = index; cmd->cookie = nsock->cookie; @@ -493,7 +522,8 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); request.len = htonl(size); } - memcpy(request.handle, &tag, sizeof(tag)); + handle = nbd_cmd_handle(cmd); + memcpy(request.handle, &handle, sizeof(handle)); dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n", req, nbdcmd_to_ascii(type), @@ -586,10 +616,12 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) struct nbd_reply reply; struct nbd_cmd *cmd; struct request *req = NULL; + u64 handle; u16 hwq; u32 tag; struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)}; struct iov_iter to; + int ret = 0; reply.magic = 0; iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply)); @@ -607,8 +639,8 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) return ERR_PTR(-EPROTO); } - memcpy(&tag, reply.handle, sizeof(u32)); - + memcpy(&handle, reply.handle, sizeof(handle)); + tag = nbd_handle_to_tag(handle); hwq = blk_mq_unique_tag_to_hwq(tag); if (hwq < nbd->tag_set.nr_hw_queues) req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq], @@ -619,11 +651,25 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) return ERR_PTR(-ENOENT); } cmd = blk_mq_rq_to_pdu(req); + + mutex_lock(&cmd->lock); + if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) { + dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n", + req, cmd->cmd_cookie, nbd_handle_to_cookie(handle)); + ret = -ENOENT; + goto out; + } + if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) { + dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n", + req); + ret = -ENOENT; + goto out; + } if (ntohl(reply.error)) { dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", ntohl(reply.error)); cmd->status = BLK_STS_IOERR; - return cmd; + goto out; } dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req); @@ -648,18 +694,18 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) if (nbd_disconnected(config) || config->num_connections <= 1) { cmd->status = BLK_STS_IOERR; - return cmd; + goto out; } - return ERR_PTR(-EIO); + ret = -EIO; + goto out; } dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n", req, bvec.bv_len); } - } else { - /* See the comment in nbd_queue_rq. */ - wait_for_completion(&cmd->send_complete); } - return cmd; +out: + mutex_unlock(&cmd->lock); + return ret ? ERR_PTR(ret) : cmd; } static void recv_work(struct work_struct *work) @@ -855,7 +901,7 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, * that the server is misbehaving (or there was an error) before we're * done sending everything over the wire. */ - init_completion(&cmd->send_complete); + mutex_lock(&cmd->lock); clear_bit(NBD_CMD_REQUEUED, &cmd->flags); /* We can be called directly from the user space process, which means we @@ -868,7 +914,7 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, ret = BLK_STS_IOERR; else if (!ret) ret = BLK_STS_OK; - complete(&cmd->send_complete); + mutex_unlock(&cmd->lock); return ret; } @@ -1475,6 +1521,7 @@ static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq, struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq); cmd->nbd = set->driver_data; cmd->flags = 0; + mutex_init(&cmd->lock); return 0; } From d082dc1562a2ff0947b214796f12faaa87e816a9 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 16 Jul 2018 14:38:14 -0700 Subject: [PATCH 03/12] nvmet-fc: fix target sgl list on large transfers The existing code to carve up the sg list expected an sg element-per-page which can be very incorrect with iommu's remapping multiple memory pages to fewer bus addresses. To hit this error required a large io payload (greater than 256k) and a system that maps on a per-page basis. It's possible that large ios could get by fine if the system condensed the sgl list into the first 64 elements. This patch corrects the sg list handling by specifically walking the sg list element by element and attempting to divide the transfer up on a per-sg element boundary. While doing so, it still tries to keep sequences under 256k, but will exceed that rule if a single sg element is larger than 256k. Fixes: 48fa362b6c3f ("nvmet-fc: simplify sg list handling") Cc: # 4.14 Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 44 ++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 408279cb6f2c..29b4b236afd8 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -58,8 +58,8 @@ struct nvmet_fc_ls_iod { struct work_struct work; } __aligned(sizeof(unsigned long long)); +/* desired maximum for a single sequence - if sg list allows it */ #define NVMET_FC_MAX_SEQ_LENGTH (256 * 1024) -#define NVMET_FC_MAX_XFR_SGENTS (NVMET_FC_MAX_SEQ_LENGTH / PAGE_SIZE) enum nvmet_fcp_datadir { NVMET_FCP_NODATA, @@ -74,6 +74,7 @@ struct nvmet_fc_fcp_iod { struct nvme_fc_cmd_iu cmdiubuf; struct nvme_fc_ersp_iu rspiubuf; dma_addr_t rspdma; + struct scatterlist *next_sg; struct scatterlist *data_sg; int data_sg_cnt; u32 offset; @@ -1025,8 +1026,7 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo, INIT_LIST_HEAD(&newrec->assoc_list); kref_init(&newrec->ref); ida_init(&newrec->assoc_cnt); - newrec->max_sg_cnt = min_t(u32, NVMET_FC_MAX_XFR_SGENTS, - template->max_sgl_segments); + newrec->max_sg_cnt = template->max_sgl_segments; ret = nvmet_fc_alloc_ls_iodlist(newrec); if (ret) { @@ -1722,6 +1722,7 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod) ((fod->io_dir == NVMET_FCP_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE)); /* note: write from initiator perspective */ + fod->next_sg = fod->data_sg; return 0; @@ -1866,24 +1867,49 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, struct nvmet_fc_fcp_iod *fod, u8 op) { struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; + struct scatterlist *sg = fod->next_sg; unsigned long flags; - u32 tlen; + u32 remaininglen = fod->req.transfer_len - fod->offset; + u32 tlen = 0; int ret; fcpreq->op = op; fcpreq->offset = fod->offset; fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC; - tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE, - (fod->req.transfer_len - fod->offset)); + /* + * for next sequence: + * break at a sg element boundary + * attempt to keep sequence length capped at + * NVMET_FC_MAX_SEQ_LENGTH but allow sequence to + * be longer if a single sg element is larger + * than that amount. This is done to avoid creating + * a new sg list to use for the tgtport api. + */ + fcpreq->sg = sg; + fcpreq->sg_cnt = 0; + while (tlen < remaininglen && + fcpreq->sg_cnt < tgtport->max_sg_cnt && + tlen + sg_dma_len(sg) < NVMET_FC_MAX_SEQ_LENGTH) { + fcpreq->sg_cnt++; + tlen += sg_dma_len(sg); + sg = sg_next(sg); + } + if (tlen < remaininglen && fcpreq->sg_cnt == 0) { + fcpreq->sg_cnt++; + tlen += min_t(u32, sg_dma_len(sg), remaininglen); + sg = sg_next(sg); + } + if (tlen < remaininglen) + fod->next_sg = sg; + else + fod->next_sg = NULL; + fcpreq->transfer_length = tlen; fcpreq->transferred_length = 0; fcpreq->fcp_error = 0; fcpreq->rsplen = 0; - fcpreq->sg = &fod->data_sg[fod->offset / PAGE_SIZE]; - fcpreq->sg_cnt = DIV_ROUND_UP(tlen, PAGE_SIZE); - /* * If the last READDATA request: check if LLDD supports * combined xfr with response. From 6cdefc6e2ad52170f89a8d0e8b1a1339f91834dc Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 20 Jul 2018 15:49:48 -0700 Subject: [PATCH 04/12] nvme: if_ready checks to fail io to deleting controller The revised if_ready checks skipped over the case of returning error when the controller is being deleted. Instead it was returning BUSY, which caused the ios to retry, which caused the ns delete to hang waiting for the ios to drain. Stack trace of hang looks like: kworker/u64:2 D 0 74 2 0x80000000 Workqueue: nvme-delete-wq nvme_delete_ctrl_work [nvme_core] Call Trace: ? __schedule+0x26d/0x820 schedule+0x32/0x80 blk_mq_freeze_queue_wait+0x36/0x80 ? remove_wait_queue+0x60/0x60 blk_cleanup_queue+0x72/0x160 nvme_ns_remove+0x106/0x140 [nvme_core] nvme_remove_namespaces+0x7e/0xa0 [nvme_core] nvme_delete_ctrl_work+0x4d/0x80 [nvme_core] process_one_work+0x160/0x350 worker_thread+0x1c3/0x3d0 kthread+0xf5/0x130 ? process_one_work+0x350/0x350 ? kthread_bind+0x10/0x10 ret_from_fork+0x1f/0x30 Extend nvmf_fail_nonready_command() to supply the controller pointer so that the controller state can be looked at. Fail any io to a controller that is deleting. Fixes: 3bc32bb1186c ("nvme-fabrics: refactor queue ready check") Fixes: 35897b920c8a ("nvme-fabrics: fix and refine state checks in __nvmf_check_ready") Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Tested-by: Ewan D. Milne Reviewed-by: Ewan D. Milne --- drivers/nvme/host/fabrics.c | 10 +++++++--- drivers/nvme/host/fabrics.h | 3 ++- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/target/loop.c | 2 +- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 903eb4545e26..f7efe5a58cc7 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -539,14 +539,18 @@ static struct nvmf_transport_ops *nvmf_lookup_transport( /* * For something we're not in a state to send to the device the default action * is to busy it and retry it after the controller state is recovered. However, - * anything marked for failfast or nvme multipath is immediately failed. + * if the controller is deleting or if anything is marked for failfast or + * nvme multipath it is immediately failed. * * Note: commands used to initialize the controller will be marked for failfast. * Note: nvme cli/ioctl commands are marked for failfast. */ -blk_status_t nvmf_fail_nonready_command(struct request *rq) +blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, + struct request *rq) { - if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) + if (ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DEAD && + !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) return BLK_STS_RESOURCE; nvme_req(rq)->status = NVME_SC_ABORT_REQ; return BLK_STS_IOERR; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index e1818a27aa2d..aa2fdb2a2e8f 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -162,7 +162,8 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops); void nvmf_free_options(struct nvmf_ctrl_options *opts); int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); -blk_status_t nvmf_fail_nonready_command(struct request *rq); +blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, + struct request *rq); bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, bool queue_live); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 41d45a1b5c62..9bac912173ba 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2272,7 +2272,7 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE || !nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) - return nvmf_fail_nonready_command(rq); + return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); ret = nvme_setup_cmd(ns, rq, sqe); if (ret) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 518c5b09038c..66ec5985c9f3 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1639,7 +1639,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, WARN_ON_ONCE(rq->tag < 0); if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) - return nvmf_fail_nonready_command(rq); + return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); dev = queue->device->dev; ib_dma_sync_single_for_cpu(dev, sqe->dma, diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index d8d91f04bd7e..ae7586b8be07 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -162,7 +162,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, blk_status_t ret; if (!nvmf_check_ready(&queue->ctrl->ctrl, req, queue_ready)) - return nvmf_fail_nonready_command(req); + return nvmf_fail_nonready_command(&queue->ctrl->ctrl, req); ret = nvme_setup_cmd(ns, req, &iod->cmd); if (ret) From 0fc09f920983f61be625658c62cc40ac25a7b3a5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 23 Jul 2018 08:37:50 -0600 Subject: [PATCH 05/12] blk-mq: export setting request completion state This is preparing for drivers that want to directly alter the state of their requests. No functional change here. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +--- include/linux/blk-mq.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index d394cdd8d8c6..5291a95ba362 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -558,10 +558,8 @@ static void __blk_mq_complete_request(struct request *rq) bool shared = false; int cpu; - if (cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) != - MQ_RQ_IN_FLIGHT) + if (!blk_mq_mark_complete(rq)) return; - if (rq->internal_tag != -1) blk_mq_sched_completed_request(rq); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e3147eb74222..ca3f2c2edd85 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -287,6 +287,20 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); +/** + * blk_mq_mark_complete() - Set request state to complete + * @rq: request to set to complete state + * + * Returns true if request state was successfully set to complete. If + * successful, the caller is responsibile for seeing this request is ended, as + * blk_mq_complete_request will not work again. + */ +static inline bool blk_mq_mark_complete(struct request *rq) +{ + return cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) == + MQ_RQ_IN_FLIGHT; +} + /* * Driver command data is immediately after the request. So subtract request * size to get back to the original request, add request size to get the PDU. From 065990bd198e0e67417c2c34e5e80140d4b8cef7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 23 Jul 2018 08:37:51 -0600 Subject: [PATCH 06/12] scsi: set timed out out mq requests to complete The scsi block layer requires requests claimed by the error handling be completed by the error handler. A previous commit allowed completions to proceed for blk-mq, breaking that assumption. This patch prevents completions that may race with the timeout handler by marking the state to complete, restoring the previous behavior. Fixes: 12f5b931 ("blk-mq: Remove generation seqeunce") Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/scsi/scsi_error.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 8932ae81a15a..2715cdaa669c 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -296,6 +296,20 @@ enum blk_eh_timer_return scsi_times_out(struct request *req) rtn = host->hostt->eh_timed_out(scmd); if (rtn == BLK_EH_DONE) { + /* + * For blk-mq, we must set the request state to complete now + * before sending the request to the scsi error handler. This + * will prevent a use-after-free in the event the LLD manages + * to complete the request before the error handler finishes + * processing this timed out request. + * + * If the request was already completed, then the LLD beat the + * time out handler from transferring the request to the scsi + * error handler. In that case we can return immediately as no + * further action is required. + */ + if (req->q->mq_ops && !blk_mq_mark_complete(req)) + return rtn; if (scsi_abort_command(scmd) != SUCCESS) { set_host_byte(scmd, DID_TIME_OUT); scsi_eh_scmd_add(scmd); From 5613d31214eb4c5c04cdfce4966bb661c8b43191 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 25 Jul 2018 08:35:17 +0200 Subject: [PATCH 07/12] nvmet: fixup crash on NULL device path When writing an empty string into the device_path attribute the kernel will crash with nvmet: failed to open block device (null): (-22) BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 This patch sanitizes the error handling for invalid device path settings. Fixes: a07b4970 ("nvmet: add a generic NVMe target") Signed-off-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index d3f3b3ec4d1a..ebea1373d1b7 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -282,6 +282,7 @@ static ssize_t nvmet_ns_device_path_store(struct config_item *item, { struct nvmet_ns *ns = to_nvmet_ns(item); struct nvmet_subsys *subsys = ns->subsys; + size_t len; int ret; mutex_lock(&subsys->lock); @@ -289,10 +290,14 @@ static ssize_t nvmet_ns_device_path_store(struct config_item *item, if (ns->enabled) goto out_unlock; - kfree(ns->device_path); + ret = -EINVAL; + len = strcspn(page, "\n"); + if (!len) + goto out_unlock; + kfree(ns->device_path); ret = -ENOMEM; - ns->device_path = kstrndup(page, strcspn(page, "\n"), GFP_KERNEL); + ns->device_path = kstrndup(page, len, GFP_KERNEL); if (!ns->device_path) goto out_unlock; From 405a7519607e7a364114896264440c0f87b325c0 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 25 Jul 2018 08:34:45 +0200 Subject: [PATCH 08/12] nvmet: only check for filebacking on -ENOTBLK We only need to check for a file-backed namespace if nvmet_bdev_ns_enable() returns -ENOTBLK. For any other error it's pointless as the open() error will remain the same. Fixes: d5eff33e ("nvmet: add simple file backed ns support") Signed-off-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 74d4b785d2da..9838103f2d62 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -339,7 +339,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns) goto out_unlock; ret = nvmet_bdev_ns_enable(ns); - if (ret) + if (ret == -ENOTBLK) ret = nvmet_file_ns_enable(ns); if (ret) goto out_unlock; From b403ea2404889e1227812fa9657667a1deb9c694 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 25 Jul 2018 23:15:07 +0200 Subject: [PATCH 09/12] block: bio_iov_iter_get_pages: fix size of last iovec If the last page of the bio is not "full", the length of the last vector slot needs to be corrected. This slot has the index (bio->bi_vcnt - 1), but only in bio->bi_io_vec. In the "bv" helper array, which is shifted by the value of bio->bi_vcnt at function invocation, the correct index is (nr_pages - 1). v2: improved readability following suggestions from Ming Lei. v3: followed a formatting suggestion from Christoph Hellwig. Fixes: 2cefe4dbaadf ("block: add bio_iov_iter_get_pages()") Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Martin Wilck Signed-off-by: Jens Axboe --- block/bio.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/block/bio.c b/block/bio.c index f7e3d88bd0b6..cd55ea6bd47c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -912,16 +912,16 @@ EXPORT_SYMBOL(bio_add_page); */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { - unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; + unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt, idx; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; - size_t offset, diff; + size_t offset; ssize_t size; size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; - nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE; + idx = nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE; /* * Deep magic below: We need to walk the pinned pages backwards @@ -934,17 +934,15 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) bio->bi_iter.bi_size += size; bio->bi_vcnt += nr_pages; - diff = (nr_pages * PAGE_SIZE - offset) - size; - while (nr_pages--) { - bv[nr_pages].bv_page = pages[nr_pages]; - bv[nr_pages].bv_len = PAGE_SIZE; - bv[nr_pages].bv_offset = 0; + while (idx--) { + bv[idx].bv_page = pages[idx]; + bv[idx].bv_len = PAGE_SIZE; + bv[idx].bv_offset = 0; } bv[0].bv_offset += offset; bv[0].bv_len -= offset; - if (diff) - bv[bio->bi_vcnt - 1].bv_len -= diff; + bv[nr_pages - 1].bv_len -= nr_pages * PAGE_SIZE - offset - size; iov_iter_advance(iter, size); return 0; From 9362dd1109f87a9d0a798fbc890cb339c171ed35 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 25 Jul 2018 23:15:08 +0200 Subject: [PATCH 10/12] blkdev: __blkdev_direct_IO_simple: fix leak in error case Fixes: 72ecad22d9f1 ("block: support a full bio worth of IO for simplified bdev direct-io") Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Martin Wilck Signed-off-by: Jens Axboe --- fs/block_dev.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 05e12aea2404..192005376884 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -220,7 +220,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, ret = bio_iov_iter_get_pages(&bio, iter); if (unlikely(ret)) - return ret; + goto out; ret = bio.bi_iter.bi_size; if (iov_iter_rw(iter) == READ) { @@ -249,12 +249,13 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, put_page(bvec->bv_page); } - if (vecs != inline_vecs) - kfree(vecs); - if (unlikely(bio.bi_status)) ret = blk_status_to_errno(bio.bi_status); +out: + if (vecs != inline_vecs) + kfree(vecs); + bio_uninit(&bio); return ret; From 17d51b10d7773e4618bcac64648f30f12d4078fb Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 25 Jul 2018 23:15:09 +0200 Subject: [PATCH 11/12] block: bio_iov_iter_get_pages: pin more pages for multi-segment IOs bio_iov_iter_get_pages() currently only adds pages for the next non-zero segment from the iov_iter to the bio. That's suboptimal for callers, which typically try to pin as many pages as fit into the bio. This patch converts the current bio_iov_iter_get_pages() into a static helper, and introduces a new helper that allocates as many pages as 1) fit into the bio, 2) are present in the iov_iter, 3) and can be pinned by MM. Error is returned only if zero pages could be pinned. Because of 3), a zero return value doesn't necessarily mean all pages have been pinned. Callers that have to pin every page in the iov_iter must still call this function in a loop (this is currently the case). This change matters most for __blkdev_direct_IO_simple(), which calls bio_iov_iter_get_pages() only once. If it obtains less pages than requested, it returns a "short write" or "short read", and __generic_file_write_iter() falls back to buffered writes, which may lead to data corruption. Fixes: 72ecad22d9f1 ("block: support a full bio worth of IO for simplified bdev direct-io") Reviewed-by: Christoph Hellwig Signed-off-by: Martin Wilck Signed-off-by: Jens Axboe --- block/bio.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/block/bio.c b/block/bio.c index cd55ea6bd47c..dc07a427e782 100644 --- a/block/bio.c +++ b/block/bio.c @@ -903,14 +903,16 @@ int bio_add_page(struct bio *bio, struct page *page, EXPORT_SYMBOL(bio_add_page); /** - * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio + * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be mapped * - * Pins as many pages from *iter and appends them to @bio's bvec array. The + * Pins pages from *iter and appends them to @bio's bvec array. The * pages will have to be released using put_page() when done. + * For multi-segment *iter, this function only adds pages from the + * the next non-empty segment of the iov iterator. */ -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt, idx; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; @@ -947,6 +949,33 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) iov_iter_advance(iter, size); return 0; } + +/** + * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio + * @bio: bio to add pages to + * @iter: iov iterator describing the region to be mapped + * + * Pins pages from *iter and appends them to @bio's bvec array. The + * pages will have to be released using put_page() when done. + * The function tries, but does not guarantee, to pin as many pages as + * fit into the bio, or are requested in *iter, whatever is smaller. + * If MM encounters an error pinning the requested pages, it stops. + * Error is returned only if 0 pages could be pinned. + */ +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +{ + unsigned short orig_vcnt = bio->bi_vcnt; + + do { + int ret = __bio_iov_iter_get_pages(bio, iter); + + if (unlikely(ret)) + return bio->bi_vcnt > orig_vcnt ? 0 : ret; + + } while (iov_iter_count(iter) && !bio_full(bio)); + + return 0; +} EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); static void submit_bio_wait_endio(struct bio *bio) From 5151842b9d8732d4cbfa8400b40bff894f501b2f Mon Sep 17 00:00:00 2001 From: Greg Edwards Date: Thu, 26 Jul 2018 14:39:37 -0400 Subject: [PATCH 12/12] block: reset bi_iter.bi_done after splitting bio After the bio has been updated to represent the remaining sectors, reset bi_done so bio_rewind_iter() does not rewind further than it should. This resolves a bio_integrity_process() failure on reads where the original request was split. Fixes: 63573e359d05 ("bio-integrity: Restore original iterator on verify stage") Signed-off-by: Greg Edwards Signed-off-by: Jens Axboe --- block/bio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/bio.c b/block/bio.c index dc07a427e782..05d81912870b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1893,6 +1893,7 @@ struct bio *bio_split(struct bio *bio, int sectors, bio_integrity_trim(split); bio_advance(bio, split->bi_iter.bi_size); + bio->bi_iter.bi_done = 0; if (bio_flagged(bio, BIO_TRACE_COMPLETION)) bio_set_flag(split, BIO_TRACE_COMPLETION);