From 2e13456fb3d3e5f462dc1e59efc04eb62df35566 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 21 Jul 2017 10:48:13 -0400 Subject: [PATCH 01/16] nbd: allow multiple disconnects to be sent There's no reason to limit ourselves to one disconnect message per socket. Sometimes networks do strange things, might as well let sysadmins hit the panic button as much as they want. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 87a0a29f6e7e..f91e7ac3fa32 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -991,9 +991,8 @@ static int nbd_disconnect(struct nbd_device *nbd) struct nbd_config *config = nbd->config; dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); - if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED, - &config->runtime_flags)) - send_disconnects(nbd); + set_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags); + send_disconnects(nbd); return 0; } From b4b2aeccf0f0fa1fd356cc72eeda7a2c66c39904 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 21 Jul 2017 10:48:14 -0400 Subject: [PATCH 02/16] nbd: take tx_lock before disconnecting We need to take the tx_lock so we don't interleave our disconnect request between real data going down the wire. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f91e7ac3fa32..6aefe9fca6ce 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -978,11 +978,15 @@ static void send_disconnects(struct nbd_device *nbd) int i, ret; for (i = 0; i < config->num_connections; i++) { + struct nbd_sock *nsock = config->socks[i]; + iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request)); + mutex_lock(&nsock->tx_lock); ret = sock_xmit(nbd, i, 1, &from, 0, NULL); if (ret <= 0) dev_err(disk_to_dev(nbd->disk), "Send disconnect failed %d\n", ret); + mutex_unlock(&nsock->tx_lock); } } From a7ee8cf190ecc8f118f0b7839912564117524d85 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 21 Jul 2017 10:48:15 -0400 Subject: [PATCH 03/16] nbd: only set sndtimeo if we have a timeout set A user reported that he was getting immediate disconnects with my sndtimeo patch applied. This is because by default the OSS nbd client doesn't set a timeout, so we end up setting the sndtimeo to 0, which of course means we have send errors a lot. Instead only set our sndtimeo if the user specified a timeout, otherwise we'll just wait forever like we did previously. Fixes: dc88e34d69d8 ("nbd: set sk->sk_sndtimeo for our sockets") Reported-by: Adam Borowski Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 6aefe9fca6ce..64b19b10b739 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -908,7 +908,8 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) continue; } sk_set_memalloc(sock->sk); - sock->sk->sk_sndtimeo = nbd->tag_set.timeout; + if (nbd->tag_set.timeout) + sock->sk->sk_sndtimeo = nbd->tag_set.timeout; atomic_inc(&config->recv_threads); refcount_inc(&nbd->config_refs); old = nsock->sock; @@ -1077,7 +1078,9 @@ static int nbd_start_device(struct nbd_device *nbd) return -ENOMEM; } sk_set_memalloc(config->socks[i]->sock->sk); - config->socks[i]->sock->sk->sk_sndtimeo = nbd->tag_set.timeout; + if (nbd->tag_set.timeout) + config->socks[i]->sock->sk->sk_sndtimeo = + nbd->tag_set.timeout; atomic_inc(&config->recv_threads); refcount_inc(&nbd->config_refs); INIT_WORK(&args->work, recv_work); From 31c4ccc3ecb4944f6936082fabb5a86e3d086191 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 21 Jul 2017 19:11:10 +0200 Subject: [PATCH 04/16] xen-blkfront: Fix handling of non-supported operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes the following sparse warnings: drivers/block/xen-blkfront.c:916:45: warning: incorrect type in argument 2 (different base types) drivers/block/xen-blkfront.c:916:45: expected restricted blk_status_t [usertype] error drivers/block/xen-blkfront.c:916:45: got int [signed] error drivers/block/xen-blkfront.c:1599:47: warning: incorrect type in assignment (different base types) drivers/block/xen-blkfront.c:1599:47: expected int [signed] error drivers/block/xen-blkfront.c:1599:47: got restricted blk_status_t [usertype] drivers/block/xen-blkfront.c:1607:55: warning: incorrect type in assignment (different base types) drivers/block/xen-blkfront.c:1607:55: expected int [signed] error drivers/block/xen-blkfront.c:1607:55: got restricted blk_status_t [usertype] drivers/block/xen-blkfront.c:1625:55: warning: incorrect type in assignment (different base types) drivers/block/xen-blkfront.c:1625:55: expected int [signed] error drivers/block/xen-blkfront.c:1625:55: got restricted blk_status_t [usertype] drivers/block/xen-blkfront.c:1628:62: warning: restricted blk_status_t degrades to integer Compile-tested only. Fixes: commit 2a842acab109 ("block: introduce new block status code type") Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Cc: Konrad Rzeszutek Wilk Cc: Roger Pau Monné Cc: Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index c852ed3c01d5..1799bba74390 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -111,7 +111,7 @@ struct blk_shadow { }; struct blkif_req { - int error; + blk_status_t error; }; static inline struct blkif_req *blkif_req(struct request *rq) @@ -1616,7 +1616,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { printk(KERN_WARNING "blkfront: %s: %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - blkif_req(req)->error = -EOPNOTSUPP; + blkif_req(req)->error = BLK_STS_NOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && rinfo->shadow[id].req.u.rw.nr_segments == 0)) { From 765e40b675a9566459ddcb8358ad16f3b8344bbe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jul 2017 13:46:10 +0200 Subject: [PATCH 05/16] block: disable runtime-pm for blk-mq The blk-mq code lacks support for looking at the rpm_status field, tracking active requests and the RQF_PM flag. Due to the default switch to blk-mq for scsi people start to run into suspend / resume issue due to this fact, so make sure we disable the runtime PM functionality until it is properly implemented. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 970b9c9638c5..dbecbf4a64e0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3421,6 +3421,10 @@ EXPORT_SYMBOL(blk_finish_plug); */ void blk_pm_runtime_init(struct request_queue *q, struct device *dev) { + /* not support for RQF_PM and ->rpm_status in blk-mq yet */ + if (q->mq_ops) + return; + q->dev = dev; q->rpm_status = RPM_ACTIVE; pm_runtime_set_autosuspend_delay(q->dev, -1); From 76451d79bde6bed17e113f057e58e1fa5fb79e78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Jul 2017 17:04:40 +0200 Subject: [PATCH 06/16] blk-mq: map queues to all present CPUs We already do this for PCI mappings, and the higher level code now expects that CPU on/offlining doesn't have an affect on the queue mappings. Signed-off-by: Christoph Hellwig Tested-by: Max Gurtovoy Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-mq-cpumap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 4891f042a22f..9f8cffc8a701 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -17,9 +17,9 @@ static int cpu_to_queue_index(unsigned int nr_queues, const int cpu) { /* - * Non online CPU will be mapped to queue index 0. + * Non present CPU will be mapped to queue index 0. */ - if (!cpu_online(cpu)) + if (!cpu_present(cpu)) return 0; return cpu % nr_queues; } From 4b422cb99836de3d261faec20a0329385bdec43d Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Thu, 20 Jul 2017 09:26:21 +0800 Subject: [PATCH 07/16] xen-blkfront: fix mq start/stop race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When ring buf full, hw queue will be stopped. While blkif interrupt consume request and make free space in ring buf, hw queue will be started again. But since start queue is protected by spin lock while stop not, that will cause a race. interrupt: process: blkif_interrupt() blkif_queue_rq() kick_pending_request_queues_locked() blk_mq_start_stopped_hw_queues() clear_bit(BLK_MQ_S_STOPPED, &hctx->state) blk_mq_stop_hw_queue(hctx) blk_mq_run_hw_queue(hctx, async) If ring buf is made empty in this case, interrupt will never come, then the hw queue will be stopped forever, all processes waiting for the pending io in the queue will hung. Signed-off-by: Junxiao Bi Reviewed-by: Ankur Arora Acked-by: Roger Pau Monné Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 1799bba74390..04eeb540490f 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -906,8 +906,8 @@ out_err: return BLK_STS_IOERR; out_busy: - spin_unlock_irqrestore(&rinfo->ring_lock, flags); blk_mq_stop_hw_queue(hctx); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_STS_RESOURCE; } From bd912ef3e46b6edb51bb8af4b73fd2be7817e305 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Wed, 28 Jun 2017 20:57:28 +0800 Subject: [PATCH 08/16] xen/blkfront: always allocate grants first from per-queue persistent grants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch partially reverts 3df0e50 ("xen/blkfront: pseudo support for multi hardware queues/rings"). The xen-blkfront queue/ring might hang due to grants allocation failure in the situation when gnttab_free_head is almost empty while many persistent grants are reserved for this queue/ring. As persistent grants management was per-queue since 73716df ("xen/blkfront: make persistent grants pool per-queue"), we should always allocate from persistent grants first. Acked-by: Roger Pau Monné Signed-off-by: Dongli Zhang Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 04eeb540490f..98e34e4c62b8 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -708,6 +708,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri * existing persistent grants, or if we have to get new grants, * as there are not sufficiently many free. */ + bool new_persistent_gnts = false; struct scatterlist *sg; int num_sg, max_grefs, num_grant; @@ -719,19 +720,21 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri */ max_grefs += INDIRECT_GREFS(max_grefs); - /* - * We have to reserve 'max_grefs' grants because persistent - * grants are shared by all rings. - */ - if (max_grefs > 0) - if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) { + /* Check if we have enough persistent grants to allocate a requests */ + if (rinfo->persistent_gnts_c < max_grefs) { + new_persistent_gnts = true; + + if (gnttab_alloc_grant_references( + max_grefs - rinfo->persistent_gnts_c, + &setup.gref_head) < 0) { gnttab_request_free_callback( &rinfo->callback, blkif_restart_queue_callback, rinfo, - max_grefs); + max_grefs - rinfo->persistent_gnts_c); return 1; } + } /* Fill out a communications ring structure. */ id = blkif_ring_get_request(rinfo, req, &ring_req); @@ -832,7 +835,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri if (unlikely(require_extra_req)) rinfo->shadow[extra_id].req = *extra_ring_req; - if (max_grefs > 0) + if (new_persistent_gnts) gnttab_free_grant_references(setup.gref_head); return 0; From 6484f5d16f9d5368afac61091972242f3bd695a9 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 12 Jul 2017 15:38:56 +0200 Subject: [PATCH 09/16] nvme: also provide a UUID in the WWID sysfs attribute The WWID sysfs attribute can provide multiple means of a World Wide ID for a NVMe device. It can either be a NGUID, a EUI-64 or a concatenation of VID, Serial Number, Model and the Namespace ID in this order of preference. If the target also sends us a UUID use the UUID for identification and give it the highest priority. This eases generation of /dev/disk/by-* symlinks. Signed-off-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3b77cfe5aa1e..4cacab331f2a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1995,6 +1995,9 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, int serial_len = sizeof(ctrl->serial); int model_len = sizeof(ctrl->model); + if (!uuid_is_null(&ns->uuid)) + return sprintf(buf, "uuid.%pU\n", &ns->uuid); + if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) return sprintf(buf, "eui.%16phN\n", ns->nguid); From 2fd4167fadd1360ab015e4f0e88e51843e49556c Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 12 Jul 2017 10:58:19 -0600 Subject: [PATCH 10/16] nvme: fabrics commands should use the fctype field for data direction Fabrics commands with opcode 0x7F use the fctype field to indicate data direction. Signed-off-by: Jon Derrick Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Fixes: eb793e2c ("nvme.h: add NVMe over Fabrics definitions") --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bc74da018bdc..25d8225dbd04 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1006,7 +1006,7 @@ static inline bool nvme_is_write(struct nvme_command *cmd) * Why can't we simply have a Fabrics In and Fabrics out command? */ if (unlikely(cmd->common.opcode == nvme_fabrics_command)) - return cmd->fabrics.opcode & 1; + return cmd->fabrics.fctype & 1; return cmd->common.opcode & 1; } From 8b25f351929b5a5216ccb2c8882965134019679d Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 18 Jul 2017 14:29:34 -0700 Subject: [PATCH 11/16] nvme-fc: address target disconnect race conditions in fcp io submit There are cases where threads are in the process of submitting new io when the LLDD calls in to remove the remote port. In some cases, the next io actually goes to the LLDD, who knows the remoteport isn't present and rejects it. To properly recovery/restart these i/o's we don't want to hard fail them, we want to treat them as temporary resource errors in which a delayed retry will work. Add a couple more checks on remoteport connectivity and commonize the busy response handling when it's seen. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index d666ada39a9b..5630ca46c3b5 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1888,7 +1888,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, * the target device is present */ if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) - return BLK_STS_IOERR; + goto busy; if (!nvme_fc_ctrl_get(ctrl)) return BLK_STS_IOERR; @@ -1958,22 +1958,25 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, queue->lldd_handle, &op->fcp_req); if (ret) { - if (op->rq) /* normal request */ + if (!(op->flags & FCOP_FLAGS_AEN)) nvme_fc_unmap_data(ctrl, op->rq, op); - /* else - aen. no cleanup needed */ nvme_fc_ctrl_put(ctrl); - if (ret != -EBUSY) + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE && + ret != -EBUSY) return BLK_STS_IOERR; - if (op->rq) - blk_mq_delay_run_hw_queue(queue->hctx, NVMEFC_QUEUE_DELAY); - - return BLK_STS_RESOURCE; + goto busy; } return BLK_STS_OK; + +busy: + if (!(op->flags & FCOP_FLAGS_AEN) && queue->hctx) + blk_mq_delay_run_hw_queue(queue->hctx, NVMEFC_QUEUE_DELAY); + + return BLK_STS_RESOURCE; } static blk_status_t From 9c5358e15ca12ed3dc3b1e51671dee5d155de8e0 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 17 Jul 2017 13:59:39 -0700 Subject: [PATCH 12/16] nvme-fc: revise TRADDR parsing The FC-NVME spec hasn't locked down on the format string for TRADDR. Currently the spec is lobbying for "nn-<16hexdigits>:pn-<16hexdigits>" where the wwn's are hex values but not prefixed by 0x. Most implementations so far expect a string format of "nn-0x<16hexdigits>:pn-0x<16hexdigits>" to be used. The transport uses the match_u64 parser which requires a leading 0x prefix to set the base properly. If it's not there, a match will either fail or return a base 10 value. The resolution in T11 is pushing out. Therefore, to fix things now and to cover any eventuality and any implementations already in the field, this patch adds support for both formats. The change consists of replacing the token matching routine with a routine that validates the fixed string format, and then builds a local copy of the hex name with a 0x prefix before calling the system parser. Note: the same parser routine exists in both the initiator and target transports. Given this is about the only "shared" item, we chose to replicate rather than create an interdendency on some shared code. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 102 ++++++++++++++++++++------------------- drivers/nvme/target/fc.c | 101 ++++++++++++++++++++------------------ include/linux/nvme-fc.h | 19 ++++++++ 3 files changed, 125 insertions(+), 97 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 5630ca46c3b5..5c2a08ef08ba 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2805,66 +2805,70 @@ out_fail: return ERR_PTR(ret); } -enum { - FCT_TRADDR_ERR = 0, - FCT_TRADDR_WWNN = 1 << 0, - FCT_TRADDR_WWPN = 1 << 1, -}; struct nvmet_fc_traddr { u64 nn; u64 pn; }; -static const match_table_t traddr_opt_tokens = { - { FCT_TRADDR_WWNN, "nn-%s" }, - { FCT_TRADDR_WWPN, "pn-%s" }, - { FCT_TRADDR_ERR, NULL } -}; - static int -nvme_fc_parse_address(struct nvmet_fc_traddr *traddr, char *buf) +__nvme_fc_parse_u64(substring_t *sstr, u64 *val) { - substring_t args[MAX_OPT_ARGS]; - char *options, *o, *p; - int token, ret = 0; u64 token64; - options = o = kstrdup(buf, GFP_KERNEL); - if (!options) - return -ENOMEM; + if (match_u64(sstr, &token64)) + return -EINVAL; + *val = token64; - while ((p = strsep(&o, ":\n")) != NULL) { - if (!*p) - continue; + return 0; +} - token = match_token(p, traddr_opt_tokens, args); - switch (token) { - case FCT_TRADDR_WWNN: - if (match_u64(args, &token64)) { - ret = -EINVAL; - goto out; - } - traddr->nn = token64; - break; - case FCT_TRADDR_WWPN: - if (match_u64(args, &token64)) { - ret = -EINVAL; - goto out; - } - traddr->pn = token64; - break; - default: - pr_warn("unknown traddr token or missing value '%s'\n", - p); - ret = -EINVAL; - goto out; - } - } +/* + * This routine validates and extracts the WWN's from the TRADDR string. + * As kernel parsers need the 0x to determine number base, universally + * build string to parse with 0x prefix before parsing name strings. + */ +static int +nvme_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf, size_t blen) +{ + char name[2 + NVME_FC_TRADDR_HEXNAMELEN + 1]; + substring_t wwn = { name, &name[sizeof(name)-1] }; + int nnoffset, pnoffset; -out: - kfree(options); - return ret; + /* validate it string one of the 2 allowed formats */ + if (strnlen(buf, blen) == NVME_FC_TRADDR_MAXLENGTH && + !strncmp(buf, "nn-0x", NVME_FC_TRADDR_OXNNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MAX_PN_OFFSET], + "pn-0x", NVME_FC_TRADDR_OXNNLEN)) { + nnoffset = NVME_FC_TRADDR_OXNNLEN; + pnoffset = NVME_FC_TRADDR_MAX_PN_OFFSET + + NVME_FC_TRADDR_OXNNLEN; + } else if ((strnlen(buf, blen) == NVME_FC_TRADDR_MINLENGTH && + !strncmp(buf, "nn-", NVME_FC_TRADDR_NNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MIN_PN_OFFSET], + "pn-", NVME_FC_TRADDR_NNLEN))) { + nnoffset = NVME_FC_TRADDR_NNLEN; + pnoffset = NVME_FC_TRADDR_MIN_PN_OFFSET + NVME_FC_TRADDR_NNLEN; + } else + goto out_einval; + + name[0] = '0'; + name[1] = 'x'; + name[2 + NVME_FC_TRADDR_HEXNAMELEN] = 0; + + memcpy(&name[2], &buf[nnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->nn)) + goto out_einval; + + memcpy(&name[2], &buf[pnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->pn)) + goto out_einval; + + return 0; + +out_einval: + pr_warn("%s: bad traddr string\n", __func__); + return -EINVAL; } static struct nvme_ctrl * @@ -2878,11 +2882,11 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) unsigned long flags; int ret; - ret = nvme_fc_parse_address(&raddr, opts->traddr); + ret = nvme_fc_parse_traddr(&raddr, opts->traddr, NVMF_TRADDR_SIZE); if (ret || !raddr.nn || !raddr.pn) return ERR_PTR(-EINVAL); - ret = nvme_fc_parse_address(&laddr, opts->host_traddr); + ret = nvme_fc_parse_traddr(&laddr, opts->host_traddr, NVMF_TRADDR_SIZE); if (ret || !laddr.nn || !laddr.pn) return ERR_PTR(-EINVAL); diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index d5801c150b1c..31ca55dfcb1d 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -2293,66 +2293,70 @@ nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *target_port, } EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_abort); -enum { - FCT_TRADDR_ERR = 0, - FCT_TRADDR_WWNN = 1 << 0, - FCT_TRADDR_WWPN = 1 << 1, -}; struct nvmet_fc_traddr { u64 nn; u64 pn; }; -static const match_table_t traddr_opt_tokens = { - { FCT_TRADDR_WWNN, "nn-%s" }, - { FCT_TRADDR_WWPN, "pn-%s" }, - { FCT_TRADDR_ERR, NULL } -}; - static int -nvmet_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf) +__nvme_fc_parse_u64(substring_t *sstr, u64 *val) { - substring_t args[MAX_OPT_ARGS]; - char *options, *o, *p; - int token, ret = 0; u64 token64; - options = o = kstrdup(buf, GFP_KERNEL); - if (!options) - return -ENOMEM; + if (match_u64(sstr, &token64)) + return -EINVAL; + *val = token64; - while ((p = strsep(&o, ":\n")) != NULL) { - if (!*p) - continue; + return 0; +} - token = match_token(p, traddr_opt_tokens, args); - switch (token) { - case FCT_TRADDR_WWNN: - if (match_u64(args, &token64)) { - ret = -EINVAL; - goto out; - } - traddr->nn = token64; - break; - case FCT_TRADDR_WWPN: - if (match_u64(args, &token64)) { - ret = -EINVAL; - goto out; - } - traddr->pn = token64; - break; - default: - pr_warn("unknown traddr token or missing value '%s'\n", - p); - ret = -EINVAL; - goto out; - } - } +/* + * This routine validates and extracts the WWN's from the TRADDR string. + * As kernel parsers need the 0x to determine number base, universally + * build string to parse with 0x prefix before parsing name strings. + */ +static int +nvme_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf, size_t blen) +{ + char name[2 + NVME_FC_TRADDR_HEXNAMELEN + 1]; + substring_t wwn = { name, &name[sizeof(name)-1] }; + int nnoffset, pnoffset; -out: - kfree(options); - return ret; + /* validate it string one of the 2 allowed formats */ + if (strnlen(buf, blen) == NVME_FC_TRADDR_MAXLENGTH && + !strncmp(buf, "nn-0x", NVME_FC_TRADDR_OXNNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MAX_PN_OFFSET], + "pn-0x", NVME_FC_TRADDR_OXNNLEN)) { + nnoffset = NVME_FC_TRADDR_OXNNLEN; + pnoffset = NVME_FC_TRADDR_MAX_PN_OFFSET + + NVME_FC_TRADDR_OXNNLEN; + } else if ((strnlen(buf, blen) == NVME_FC_TRADDR_MINLENGTH && + !strncmp(buf, "nn-", NVME_FC_TRADDR_NNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MIN_PN_OFFSET], + "pn-", NVME_FC_TRADDR_NNLEN))) { + nnoffset = NVME_FC_TRADDR_NNLEN; + pnoffset = NVME_FC_TRADDR_MIN_PN_OFFSET + NVME_FC_TRADDR_NNLEN; + } else + goto out_einval; + + name[0] = '0'; + name[1] = 'x'; + name[2 + NVME_FC_TRADDR_HEXNAMELEN] = 0; + + memcpy(&name[2], &buf[nnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->nn)) + goto out_einval; + + memcpy(&name[2], &buf[pnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->pn)) + goto out_einval; + + return 0; + +out_einval: + pr_warn("%s: bad traddr string\n", __func__); + return -EINVAL; } static int @@ -2370,7 +2374,8 @@ nvmet_fc_add_port(struct nvmet_port *port) /* map the traddr address info to a target port */ - ret = nvmet_fc_parse_traddr(&traddr, port->disc_addr.traddr); + ret = nvme_fc_parse_traddr(&traddr, port->disc_addr.traddr, + sizeof(port->disc_addr.traddr)); if (ret) return ret; diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h index 21c37e39e41a..36cca93a5ff2 100644 --- a/include/linux/nvme-fc.h +++ b/include/linux/nvme-fc.h @@ -334,5 +334,24 @@ struct fcnvme_ls_disconnect_acc { #define NVME_FC_LS_TIMEOUT_SEC 2 /* 2 seconds */ #define NVME_FC_TGTOP_TIMEOUT_SEC 2 /* 2 seconds */ +/* + * TRADDR string must be of form "nn-<16hexdigits>:pn-<16hexdigits>" + * the string is allowed to be specified with or without a "0x" prefix + * infront of the <16hexdigits>. Without is considered the "min" string + * and with is considered the "max" string. The hexdigits may be upper + * or lower case. + */ +#define NVME_FC_TRADDR_NNLEN 3 /* "?n-" */ +#define NVME_FC_TRADDR_OXNNLEN 5 /* "?n-0x" */ +#define NVME_FC_TRADDR_HEXNAMELEN 16 +#define NVME_FC_TRADDR_MINLENGTH \ + (2 * (NVME_FC_TRADDR_NNLEN + NVME_FC_TRADDR_HEXNAMELEN) + 1) +#define NVME_FC_TRADDR_MAXLENGTH \ + (2 * (NVME_FC_TRADDR_OXNNLEN + NVME_FC_TRADDR_HEXNAMELEN) + 1) +#define NVME_FC_TRADDR_MIN_PN_OFFSET \ + (NVME_FC_TRADDR_NNLEN + NVME_FC_TRADDR_HEXNAMELEN + 1) +#define NVME_FC_TRADDR_MAX_PN_OFFSET \ + (NVME_FC_TRADDR_OXNNLEN + NVME_FC_TRADDR_HEXNAMELEN + 1) + #endif /* _NVME_FC_H */ From 50cdb7c61b019a732fe34635a7cbf2a7487f5e90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Jul 2017 17:39:07 +0200 Subject: [PATCH 13/16] nvme-pci: fix HMB size calculation It's possible the preferred HMB size may not be a multiple of the chunk_size. This patch moves len to function scope and uses that in the for loop increment so the last iteration doesn't cause the total size to exceed the allocated HMB size. Based on an earlier patch from Keith Busch. Signed-off-by: Christoph Hellwig Reported-by: Dan Carpenter Reviewed-by: Keith Busch Fixes: 87ad72a59a38 ("nvme-pci: implement host memory buffer support") --- drivers/nvme/host/pci.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8569ee771269..cd888a47d0fc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1619,7 +1619,7 @@ static void nvme_free_host_mem(struct nvme_dev *dev) static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) { struct nvme_host_mem_buf_desc *descs; - u32 chunk_size, max_entries; + u32 chunk_size, max_entries, len; int i = 0; void **bufs; u64 size = 0, tmp; @@ -1638,10 +1638,10 @@ retry: if (!bufs) goto out_free_descs; - for (size = 0; size < preferred; size += chunk_size) { - u32 len = min_t(u64, chunk_size, preferred - size); + for (size = 0; size < preferred; size += len) { dma_addr_t dma_addr; + len = min_t(u64, chunk_size, preferred - size); bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL, DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); if (!bufs[i]) From 7a362ea96d0df873397be04f4556e92f7e37c5ec Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 25 Jul 2017 13:31:19 -0400 Subject: [PATCH 14/16] nbd: clear disconnected on reconnect If our device loses its connection for longer than the dead timeout we will set NBD_DISCONNECTED in order to quickly fail any pending IO's that flood in after the IO's that were waiting during the dead timer. However if we re-connect at some point in the future we'll still see this DISCONNECTED flag set if we then lose our connection again after that, which means we won't get notifications for our newly lost connections. Fix this by just clearing the DISCONNECTED flag on reconnect in order to make sure everything works as it's supposed to. Reported-by: Dan Melnic Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 64b19b10b739..5bdf923294a5 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -923,6 +923,8 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) mutex_unlock(&nsock->tx_lock); sockfd_put(old); + clear_bit(NBD_DISCONNECTED, &config->runtime_flags); + /* We take the tx_mutex in an error path in the recv_work, so we * need to queue_work outside of the tx_mutex. */ From 7dd1ab163c17e11473a65b11f7e748db30618ebb Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Tue, 25 Jul 2017 10:27:06 -0600 Subject: [PATCH 15/16] nvme: validate admin queue before unquiesce With a misbehaving controller it's possible we'll never enter the live state and create an admin queue. When we fail out of reset work it's possible we failed out early enough without setting up the admin queue. We tear down queues after a failed reset, but needed to do some more sanitization. Fixes 443bd90f2cca: "nvme: host: unquiesce queue in nvme_kill_queues()" [ 189.650995] nvme nvme1: pci function 0000:0b:00.0 [ 317.680055] nvme nvme0: Device not ready; aborting reset [ 317.680183] nvme nvme0: Removing after probe failure status: -19 [ 317.681258] kasan: GPF could be caused by NULL-ptr deref or user memory access [ 317.681397] general protection fault: 0000 [#1] SMP KASAN [ 317.682984] CPU: 3 PID: 477 Comm: kworker/3:2 Not tainted 4.13.0-rc1+ #5 [ 317.683112] Hardware name: Gigabyte Technology Co., Ltd. Z170X-UD5/Z170X-UD5-CF, BIOS F5 03/07/2016 [ 317.683284] Workqueue: events nvme_remove_dead_ctrl_work [nvme] [ 317.683398] task: ffff8803b0990000 task.stack: ffff8803c2ef0000 [ 317.683516] RIP: 0010:blk_mq_unquiesce_queue+0x2b/0xa0 [ 317.683614] RSP: 0018:ffff8803c2ef7d40 EFLAGS: 00010282 [ 317.683716] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 1ffff1006fbdcde3 [ 317.683847] RDX: 0000000000000038 RSI: 1ffff1006f5a9245 RDI: 0000000000000000 [ 317.683978] RBP: ffff8803c2ef7d58 R08: 1ffff1007bcdc974 R09: 0000000000000000 [ 317.684108] R10: 1ffff1007bcdc975 R11: 0000000000000000 R12: 00000000000001c0 [ 317.684239] R13: ffff88037ad49228 R14: ffff88037ad492d0 R15: ffff88037ad492e0 [ 317.684371] FS: 0000000000000000(0000) GS:ffff8803de6c0000(0000) knlGS:0000000000000000 [ 317.684519] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 317.684627] CR2: 0000002d1860c000 CR3: 000000045b40d000 CR4: 00000000003406e0 [ 317.684758] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 317.684888] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 317.685018] Call Trace: [ 317.685084] nvme_kill_queues+0x4d/0x170 [nvme_core] [ 317.685185] nvme_remove_dead_ctrl_work+0x3a/0x90 [nvme] [ 317.685289] process_one_work+0x771/0x1170 [ 317.685372] worker_thread+0xde/0x11e0 [ 317.685452] ? pci_mmcfg_check_reserved+0x110/0x110 [ 317.685550] kthread+0x2d3/0x3d0 [ 317.685617] ? process_one_work+0x1170/0x1170 [ 317.685704] ? kthread_create_on_node+0xc0/0xc0 [ 317.685785] ret_from_fork+0x25/0x30 [ 317.685798] Code: 0f 1f 44 00 00 55 48 b8 00 00 00 00 00 fc ff df 48 89 e5 41 54 4c 8d a7 c0 01 00 00 53 48 89 fb 4c 89 e2 48 c1 ea 03 48 83 ec 08 <80> 3c 02 00 75 50 48 8b bb c0 01 00 00 e8 33 8a f9 00 0f ba b3 [ 317.685872] RIP: blk_mq_unquiesce_queue+0x2b/0xa0 RSP: ffff8803c2ef7d40 [ 317.685908] ---[ end trace a3f8704150b1e8b4 ]--- Signed-off-by: Scott Bauer Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4cacab331f2a..c49f1f8b2e57 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2712,7 +2712,8 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) mutex_lock(&ctrl->namespaces_mutex); /* Forcibly unquiesce queues to avoid blocking dispatch */ - blk_mq_unquiesce_queue(ctrl->admin_q); + if (ctrl->admin_q) + blk_mq_unquiesce_queue(ctrl->admin_q); list_for_each_entry(ns, &ctrl->namespaces, list) { /* From 75cb8e939cf30ebdfffd9b28566d8aead95138a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 28 Jul 2017 15:13:16 +0200 Subject: [PATCH 16/16] lightnvm: pblk: advance bio according to lba index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a lba either hits the cache or corresponds to an empty entry in the L2P table, we need to advance the bio according to the position in which the lba is located. Otherwise, we will copy data in the wrong page, thus causing data corruption for the application. In case of a cache hit, we assumed that bio->bi_iter.bi_idx would contain the correct index, but this is no necessarily true. Instead, use the local bio advance counter and iterator. This guarantees that lbas hitting the cache are copied into the right bv_page. In case of an empty L2P entry, we omitted to advance the bio. In the cases when the same I/O also contains a cache hit, data corresponding to this lba will be copied to the wrong bv_page. Fix this by advancing the bio as we do in the case of a cache hit. Fixes: a4bd217b4326 lightnvm: physical block device (pblk) target Signed-off-by: Javier González Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-rb.c | 4 ++-- drivers/lightnvm/pblk-read.c | 23 ++++++++++++++++------- drivers/lightnvm/pblk.h | 2 +- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 5ecc154f6831..9bc32578a766 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -657,7 +657,7 @@ try: * be directed to disk. */ int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, - struct ppa_addr ppa, int bio_iter) + struct ppa_addr ppa, int bio_iter, bool advanced_bio) { struct pblk *pblk = container_of(rb, struct pblk, rwb); struct pblk_rb_entry *entry; @@ -694,7 +694,7 @@ int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, * filled with data from the cache). If part of the data resides on the * media, we will read later on */ - if (unlikely(!bio->bi_iter.bi_idx)) + if (unlikely(!advanced_bio)) bio_advance(bio, bio_iter * PBLK_EXPOSED_PAGE_SIZE); data = bio_data(bio); diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 4e5c48f3de62..d682e89e6493 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -26,7 +26,7 @@ */ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, sector_t lba, struct ppa_addr ppa, - int bio_iter) + int bio_iter, bool advanced_bio) { #ifdef CONFIG_NVM_DEBUG /* Callers must ensure that the ppa points to a cache address */ @@ -34,7 +34,8 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, BUG_ON(!pblk_addr_in_cache(ppa)); #endif - return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, ppa, bio_iter); + return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, ppa, + bio_iter, advanced_bio); } static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, @@ -44,7 +45,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; sector_t blba = pblk_get_lba(bio); int nr_secs = rqd->nr_ppas; - int advanced_bio = 0; + bool advanced_bio = false; int i, j = 0; /* logic error: lba out-of-bounds. Ignore read request */ @@ -62,19 +63,26 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, retry: if (pblk_ppa_empty(p)) { WARN_ON(test_and_set_bit(i, read_bitmap)); - continue; + + if (unlikely(!advanced_bio)) { + bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE); + advanced_bio = true; + } + + goto next; } /* Try to read from write buffer. The address is later checked * on the write buffer to prevent retrieving overwritten data. */ if (pblk_addr_in_cache(p)) { - if (!pblk_read_from_cache(pblk, bio, lba, p, i)) { + if (!pblk_read_from_cache(pblk, bio, lba, p, i, + advanced_bio)) { pblk_lookup_l2p_seq(pblk, &p, lba, 1); goto retry; } WARN_ON(test_and_set_bit(i, read_bitmap)); - advanced_bio = 1; + advanced_bio = true; #ifdef CONFIG_NVM_DEBUG atomic_long_inc(&pblk->cache_reads); #endif @@ -83,6 +91,7 @@ retry: rqd->ppa_list[j++] = p; } +next: if (advanced_bio) bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE); } @@ -282,7 +291,7 @@ retry: * write buffer to prevent retrieving overwritten data. */ if (pblk_addr_in_cache(ppa)) { - if (!pblk_read_from_cache(pblk, bio, lba, ppa, 0)) { + if (!pblk_read_from_cache(pblk, bio, lba, ppa, 0, 1)) { pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); goto retry; } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 0c5692cc2f60..67e623bd5c2d 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -670,7 +670,7 @@ unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio, struct list_head *list, unsigned int max); int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, - struct ppa_addr ppa, int bio_iter); + struct ppa_addr ppa, int bio_iter, bool advanced_bio); unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries); unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags);