From 2e13456fb3d3e5f462dc1e59efc04eb62df35566 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Fri, 21 Jul 2017 10:48:13 -0400
Subject: [PATCH 01/16] nbd: allow multiple disconnects to be sent

There's no reason to limit ourselves to one disconnect message per
socket.  Sometimes networks do strange things, might as well let
sysadmins hit the panic button as much as they want.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/nbd.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 87a0a29f6e7e..f91e7ac3fa32 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -991,9 +991,8 @@ static int nbd_disconnect(struct nbd_device *nbd)
 	struct nbd_config *config = nbd->config;
 
 	dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
-	if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED,
-			      &config->runtime_flags))
-		send_disconnects(nbd);
+	set_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
+	send_disconnects(nbd);
 	return 0;
 }
 

From b4b2aeccf0f0fa1fd356cc72eeda7a2c66c39904 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Fri, 21 Jul 2017 10:48:14 -0400
Subject: [PATCH 02/16] nbd: take tx_lock before disconnecting

We need to take the tx_lock so we don't interleave our disconnect
request between real data going down the wire.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/nbd.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index f91e7ac3fa32..6aefe9fca6ce 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -978,11 +978,15 @@ static void send_disconnects(struct nbd_device *nbd)
 	int i, ret;
 
 	for (i = 0; i < config->num_connections; i++) {
+		struct nbd_sock *nsock = config->socks[i];
+
 		iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
+		mutex_lock(&nsock->tx_lock);
 		ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
 		if (ret <= 0)
 			dev_err(disk_to_dev(nbd->disk),
 				"Send disconnect failed %d\n", ret);
+		mutex_unlock(&nsock->tx_lock);
 	}
 }
 

From a7ee8cf190ecc8f118f0b7839912564117524d85 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Fri, 21 Jul 2017 10:48:15 -0400
Subject: [PATCH 03/16] nbd: only set sndtimeo if we have a timeout set

A user reported that he was getting immediate disconnects with my
sndtimeo patch applied.  This is because by default the OSS nbd client
doesn't set a timeout, so we end up setting the sndtimeo to 0, which of
course means we have send errors a lot.  Instead only set our sndtimeo
if the user specified a timeout, otherwise we'll just wait forever like
we did previously.

Fixes: dc88e34d69d8 ("nbd: set sk->sk_sndtimeo for our sockets")
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/nbd.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 6aefe9fca6ce..64b19b10b739 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -908,7 +908,8 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
 			continue;
 		}
 		sk_set_memalloc(sock->sk);
-		sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
+		if (nbd->tag_set.timeout)
+			sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
 		atomic_inc(&config->recv_threads);
 		refcount_inc(&nbd->config_refs);
 		old = nsock->sock;
@@ -1077,7 +1078,9 @@ static int nbd_start_device(struct nbd_device *nbd)
 			return -ENOMEM;
 		}
 		sk_set_memalloc(config->socks[i]->sock->sk);
-		config->socks[i]->sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
+		if (nbd->tag_set.timeout)
+			config->socks[i]->sock->sk->sk_sndtimeo =
+				nbd->tag_set.timeout;
 		atomic_inc(&config->recv_threads);
 		refcount_inc(&nbd->config_refs);
 		INIT_WORK(&args->work, recv_work);

From 31c4ccc3ecb4944f6936082fabb5a86e3d086191 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Fri, 21 Jul 2017 19:11:10 +0200
Subject: [PATCH 04/16] xen-blkfront: Fix handling of non-supported operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch fixes the following sparse warnings:

drivers/block/xen-blkfront.c:916:45: warning: incorrect type in argument 2 (different base types)
drivers/block/xen-blkfront.c:916:45:    expected restricted blk_status_t [usertype] error
drivers/block/xen-blkfront.c:916:45:    got int [signed] error
drivers/block/xen-blkfront.c:1599:47: warning: incorrect type in assignment (different base types)
drivers/block/xen-blkfront.c:1599:47:    expected int [signed] error
drivers/block/xen-blkfront.c:1599:47:    got restricted blk_status_t [usertype] <noident>
drivers/block/xen-blkfront.c:1607:55: warning: incorrect type in assignment (different base types)
drivers/block/xen-blkfront.c:1607:55:    expected int [signed] error
drivers/block/xen-blkfront.c:1607:55:    got restricted blk_status_t [usertype] <noident>
drivers/block/xen-blkfront.c:1625:55: warning: incorrect type in assignment (different base types)
drivers/block/xen-blkfront.c:1625:55:    expected int [signed] error
drivers/block/xen-blkfront.c:1625:55:    got restricted blk_status_t [usertype] <noident>
drivers/block/xen-blkfront.c:1628:62: warning: restricted blk_status_t degrades to integer

Compile-tested only.

Fixes: commit 2a842acab109 ("block: introduce new block status code type")
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Roger Pau Monné <roger.pau@citrix.com>
Cc: <xen-devel@lists.xenproject.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/xen-blkfront.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index c852ed3c01d5..1799bba74390 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -111,7 +111,7 @@ struct blk_shadow {
 };
 
 struct blkif_req {
-	int	error;
+	blk_status_t	error;
 };
 
 static inline struct blkif_req *blkif_req(struct request *rq)
@@ -1616,7 +1616,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
 				printk(KERN_WARNING "blkfront: %s: %s op failed\n",
 				       info->gd->disk_name, op_name(bret->operation));
-				blkif_req(req)->error = -EOPNOTSUPP;
+				blkif_req(req)->error = BLK_STS_NOTSUPP;
 			}
 			if (unlikely(bret->status == BLKIF_RSP_ERROR &&
 				     rinfo->shadow[id].req.u.rw.nr_segments == 0)) {

From 765e40b675a9566459ddcb8358ad16f3b8344bbe Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Jul 2017 13:46:10 +0200
Subject: [PATCH 05/16] block: disable runtime-pm for blk-mq

The blk-mq code lacks support for looking at the rpm_status field, tracking
active requests and the RQF_PM flag.

Due to the default switch to blk-mq for scsi people start to run into
suspend / resume issue due to this fact, so make sure we disable the runtime
PM functionality until it is properly implemented.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 970b9c9638c5..dbecbf4a64e0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3421,6 +3421,10 @@ EXPORT_SYMBOL(blk_finish_plug);
  */
 void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
 {
+	/* not support for RQF_PM and ->rpm_status in blk-mq yet */
+	if (q->mq_ops)
+		return;
+
 	q->dev = dev;
 	q->rpm_status = RPM_ACTIVE;
 	pm_runtime_set_autosuspend_delay(q->dev, -1);

From 76451d79bde6bed17e113f057e58e1fa5fb79e78 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 18 Jul 2017 17:04:40 +0200
Subject: [PATCH 06/16] blk-mq: map queues to all present CPUs

We already do this for PCI mappings, and the higher level code now
expects that CPU on/offlining doesn't have an affect on the queue
mappings.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-cpumap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 4891f042a22f..9f8cffc8a701 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -17,9 +17,9 @@
 static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
 {
 	/*
-	 * Non online CPU will be mapped to queue index 0.
+	 * Non present CPU will be mapped to queue index 0.
 	 */
-	if (!cpu_online(cpu))
+	if (!cpu_present(cpu))
 		return 0;
 	return cpu % nr_queues;
 }

From 4b422cb99836de3d261faec20a0329385bdec43d Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Thu, 20 Jul 2017 09:26:21 +0800
Subject: [PATCH 07/16] xen-blkfront: fix mq start/stop race
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When ring buf full, hw queue will be stopped. While blkif interrupt consume
request and make free space in ring buf, hw queue will be started again.
But since start queue is protected by spin lock while stop not, that will
cause a race.

interrupt:                                      process:
blkif_interrupt()                               blkif_queue_rq()
 kick_pending_request_queues_locked()
   blk_mq_start_stopped_hw_queues()
      clear_bit(BLK_MQ_S_STOPPED, &hctx->state)
	                                             blk_mq_stop_hw_queue(hctx)
	  blk_mq_run_hw_queue(hctx, async)

If ring buf is made empty in this case, interrupt will never come, then the
hw queue will be stopped forever, all processes waiting for the pending io
in the queue will hung.

Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Ankur Arora <ankur.a.arora@oracle.com>
Acked-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 1799bba74390..04eeb540490f 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -906,8 +906,8 @@ out_err:
 	return BLK_STS_IOERR;
 
 out_busy:
-	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
 	blk_mq_stop_hw_queue(hctx);
+	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
 	return BLK_STS_RESOURCE;
 }
 

From bd912ef3e46b6edb51bb8af4b73fd2be7817e305 Mon Sep 17 00:00:00 2001
From: Dongli Zhang <dongli.zhang@oracle.com>
Date: Wed, 28 Jun 2017 20:57:28 +0800
Subject: [PATCH 08/16] xen/blkfront: always allocate grants first from
 per-queue persistent grants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch partially reverts 3df0e50 ("xen/blkfront: pseudo support for
multi hardware queues/rings"). The xen-blkfront queue/ring might hang due
to grants allocation failure in the situation when gnttab_free_head is
almost empty while many persistent grants are reserved for this queue/ring.

As persistent grants management was per-queue since 73716df ("xen/blkfront:
make persistent grants pool per-queue"), we should always allocate from
persistent grants first.

Acked-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 04eeb540490f..98e34e4c62b8 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -708,6 +708,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 	 * existing persistent grants, or if we have to get new grants,
 	 * as there are not sufficiently many free.
 	 */
+	bool new_persistent_gnts = false;
 	struct scatterlist *sg;
 	int num_sg, max_grefs, num_grant;
 
@@ -719,19 +720,21 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 		 */
 		max_grefs += INDIRECT_GREFS(max_grefs);
 
-	/*
-	 * We have to reserve 'max_grefs' grants because persistent
-	 * grants are shared by all rings.
-	 */
-	if (max_grefs > 0)
-		if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) {
+	/* Check if we have enough persistent grants to allocate a requests */
+	if (rinfo->persistent_gnts_c < max_grefs) {
+		new_persistent_gnts = true;
+
+		if (gnttab_alloc_grant_references(
+		    max_grefs - rinfo->persistent_gnts_c,
+		    &setup.gref_head) < 0) {
 			gnttab_request_free_callback(
 				&rinfo->callback,
 				blkif_restart_queue_callback,
 				rinfo,
-				max_grefs);
+				max_grefs - rinfo->persistent_gnts_c);
 			return 1;
 		}
+	}
 
 	/* Fill out a communications ring structure. */
 	id = blkif_ring_get_request(rinfo, req, &ring_req);
@@ -832,7 +835,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 	if (unlikely(require_extra_req))
 		rinfo->shadow[extra_id].req = *extra_ring_req;
 
-	if (max_grefs > 0)
+	if (new_persistent_gnts)
 		gnttab_free_grant_references(setup.gref_head);
 
 	return 0;

From 6484f5d16f9d5368afac61091972242f3bd695a9 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <jthumshirn@suse.de>
Date: Wed, 12 Jul 2017 15:38:56 +0200
Subject: [PATCH 09/16] nvme: also provide a UUID in the WWID sysfs attribute

The WWID sysfs attribute can provide multiple means of a World Wide ID
for a NVMe device. It can either be a NGUID, a EUI-64 or a concatenation
of VID, Serial Number, Model and the Namespace ID in this order of
preference.

If the target also sends us a UUID use the UUID for identification and
give it the highest priority.

This eases generation of /dev/disk/by-* symlinks.

Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3b77cfe5aa1e..4cacab331f2a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1995,6 +1995,9 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
 	int serial_len = sizeof(ctrl->serial);
 	int model_len = sizeof(ctrl->model);
 
+	if (!uuid_is_null(&ns->uuid))
+		return sprintf(buf, "uuid.%pU\n", &ns->uuid);
+
 	if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
 		return sprintf(buf, "eui.%16phN\n", ns->nguid);
 

From 2fd4167fadd1360ab015e4f0e88e51843e49556c Mon Sep 17 00:00:00 2001
From: Jon Derrick <jonathan.derrick@intel.com>
Date: Wed, 12 Jul 2017 10:58:19 -0600
Subject: [PATCH 10/16] nvme: fabrics commands should use the fctype field for
 data direction

Fabrics commands with opcode 0x7F use the fctype field to indicate data
direction.

Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
Reviewed-by: Sagi Grimberg <sai@grmberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Fixes: eb793e2c ("nvme.h: add NVMe over Fabrics definitions")
---
 include/linux/nvme.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index bc74da018bdc..25d8225dbd04 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1006,7 +1006,7 @@ static inline bool nvme_is_write(struct nvme_command *cmd)
 	 * Why can't we simply have a Fabrics In and Fabrics out command?
 	 */
 	if (unlikely(cmd->common.opcode == nvme_fabrics_command))
-		return cmd->fabrics.opcode & 1;
+		return cmd->fabrics.fctype & 1;
 	return cmd->common.opcode & 1;
 }
 

From 8b25f351929b5a5216ccb2c8882965134019679d Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Tue, 18 Jul 2017 14:29:34 -0700
Subject: [PATCH 11/16] nvme-fc: address target disconnect race conditions in
 fcp io submit

There are cases where threads are in the process of submitting new
io when the LLDD calls in to remove the remote port. In some cases,
the next io actually goes to the LLDD, who knows the remoteport isn't
present and rejects it. To properly recovery/restart these i/o's we
don't want to hard fail them, we want to treat them as temporary
resource errors in which a delayed retry will work.

Add a couple more checks on remoteport connectivity and commonize the
busy response handling when it's seen.

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/fc.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index d666ada39a9b..5630ca46c3b5 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1888,7 +1888,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
 	 * the target device is present
 	 */
 	if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
-		return BLK_STS_IOERR;
+		goto busy;
 
 	if (!nvme_fc_ctrl_get(ctrl))
 		return BLK_STS_IOERR;
@@ -1958,22 +1958,25 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
 					queue->lldd_handle, &op->fcp_req);
 
 	if (ret) {
-		if (op->rq)			/* normal request */
+		if (!(op->flags & FCOP_FLAGS_AEN))
 			nvme_fc_unmap_data(ctrl, op->rq, op);
-		/* else - aen. no cleanup needed */
 
 		nvme_fc_ctrl_put(ctrl);
 
-		if (ret != -EBUSY)
+		if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE &&
+				ret != -EBUSY)
 			return BLK_STS_IOERR;
 
-		if (op->rq)
-			blk_mq_delay_run_hw_queue(queue->hctx, NVMEFC_QUEUE_DELAY);
-
-		return BLK_STS_RESOURCE;
+		goto busy;
 	}
 
 	return BLK_STS_OK;
+
+busy:
+	if (!(op->flags & FCOP_FLAGS_AEN) && queue->hctx)
+		blk_mq_delay_run_hw_queue(queue->hctx, NVMEFC_QUEUE_DELAY);
+
+	return BLK_STS_RESOURCE;
 }
 
 static blk_status_t

From 9c5358e15ca12ed3dc3b1e51671dee5d155de8e0 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Mon, 17 Jul 2017 13:59:39 -0700
Subject: [PATCH 12/16] nvme-fc: revise TRADDR parsing

The FC-NVME spec hasn't locked down on the format string for TRADDR.
Currently the spec is lobbying for "nn-<16hexdigits>:pn-<16hexdigits>"
where the wwn's are hex values but not prefixed by 0x.

Most implementations so far expect a string format of
"nn-0x<16hexdigits>:pn-0x<16hexdigits>" to be used. The transport
uses the match_u64 parser which requires a leading 0x prefix to set
the base properly. If it's not there, a match will either fail or return
a base 10 value.

The resolution in T11 is pushing out. Therefore, to fix things now and
to cover any eventuality and any implementations already in the field,
this patch adds support for both formats.

The change consists of replacing the token matching routine with a
routine that validates the fixed string format, and then builds
a local copy of the hex name with a 0x prefix before calling
the system parser.

Note: the same parser routine exists in both the initiator and target
transports. Given this is about the only "shared" item, we chose to
replicate rather than create an interdendency on some shared code.

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/fc.c   | 102 ++++++++++++++++++++-------------------
 drivers/nvme/target/fc.c | 101 ++++++++++++++++++++------------------
 include/linux/nvme-fc.h  |  19 ++++++++
 3 files changed, 125 insertions(+), 97 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 5630ca46c3b5..5c2a08ef08ba 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2805,66 +2805,70 @@ out_fail:
 	return ERR_PTR(ret);
 }
 
-enum {
-	FCT_TRADDR_ERR		= 0,
-	FCT_TRADDR_WWNN		= 1 << 0,
-	FCT_TRADDR_WWPN		= 1 << 1,
-};
 
 struct nvmet_fc_traddr {
 	u64	nn;
 	u64	pn;
 };
 
-static const match_table_t traddr_opt_tokens = {
-	{ FCT_TRADDR_WWNN,	"nn-%s"		},
-	{ FCT_TRADDR_WWPN,	"pn-%s"		},
-	{ FCT_TRADDR_ERR,	NULL		}
-};
-
 static int
-nvme_fc_parse_address(struct nvmet_fc_traddr *traddr, char *buf)
+__nvme_fc_parse_u64(substring_t *sstr, u64 *val)
 {
-	substring_t args[MAX_OPT_ARGS];
-	char *options, *o, *p;
-	int token, ret = 0;
 	u64 token64;
 
-	options = o = kstrdup(buf, GFP_KERNEL);
-	if (!options)
-		return -ENOMEM;
+	if (match_u64(sstr, &token64))
+		return -EINVAL;
+	*val = token64;
 
-	while ((p = strsep(&o, ":\n")) != NULL) {
-		if (!*p)
-			continue;
+	return 0;
+}
 
-		token = match_token(p, traddr_opt_tokens, args);
-		switch (token) {
-		case FCT_TRADDR_WWNN:
-			if (match_u64(args, &token64)) {
-				ret = -EINVAL;
-				goto out;
-			}
-			traddr->nn = token64;
-			break;
-		case FCT_TRADDR_WWPN:
-			if (match_u64(args, &token64)) {
-				ret = -EINVAL;
-				goto out;
-			}
-			traddr->pn = token64;
-			break;
-		default:
-			pr_warn("unknown traddr token or missing value '%s'\n",
-					p);
-			ret = -EINVAL;
-			goto out;
-		}
-	}
+/*
+ * This routine validates and extracts the WWN's from the TRADDR string.
+ * As kernel parsers need the 0x to determine number base, universally
+ * build string to parse with 0x prefix before parsing name strings.
+ */
+static int
+nvme_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf, size_t blen)
+{
+	char name[2 + NVME_FC_TRADDR_HEXNAMELEN + 1];
+	substring_t wwn = { name, &name[sizeof(name)-1] };
+	int nnoffset, pnoffset;
 
-out:
-	kfree(options);
-	return ret;
+	/* validate it string one of the 2 allowed formats */
+	if (strnlen(buf, blen) == NVME_FC_TRADDR_MAXLENGTH &&
+			!strncmp(buf, "nn-0x", NVME_FC_TRADDR_OXNNLEN) &&
+			!strncmp(&buf[NVME_FC_TRADDR_MAX_PN_OFFSET],
+				"pn-0x", NVME_FC_TRADDR_OXNNLEN)) {
+		nnoffset = NVME_FC_TRADDR_OXNNLEN;
+		pnoffset = NVME_FC_TRADDR_MAX_PN_OFFSET +
+						NVME_FC_TRADDR_OXNNLEN;
+	} else if ((strnlen(buf, blen) == NVME_FC_TRADDR_MINLENGTH &&
+			!strncmp(buf, "nn-", NVME_FC_TRADDR_NNLEN) &&
+			!strncmp(&buf[NVME_FC_TRADDR_MIN_PN_OFFSET],
+				"pn-", NVME_FC_TRADDR_NNLEN))) {
+		nnoffset = NVME_FC_TRADDR_NNLEN;
+		pnoffset = NVME_FC_TRADDR_MIN_PN_OFFSET + NVME_FC_TRADDR_NNLEN;
+	} else
+		goto out_einval;
+
+	name[0] = '0';
+	name[1] = 'x';
+	name[2 + NVME_FC_TRADDR_HEXNAMELEN] = 0;
+
+	memcpy(&name[2], &buf[nnoffset], NVME_FC_TRADDR_HEXNAMELEN);
+	if (__nvme_fc_parse_u64(&wwn, &traddr->nn))
+		goto out_einval;
+
+	memcpy(&name[2], &buf[pnoffset], NVME_FC_TRADDR_HEXNAMELEN);
+	if (__nvme_fc_parse_u64(&wwn, &traddr->pn))
+		goto out_einval;
+
+	return 0;
+
+out_einval:
+	pr_warn("%s: bad traddr string\n", __func__);
+	return -EINVAL;
 }
 
 static struct nvme_ctrl *
@@ -2878,11 +2882,11 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
 	unsigned long flags;
 	int ret;
 
-	ret = nvme_fc_parse_address(&raddr, opts->traddr);
+	ret = nvme_fc_parse_traddr(&raddr, opts->traddr, NVMF_TRADDR_SIZE);
 	if (ret || !raddr.nn || !raddr.pn)
 		return ERR_PTR(-EINVAL);
 
-	ret = nvme_fc_parse_address(&laddr, opts->host_traddr);
+	ret = nvme_fc_parse_traddr(&laddr, opts->host_traddr, NVMF_TRADDR_SIZE);
 	if (ret || !laddr.nn || !laddr.pn)
 		return ERR_PTR(-EINVAL);
 
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index d5801c150b1c..31ca55dfcb1d 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -2293,66 +2293,70 @@ nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *target_port,
 }
 EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_abort);
 
-enum {
-	FCT_TRADDR_ERR		= 0,
-	FCT_TRADDR_WWNN		= 1 << 0,
-	FCT_TRADDR_WWPN		= 1 << 1,
-};
 
 struct nvmet_fc_traddr {
 	u64	nn;
 	u64	pn;
 };
 
-static const match_table_t traddr_opt_tokens = {
-	{ FCT_TRADDR_WWNN,	"nn-%s"		},
-	{ FCT_TRADDR_WWPN,	"pn-%s"		},
-	{ FCT_TRADDR_ERR,	NULL		}
-};
-
 static int
-nvmet_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf)
+__nvme_fc_parse_u64(substring_t *sstr, u64 *val)
 {
-	substring_t args[MAX_OPT_ARGS];
-	char *options, *o, *p;
-	int token, ret = 0;
 	u64 token64;
 
-	options = o = kstrdup(buf, GFP_KERNEL);
-	if (!options)
-		return -ENOMEM;
+	if (match_u64(sstr, &token64))
+		return -EINVAL;
+	*val = token64;
 
-	while ((p = strsep(&o, ":\n")) != NULL) {
-		if (!*p)
-			continue;
+	return 0;
+}
 
-		token = match_token(p, traddr_opt_tokens, args);
-		switch (token) {
-		case FCT_TRADDR_WWNN:
-			if (match_u64(args, &token64)) {
-				ret = -EINVAL;
-				goto out;
-			}
-			traddr->nn = token64;
-			break;
-		case FCT_TRADDR_WWPN:
-			if (match_u64(args, &token64)) {
-				ret = -EINVAL;
-				goto out;
-			}
-			traddr->pn = token64;
-			break;
-		default:
-			pr_warn("unknown traddr token or missing value '%s'\n",
-					p);
-			ret = -EINVAL;
-			goto out;
-		}
-	}
+/*
+ * This routine validates and extracts the WWN's from the TRADDR string.
+ * As kernel parsers need the 0x to determine number base, universally
+ * build string to parse with 0x prefix before parsing name strings.
+ */
+static int
+nvme_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf, size_t blen)
+{
+	char name[2 + NVME_FC_TRADDR_HEXNAMELEN + 1];
+	substring_t wwn = { name, &name[sizeof(name)-1] };
+	int nnoffset, pnoffset;
 
-out:
-	kfree(options);
-	return ret;
+	/* validate it string one of the 2 allowed formats */
+	if (strnlen(buf, blen) == NVME_FC_TRADDR_MAXLENGTH &&
+			!strncmp(buf, "nn-0x", NVME_FC_TRADDR_OXNNLEN) &&
+			!strncmp(&buf[NVME_FC_TRADDR_MAX_PN_OFFSET],
+				"pn-0x", NVME_FC_TRADDR_OXNNLEN)) {
+		nnoffset = NVME_FC_TRADDR_OXNNLEN;
+		pnoffset = NVME_FC_TRADDR_MAX_PN_OFFSET +
+						NVME_FC_TRADDR_OXNNLEN;
+	} else if ((strnlen(buf, blen) == NVME_FC_TRADDR_MINLENGTH &&
+			!strncmp(buf, "nn-", NVME_FC_TRADDR_NNLEN) &&
+			!strncmp(&buf[NVME_FC_TRADDR_MIN_PN_OFFSET],
+				"pn-", NVME_FC_TRADDR_NNLEN))) {
+		nnoffset = NVME_FC_TRADDR_NNLEN;
+		pnoffset = NVME_FC_TRADDR_MIN_PN_OFFSET + NVME_FC_TRADDR_NNLEN;
+	} else
+		goto out_einval;
+
+	name[0] = '0';
+	name[1] = 'x';
+	name[2 + NVME_FC_TRADDR_HEXNAMELEN] = 0;
+
+	memcpy(&name[2], &buf[nnoffset], NVME_FC_TRADDR_HEXNAMELEN);
+	if (__nvme_fc_parse_u64(&wwn, &traddr->nn))
+		goto out_einval;
+
+	memcpy(&name[2], &buf[pnoffset], NVME_FC_TRADDR_HEXNAMELEN);
+	if (__nvme_fc_parse_u64(&wwn, &traddr->pn))
+		goto out_einval;
+
+	return 0;
+
+out_einval:
+	pr_warn("%s: bad traddr string\n", __func__);
+	return -EINVAL;
 }
 
 static int
@@ -2370,7 +2374,8 @@ nvmet_fc_add_port(struct nvmet_port *port)
 
 	/* map the traddr address info to a target port */
 
-	ret = nvmet_fc_parse_traddr(&traddr, port->disc_addr.traddr);
+	ret = nvme_fc_parse_traddr(&traddr, port->disc_addr.traddr,
+			sizeof(port->disc_addr.traddr));
 	if (ret)
 		return ret;
 
diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h
index 21c37e39e41a..36cca93a5ff2 100644
--- a/include/linux/nvme-fc.h
+++ b/include/linux/nvme-fc.h
@@ -334,5 +334,24 @@ struct fcnvme_ls_disconnect_acc {
 #define NVME_FC_LS_TIMEOUT_SEC		2		/* 2 seconds */
 #define NVME_FC_TGTOP_TIMEOUT_SEC	2		/* 2 seconds */
 
+/*
+ * TRADDR string must be of form "nn-<16hexdigits>:pn-<16hexdigits>"
+ * the string is allowed to be specified with or without a "0x" prefix
+ * infront of the <16hexdigits>.  Without is considered the "min" string
+ * and with is considered the "max" string. The hexdigits may be upper
+ * or lower case.
+ */
+#define NVME_FC_TRADDR_NNLEN		3	/* "?n-" */
+#define NVME_FC_TRADDR_OXNNLEN		5	/* "?n-0x" */
+#define NVME_FC_TRADDR_HEXNAMELEN	16
+#define NVME_FC_TRADDR_MINLENGTH	\
+		(2 * (NVME_FC_TRADDR_NNLEN + NVME_FC_TRADDR_HEXNAMELEN) + 1)
+#define NVME_FC_TRADDR_MAXLENGTH	\
+		(2 * (NVME_FC_TRADDR_OXNNLEN + NVME_FC_TRADDR_HEXNAMELEN) + 1)
+#define NVME_FC_TRADDR_MIN_PN_OFFSET	\
+		(NVME_FC_TRADDR_NNLEN + NVME_FC_TRADDR_HEXNAMELEN + 1)
+#define NVME_FC_TRADDR_MAX_PN_OFFSET	\
+		(NVME_FC_TRADDR_OXNNLEN + NVME_FC_TRADDR_HEXNAMELEN + 1)
+
 
 #endif /* _NVME_FC_H */

From 50cdb7c61b019a732fe34635a7cbf2a7487f5e90 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 25 Jul 2017 17:39:07 +0200
Subject: [PATCH 13/16] nvme-pci: fix HMB size calculation

It's possible the preferred HMB size may not be a multiple of the
chunk_size. This patch moves len to function scope and uses that in
the for loop increment so the last iteration doesn't cause the total
size to exceed the allocated HMB size.

Based on an earlier patch from Keith Busch.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Fixes: 87ad72a59a38 ("nvme-pci: implement host memory buffer support")
---
 drivers/nvme/host/pci.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8569ee771269..cd888a47d0fc 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1619,7 +1619,7 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
 static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
 {
 	struct nvme_host_mem_buf_desc *descs;
-	u32 chunk_size, max_entries;
+	u32 chunk_size, max_entries, len;
 	int i = 0;
 	void **bufs;
 	u64 size = 0, tmp;
@@ -1638,10 +1638,10 @@ retry:
 	if (!bufs)
 		goto out_free_descs;
 
-	for (size = 0; size < preferred; size += chunk_size) {
-		u32 len = min_t(u64, chunk_size, preferred - size);
+	for (size = 0; size < preferred; size += len) {
 		dma_addr_t dma_addr;
 
+		len = min_t(u64, chunk_size, preferred - size);
 		bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
 				DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
 		if (!bufs[i])

From 7a362ea96d0df873397be04f4556e92f7e37c5ec Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 25 Jul 2017 13:31:19 -0400
Subject: [PATCH 14/16] nbd: clear disconnected on reconnect

If our device loses its connection for longer than the dead timeout we
will set NBD_DISCONNECTED in order to quickly fail any pending IO's that
flood in after the IO's that were waiting during the dead timer.
However if we re-connect at some point in the future we'll still see
this DISCONNECTED flag set if we then lose our connection again after
that, which means we won't get notifications for our newly lost
connections.  Fix this by just clearing the DISCONNECTED flag on
reconnect in order to make sure everything works as it's supposed to.

Reported-by: Dan Melnic <dmm@fb.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/nbd.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 64b19b10b739..5bdf923294a5 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -923,6 +923,8 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
 		mutex_unlock(&nsock->tx_lock);
 		sockfd_put(old);
 
+		clear_bit(NBD_DISCONNECTED, &config->runtime_flags);
+
 		/* We take the tx_mutex in an error path in the recv_work, so we
 		 * need to queue_work outside of the tx_mutex.
 		 */

From 7dd1ab163c17e11473a65b11f7e748db30618ebb Mon Sep 17 00:00:00 2001
From: Scott Bauer <scott.bauer@intel.com>
Date: Tue, 25 Jul 2017 10:27:06 -0600
Subject: [PATCH 15/16] nvme: validate admin queue before unquiesce

With a misbehaving controller it's possible we'll never
enter the live state and create an admin queue. When we
fail out of reset work it's possible we failed out early
enough without setting up the admin queue. We tear down
queues after a failed reset, but needed to do some more
sanitization.

Fixes 443bd90f2cca: "nvme: host: unquiesce queue in nvme_kill_queues()"

[  189.650995] nvme nvme1: pci function 0000:0b:00.0
[  317.680055] nvme nvme0: Device not ready; aborting reset
[  317.680183] nvme nvme0: Removing after probe failure status: -19
[  317.681258] kasan: GPF could be caused by NULL-ptr deref or user memory access
[  317.681397] general protection fault: 0000 [#1] SMP KASAN
[  317.682984] CPU: 3 PID: 477 Comm: kworker/3:2 Not tainted 4.13.0-rc1+ #5
[  317.683112] Hardware name: Gigabyte Technology Co., Ltd. Z170X-UD5/Z170X-UD5-CF, BIOS F5 03/07/2016
[  317.683284] Workqueue: events nvme_remove_dead_ctrl_work [nvme]
[  317.683398] task: ffff8803b0990000 task.stack: ffff8803c2ef0000
[  317.683516] RIP: 0010:blk_mq_unquiesce_queue+0x2b/0xa0
[  317.683614] RSP: 0018:ffff8803c2ef7d40 EFLAGS: 00010282
[  317.683716] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 1ffff1006fbdcde3
[  317.683847] RDX: 0000000000000038 RSI: 1ffff1006f5a9245 RDI: 0000000000000000
[  317.683978] RBP: ffff8803c2ef7d58 R08: 1ffff1007bcdc974 R09: 0000000000000000
[  317.684108] R10: 1ffff1007bcdc975 R11: 0000000000000000 R12: 00000000000001c0
[  317.684239] R13: ffff88037ad49228 R14: ffff88037ad492d0 R15: ffff88037ad492e0
[  317.684371] FS:  0000000000000000(0000) GS:ffff8803de6c0000(0000) knlGS:0000000000000000
[  317.684519] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  317.684627] CR2: 0000002d1860c000 CR3: 000000045b40d000 CR4: 00000000003406e0
[  317.684758] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  317.684888] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  317.685018] Call Trace:
[  317.685084]  nvme_kill_queues+0x4d/0x170 [nvme_core]
[  317.685185]  nvme_remove_dead_ctrl_work+0x3a/0x90 [nvme]
[  317.685289]  process_one_work+0x771/0x1170
[  317.685372]  worker_thread+0xde/0x11e0
[  317.685452]  ? pci_mmcfg_check_reserved+0x110/0x110
[  317.685550]  kthread+0x2d3/0x3d0
[  317.685617]  ? process_one_work+0x1170/0x1170
[  317.685704]  ? kthread_create_on_node+0xc0/0xc0
[  317.685785]  ret_from_fork+0x25/0x30
[  317.685798] Code: 0f 1f 44 00 00 55 48 b8 00 00 00 00 00 fc ff df 48 89 e5 41 54 4c 8d a7 c0 01 00 00 53 48 89 fb 4c 89 e2 48 c1 ea 03 48 83 ec 08 <80> 3c 02 00 75 50 48 8b bb c0 01 00 00 e8 33 8a f9 00 0f ba b3
[  317.685872] RIP: blk_mq_unquiesce_queue+0x2b/0xa0 RSP: ffff8803c2ef7d40
[  317.685908] ---[ end trace a3f8704150b1e8b4 ]---

Signed-off-by: Scott Bauer <scott.bauer@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4cacab331f2a..c49f1f8b2e57 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2712,7 +2712,8 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 	mutex_lock(&ctrl->namespaces_mutex);
 
 	/* Forcibly unquiesce queues to avoid blocking dispatch */
-	blk_mq_unquiesce_queue(ctrl->admin_q);
+	if (ctrl->admin_q)
+		blk_mq_unquiesce_queue(ctrl->admin_q);
 
 	list_for_each_entry(ns, &ctrl->namespaces, list) {
 		/*

From 75cb8e939cf30ebdfffd9b28566d8aead95138a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Fri, 28 Jul 2017 15:13:16 +0200
Subject: [PATCH 16/16] lightnvm: pblk: advance bio according to lba index
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a lba either hits the cache or corresponds to an empty entry in the
L2P table, we need to advance the bio according to the position in which
the lba is located. Otherwise, we will copy data in the wrong page, thus
causing data corruption for the application.

In case of a cache hit, we assumed that bio->bi_iter.bi_idx would
contain the correct index, but this is no necessarily true. Instead, use
the local bio advance counter and iterator. This guarantees that lbas
hitting the cache are copied into the right bv_page.

In case of an empty L2P entry, we omitted to advance the bio. In the
cases when the same I/O also contains a cache hit, data corresponding
to this lba will be copied to the wrong bv_page. Fix this by advancing
the bio as we do in the case of a cache hit.

Fixes: a4bd217b4326 lightnvm: physical block device (pblk) target

Signed-off-by: Javier González <javier@javigon.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-rb.c   |  4 ++--
 drivers/lightnvm/pblk-read.c | 23 ++++++++++++++++-------
 drivers/lightnvm/pblk.h      |  2 +-
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index 5ecc154f6831..9bc32578a766 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -657,7 +657,7 @@ try:
  * be directed to disk.
  */
 int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
-			struct ppa_addr ppa, int bio_iter)
+			struct ppa_addr ppa, int bio_iter, bool advanced_bio)
 {
 	struct pblk *pblk = container_of(rb, struct pblk, rwb);
 	struct pblk_rb_entry *entry;
@@ -694,7 +694,7 @@ int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
 	 * filled with data from the cache). If part of the data resides on the
 	 * media, we will read later on
 	 */
-	if (unlikely(!bio->bi_iter.bi_idx))
+	if (unlikely(!advanced_bio))
 		bio_advance(bio, bio_iter * PBLK_EXPOSED_PAGE_SIZE);
 
 	data = bio_data(bio);
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 4e5c48f3de62..d682e89e6493 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -26,7 +26,7 @@
  */
 static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
 				sector_t lba, struct ppa_addr ppa,
-				int bio_iter)
+				int bio_iter, bool advanced_bio)
 {
 #ifdef CONFIG_NVM_DEBUG
 	/* Callers must ensure that the ppa points to a cache address */
@@ -34,7 +34,8 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
 	BUG_ON(!pblk_addr_in_cache(ppa));
 #endif
 
-	return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, ppa, bio_iter);
+	return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, ppa,
+						bio_iter, advanced_bio);
 }
 
 static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
@@ -44,7 +45,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
 	struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
 	sector_t blba = pblk_get_lba(bio);
 	int nr_secs = rqd->nr_ppas;
-	int advanced_bio = 0;
+	bool advanced_bio = false;
 	int i, j = 0;
 
 	/* logic error: lba out-of-bounds. Ignore read request */
@@ -62,19 +63,26 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
 retry:
 		if (pblk_ppa_empty(p)) {
 			WARN_ON(test_and_set_bit(i, read_bitmap));
-			continue;
+
+			if (unlikely(!advanced_bio)) {
+				bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE);
+				advanced_bio = true;
+			}
+
+			goto next;
 		}
 
 		/* Try to read from write buffer. The address is later checked
 		 * on the write buffer to prevent retrieving overwritten data.
 		 */
 		if (pblk_addr_in_cache(p)) {
-			if (!pblk_read_from_cache(pblk, bio, lba, p, i)) {
+			if (!pblk_read_from_cache(pblk, bio, lba, p, i,
+								advanced_bio)) {
 				pblk_lookup_l2p_seq(pblk, &p, lba, 1);
 				goto retry;
 			}
 			WARN_ON(test_and_set_bit(i, read_bitmap));
-			advanced_bio = 1;
+			advanced_bio = true;
 #ifdef CONFIG_NVM_DEBUG
 			atomic_long_inc(&pblk->cache_reads);
 #endif
@@ -83,6 +91,7 @@ retry:
 			rqd->ppa_list[j++] = p;
 		}
 
+next:
 		if (advanced_bio)
 			bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
 	}
@@ -282,7 +291,7 @@ retry:
 	 * write buffer to prevent retrieving overwritten data.
 	 */
 	if (pblk_addr_in_cache(ppa)) {
-		if (!pblk_read_from_cache(pblk, bio, lba, ppa, 0)) {
+		if (!pblk_read_from_cache(pblk, bio, lba, ppa, 0, 1)) {
 			pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
 			goto retry;
 		}
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 0c5692cc2f60..67e623bd5c2d 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -670,7 +670,7 @@ unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
 				      struct list_head *list,
 				      unsigned int max);
 int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
-			struct ppa_addr ppa, int bio_iter);
+			struct ppa_addr ppa, int bio_iter, bool advanced_bio);
 unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries);
 
 unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags);