From e8efa9b88e3c20a20ff34bb33e2e94bf6896016a Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Tue, 4 Aug 2020 17:27:18 -0700 Subject: [PATCH 1/7] md: get sysfs entry after redundancy attr group create "sync_completed" and "degraded" belongs to redundancy attr group, it was not exist yet when md device was created. Reported-by: kernel test robot Fixes: e1a86dbbbd6a ("md: fix deadlock causing by sysfs_notify") Signed-off-by: Junxiao Bi Signed-off-by: Song Liu --- drivers/md/md.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9c69084cae73..6b511c9007d3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -876,7 +876,13 @@ void mddev_unlock(struct mddev *mddev) sysfs_remove_group(&mddev->kobj, &md_redundancy_group); if (mddev->sysfs_action) sysfs_put(mddev->sysfs_action); + if (mddev->sysfs_completed) + sysfs_put(mddev->sysfs_completed); + if (mddev->sysfs_degraded) + sysfs_put(mddev->sysfs_degraded); mddev->sysfs_action = NULL; + mddev->sysfs_completed = NULL; + mddev->sysfs_degraded = NULL; } } mddev->sysfs_active = 0; @@ -4094,6 +4100,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) pr_warn("md: cannot register extra attributes for %s\n", mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); + mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); + mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); } if (oldpers->sync_request != NULL && pers->sync_request == NULL) { @@ -5609,14 +5617,9 @@ static void md_free(struct kobject *ko) if (mddev->sysfs_state) sysfs_put(mddev->sysfs_state); - if (mddev->sysfs_completed) - sysfs_put(mddev->sysfs_completed); - if (mddev->sysfs_degraded) - sysfs_put(mddev->sysfs_degraded); if (mddev->sysfs_level) sysfs_put(mddev->sysfs_level); - if (mddev->gendisk) del_gendisk(mddev->gendisk); if (mddev->queue) @@ -5783,8 +5786,6 @@ static int md_alloc(dev_t dev, char *name) if (!error && mddev->kobj.sd) { kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); - mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); - mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); } mddev_put(mddev); @@ -6064,6 +6065,8 @@ int md_run(struct mddev *mddev) pr_warn("md: cannot register extra attributes for %s\n", mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); + mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); + mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); } else if (mddev->ro == 2) /* auto-readonly not meaningful */ mddev->ro = 0; From b35fd7422c2f8e04496f5a770bd4e1a205414b3f Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 6 Aug 2020 01:25:03 +0800 Subject: [PATCH 2/7] block: check queue's limits.discard_granularity in __blkdev_issue_discard() If create a loop device with a backing NVMe SSD, current loop device driver doesn't correctly set its queue's limits.discard_granularity and leaves it as 0. If a discard request at LBA 0 on this loop device, in __blkdev_issue_discard() the calculated req_sects will be 0, and a zero length discard request will trigger a BUG() panic in generic block layer code at block/blk-mq.c:563. [ 955.565006][ C39] ------------[ cut here ]------------ [ 955.559660][ C39] invalid opcode: 0000 [#1] SMP NOPTI [ 955.622171][ C39] CPU: 39 PID: 248 Comm: ksoftirqd/39 Tainted: G E 5.8.0-default+ #40 [ 955.622171][ C39] Hardware name: Lenovo ThinkSystem SR650 -[7X05CTO1WW]-/-[7X05CTO1WW]-, BIOS -[IVE160M-2.70]- 07/17/2020 [ 955.622175][ C39] RIP: 0010:blk_mq_end_request+0x107/0x110 [ 955.622177][ C39] Code: 48 8b 03 e9 59 ff ff ff 48 89 df 5b 5d 41 5c e9 9f ed ff ff 48 8b 35 98 3c f4 00 48 83 c7 10 48 83 c6 19 e8 cb 56 c9 ff eb cb <0f> 0b 0f 1f 80 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 56 41 54 [ 955.622179][ C39] RSP: 0018:ffffb1288701fe28 EFLAGS: 00010202 [ 955.749277][ C39] RAX: 0000000000000001 RBX: ffff956fffba5080 RCX: 0000000000004003 [ 955.749278][ C39] RDX: 0000000000000003 RSI: 0000000000000000 RDI: 0000000000000000 [ 955.749279][ C39] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [ 955.749279][ C39] R10: ffffb1288701fd28 R11: 0000000000000001 R12: ffffffffa8e05160 [ 955.749280][ C39] R13: 0000000000000004 R14: 0000000000000004 R15: ffffffffa7ad3a1e [ 955.749281][ C39] FS: 0000000000000000(0000) GS:ffff95bfbda00000(0000) knlGS:0000000000000000 [ 955.749282][ C39] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 955.749282][ C39] CR2: 00007f6f0ef766a8 CR3: 0000005a37012002 CR4: 00000000007606e0 [ 955.749283][ C39] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 955.749284][ C39] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 955.749284][ C39] PKRU: 55555554 [ 955.749285][ C39] Call Trace: [ 955.749290][ C39] blk_done_softirq+0x99/0xc0 [ 957.550669][ C39] __do_softirq+0xd3/0x45f [ 957.550677][ C39] ? smpboot_thread_fn+0x2f/0x1e0 [ 957.550679][ C39] ? smpboot_thread_fn+0x74/0x1e0 [ 957.550680][ C39] ? smpboot_thread_fn+0x14e/0x1e0 [ 957.550684][ C39] run_ksoftirqd+0x30/0x60 [ 957.550687][ C39] smpboot_thread_fn+0x149/0x1e0 [ 957.886225][ C39] ? sort_range+0x20/0x20 [ 957.886226][ C39] kthread+0x137/0x160 [ 957.886228][ C39] ? kthread_park+0x90/0x90 [ 957.886231][ C39] ret_from_fork+0x22/0x30 [ 959.117120][ C39] ---[ end trace 3dacdac97e2ed164 ]--- This is the procedure to reproduce the panic, # modprobe scsi_debug delay=0 dev_size_mb=2048 max_queue=1 # losetup -f /dev/nvme0n1 --direct-io=on # blkdiscard /dev/loop0 -o 0 -l 0x200 This patch fixes the issue by checking q->limits.discard_granularity in __blkdev_issue_discard() before composing the discard bio. If the value is 0, then prints a warning oops information and returns -EOPNOTSUPP to the caller to indicate that this buggy device driver doesn't support discard request. Fixes: 9b15d109a6b2 ("block: improve discard bio alignment in __blkdev_issue_discard()") Fixes: c52abf563049 ("loop: Better discard support for block devices") Reported-and-suggested-by: Ming Lei Signed-off-by: Coly Li Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Jack Wang Cc: Bart Van Assche Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: Enzo Matsumiya Cc: Evan Green Cc: Jens Axboe Cc: Martin K. Petersen Cc: Xiao Ni Signed-off-by: Jens Axboe --- block/blk-lib.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/blk-lib.c b/block/blk-lib.c index 019e09bb9c0e..0d1811e57ac7 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -47,6 +47,15 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, op = REQ_OP_DISCARD; } + /* In case the discard granularity isn't set by buggy device driver */ + if (WARN_ON_ONCE(!q->limits.discard_granularity)) { + char dev_name[BDEVNAME_SIZE]; + + bdevname(bdev, dev_name); + pr_err_ratelimited("%s: Error: discard_granularity is 0.\n", dev_name); + return -EOPNOTSUPP; + } + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; if ((sector | nr_sects) & bs_mask) return -EINVAL; From e8abe1de43dac658dacbd04a4543e0c988a8d386 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 4 Aug 2020 13:16:45 +0300 Subject: [PATCH 3/7] md-cluster: Fix potential error pointer dereference in resize_bitmaps() The error handling calls md_bitmap_free(bitmap) which checks for NULL but will Oops if we pass an error pointer. Let's set "bitmap" to NULL on this error path. Fixes: afd756286083 ("md-cluster/raid10: resize all the bitmaps before start reshape") Signed-off-by: Dan Carpenter Reviewed-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md-cluster.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 73fd50e77975..d50737ec4039 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1139,6 +1139,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz bitmap = get_bitmap_from_slot(mddev, i); if (IS_ERR(bitmap)) { pr_err("can't get bitmap from slot %d\n", i); + bitmap = NULL; goto out; } counts = &bitmap->counts; From 735d77d4fd28bd47db9998a3744346302f8c1ee9 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 30 Jul 2020 11:13:57 +0200 Subject: [PATCH 4/7] rnbd: remove rnbd_dev_submit_io The function only has one caller, so let's open code it in process_rdma. Another bonus is we can avoid push/pop stack, since we need to pass 8 arguments to rnbd_dev_submit_io. Signed-off-by: Guoqing Jiang Acked-by: Danil Kipnis Acked-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv-dev.c | 36 +++---------------------------- drivers/block/rnbd/rnbd-srv-dev.h | 19 +++++----------- drivers/block/rnbd/rnbd-srv.c | 32 +++++++++++++++++++-------- 3 files changed, 31 insertions(+), 56 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv-dev.c b/drivers/block/rnbd/rnbd-srv-dev.c index 5eddfd29ab64..49c62b506c9b 100644 --- a/drivers/block/rnbd/rnbd-srv-dev.c +++ b/drivers/block/rnbd/rnbd-srv-dev.c @@ -45,7 +45,7 @@ void rnbd_dev_close(struct rnbd_dev *dev) kfree(dev); } -static void rnbd_dev_bi_end_io(struct bio *bio) +void rnbd_dev_bi_end_io(struct bio *bio) { struct rnbd_dev_blk_io *io = bio->bi_private; @@ -63,8 +63,8 @@ static void rnbd_dev_bi_end_io(struct bio *bio) * Map the kernel address into a bio suitable for io to a block * device. Returns an error pointer in case of error. */ -static struct bio *rnbd_bio_map_kern(void *data, struct bio_set *bs, - unsigned int len, gfp_t gfp_mask) +struct bio *rnbd_bio_map_kern(void *data, struct bio_set *bs, + unsigned int len, gfp_t gfp_mask) { unsigned long kaddr = (unsigned long)data; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -102,33 +102,3 @@ static struct bio *rnbd_bio_map_kern(void *data, struct bio_set *bs, bio->bi_end_io = bio_put; return bio; } - -int rnbd_dev_submit_io(struct rnbd_dev *dev, sector_t sector, void *data, - size_t len, u32 bi_size, enum rnbd_io_flags flags, - short prio, void *priv) -{ - struct rnbd_dev_blk_io *io; - struct bio *bio; - - /* Generate bio with pages pointing to the rdma buffer */ - bio = rnbd_bio_map_kern(data, dev->ibd_bio_set, len, GFP_KERNEL); - if (IS_ERR(bio)) - return PTR_ERR(bio); - - io = container_of(bio, struct rnbd_dev_blk_io, bio); - - io->dev = dev; - io->priv = priv; - - bio->bi_end_io = rnbd_dev_bi_end_io; - bio->bi_private = io; - bio->bi_opf = rnbd_to_bio_flags(flags); - bio->bi_iter.bi_sector = sector; - bio->bi_iter.bi_size = bi_size; - bio_set_prio(bio, prio); - bio_set_dev(bio, dev->bdev); - - submit_bio(bio); - - return 0; -} diff --git a/drivers/block/rnbd/rnbd-srv-dev.h b/drivers/block/rnbd/rnbd-srv-dev.h index 0f65b09a270e..0eb23850afb9 100644 --- a/drivers/block/rnbd/rnbd-srv-dev.h +++ b/drivers/block/rnbd/rnbd-srv-dev.h @@ -41,6 +41,11 @@ void rnbd_dev_close(struct rnbd_dev *dev); void rnbd_endio(void *priv, int error); +void rnbd_dev_bi_end_io(struct bio *bio); + +struct bio *rnbd_bio_map_kern(void *data, struct bio_set *bs, + unsigned int len, gfp_t gfp_mask); + static inline int rnbd_dev_get_max_segs(const struct rnbd_dev *dev) { return queue_max_segments(bdev_get_queue(dev->bdev)); @@ -75,18 +80,4 @@ static inline int rnbd_dev_get_discard_alignment(const struct rnbd_dev *dev) return bdev_get_queue(dev->bdev)->limits.discard_alignment; } -/** - * rnbd_dev_submit_io() - Submit an I/O to the disk - * @dev: device to that the I/O is submitted - * @sector: address to read/write data to - * @data: I/O data to write or buffer to read I/O date into - * @len: length of @data - * @bi_size: Amount of data that will be read/written - * @prio: IO priority - * @priv: private data passed to @io_fn - */ -int rnbd_dev_submit_io(struct rnbd_dev *dev, sector_t sector, void *data, - size_t len, u32 bi_size, enum rnbd_io_flags flags, - short prio, void *priv); - #endif /* RNBD_SRV_DEV_H */ diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 86e61523907b..0fb94843a495 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -124,6 +124,9 @@ static int process_rdma(struct rtrs_srv *sess, struct rnbd_srv_sess_dev *sess_dev; u32 dev_id; int err; + struct rnbd_dev_blk_io *io; + struct bio *bio; + short prio; priv = kmalloc(sizeof(*priv), GFP_KERNEL); if (!priv) @@ -142,18 +145,29 @@ static int process_rdma(struct rtrs_srv *sess, priv->sess_dev = sess_dev; priv->id = id; - err = rnbd_dev_submit_io(sess_dev->rnbd_dev, le64_to_cpu(msg->sector), - data, datalen, le32_to_cpu(msg->bi_size), - le32_to_cpu(msg->rw), - srv_sess->ver < RNBD_PROTO_VER_MAJOR || - usrlen < sizeof(*msg) ? - 0 : le16_to_cpu(msg->prio), priv); - if (unlikely(err)) { - rnbd_srv_err(sess_dev, "Submitting I/O to device failed, err: %d\n", - err); + /* Generate bio with pages pointing to the rdma buffer */ + bio = rnbd_bio_map_kern(data, sess_dev->rnbd_dev->ibd_bio_set, datalen, GFP_KERNEL); + if (IS_ERR(bio)) { + rnbd_srv_err(sess_dev, "Failed to generate bio, err: %ld\n", PTR_ERR(bio)); goto sess_dev_put; } + io = container_of(bio, struct rnbd_dev_blk_io, bio); + io->dev = sess_dev->rnbd_dev; + io->priv = priv; + + bio->bi_end_io = rnbd_dev_bi_end_io; + bio->bi_private = io; + bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); + bio->bi_iter.bi_sector = le64_to_cpu(msg->sector); + bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size); + prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR || + usrlen < sizeof(*msg) ? 0 : le16_to_cpu(msg->prio); + bio_set_prio(bio, prio); + bio_set_dev(bio, sess_dev->rnbd_dev->bdev); + + submit_bio(bio); + return 0; sess_dev_put: From d7aaeef293404bc41ea9d51c20a518945eb38366 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 30 Jul 2020 11:13:58 +0200 Subject: [PATCH 5/7] rnbd: no need to set bi_end_io in rnbd_bio_map_kern Since we always set bi_end_io after call rnbd_bio_map_kern, so the setting in rnbd_bio_map_kern is redundant. Signed-off-by: Guoqing Jiang Acked-by: Danil Kipnis Acked-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv-dev.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-srv-dev.c b/drivers/block/rnbd/rnbd-srv-dev.c index 49c62b506c9b..b241a099aeae 100644 --- a/drivers/block/rnbd/rnbd-srv-dev.c +++ b/drivers/block/rnbd/rnbd-srv-dev.c @@ -99,6 +99,5 @@ struct bio *rnbd_bio_map_kern(void *data, struct bio_set *bs, offset = 0; } - bio->bi_end_io = bio_put; return bio; } From fe6a8fc5ed2f0081f17375ae2005718522c392c6 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 10 Aug 2020 19:16:32 +0200 Subject: [PATCH 6/7] loop: unset GENHD_FL_NO_PART_SCAN on LOOP_CONFIGURE When LOOP_CONFIGURE is used with LO_FLAGS_PARTSCAN we need to propagate this into the GENHD_FL_NO_PART_SCAN. LOOP_SETSTATUS does this, LOOP_CONFIGURE doesn't so far. Effect is that setting up a loopback device with partition scanning doesn't actually work when LOOP_CONFIGURE is issued, though it works fine with LOOP_SETSTATUS. Let's correct that and propagate the flag in LOOP_CONFIGURE too. Fixes: 3448914e8cc5("loop: Add LOOP_CONFIGURE ioctl") Signed-off-by: Lennart Poettering Acked-by: Martijn Coenen Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d18160146226..2f137d6ce169 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1171,6 +1171,8 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; + if (partscan) + lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; /* Grab the block_device to prevent its destruction after we * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev). From c1e2b8422bf946c80e832cee22b3399634f87a2c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 10 Aug 2020 11:59:50 +0800 Subject: [PATCH 7/7] block: fix double account of flush request's driver tag In case of none scheduler, we share data request's driver tag for flush request, so have to mark the flush request as INFLIGHT for avoiding double account of this driver tag. Fixes: 568f27006577 ("blk-mq: centralise related handling into blk_mq_get_driver_tag") Reported-by: Matthew Wilcox Signed-off-by: Ming Lei Tested-by: Matthew Wilcox Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 6e1543c10493..53abb5c73d99 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -308,9 +308,16 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->mq_hctx = first_rq->mq_hctx; - if (!q->elevator) + if (!q->elevator) { flush_rq->tag = first_rq->tag; - else + + /* + * We borrow data request's driver tag, so have to mark + * this flush request as INFLIGHT for avoiding double + * account of this driver tag + */ + flush_rq->rq_flags |= RQF_MQ_INFLIGHT; + } else flush_rq->internal_tag = first_rq->internal_tag; flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;